From d38f517a7a25ca9f233c52bbf24dd2864ea3d3f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20Mollier?= Date: Wed, 9 Dec 2020 14:27:08 +0100 Subject: Import seqmagick_0.8.4.orig.tar.gz [dgit import orig seqmagick_0.8.4.orig.tar.gz] --- .gitignore | 9 + .travis.yml | 24 + CHANGELOG.rst | 127 + CONTRIB | 1 + DEVELOPING.rst | 77 + INSTALL | 19 + LICENSE | 675 ++++++ MANIFEST.in | 8 + README.rst | 71 + docs/.gitignore | 1 + docs/Makefile | 130 + docs/_static/.gitignore | 0 docs/_static/fork.png | Bin 0 -> 9438 bytes docs/backtrans_align.rst | 8 + docs/changelog.rst | 127 + docs/conf.py | 239 ++ docs/convert_mogrify.rst | 76 + docs/extensions.rst | 27 + docs/extract_ids.rst | 7 + docs/index.rst | 135 ++ docs/info.rst | 24 + docs/make_extensions.py | 22 + docs/primer_trim.rst | 7 + docs/quality_filter.rst | 8 + examples/aligned.fasta | 2475 ++++++++++++++++++++ examples/apply-function/myfunctions.py | 19 + examples/apply-function/run_filter.sh | 8 + examples/dewrapped.fasta | 296 +++ examples/dewrapped.phy | 135 ++ examples/include-from-file/do_filter.sh | 3 + examples/include-from-file/selection.txt | 3 + examples/quality-filter/sample.barcodes.csv | 1 + examples/quality-filter/sample.fna | 24 + examples/quality-filter/sample.qual | 56 + examples/range.fasta | 7 + examples/test.fasta | 417 ++++ examples/wrapped.fasta | 8 + examples/wrapped.phy | 28 + logo/seqmagick_logo.png | Bin 0 -> 57891 bytes logo/seqmagick_logo.psd | Bin 0 -> 1339899 bytes logo/seqmagick_logo_blue.png | Bin 0 -> 52817 bytes logo/seqmagick_logo_dark_blue.png | Bin 0 -> 53947 bytes logo/seqmagick_logo_red.png | Bin 0 -> 58419 bytes logo/seqmagick_logo_sans_bg.png | Bin 0 -> 63633 bytes logo/seqmagick_logo_small.png | Bin 0 -> 14372 bytes requirements-rtd.txt | 3 + requirements.txt | 9 + seqmagick.py | 7 + seqmagick/__init__.py | 7 + seqmagick/fileformat.py | 80 + seqmagick/scripts/__init__.py | 0 seqmagick/scripts/cli.py | 73 + seqmagick/subcommands/__init__.py | 8 + 
seqmagick/subcommands/backtrans_align.py | 167 ++ seqmagick/subcommands/common.py | 238 ++ seqmagick/subcommands/convert.py | 362 +++ seqmagick/subcommands/extract_ids.py | 41 + seqmagick/subcommands/info.py | 179 ++ seqmagick/subcommands/mogrify.py | 34 + seqmagick/subcommands/primer_trim.py | 328 +++ seqmagick/subcommands/quality_filter.py | 776 ++++++ seqmagick/test/__init__.py | 0 seqmagick/test/integration/__init__.py | 13 + seqmagick/test/integration/data/__init__.py | 0 seqmagick/test/integration/data/input1.fasta | 7 + seqmagick/test/integration/data/input2.fasta | 6 + seqmagick/test/integration/data/input2.fasta.bz2 | Bin 0 -> 91 bytes seqmagick/test/integration/data/input2.fasta.gz | Bin 0 -> 84 bytes seqmagick/test/integration/data/input3.fasta | 6 + seqmagick/test/integration/data/input4_ambig.fasta | 4 + seqmagick/test/integration/data/input5.fasta | 14 + seqmagick/test/integration/data/input6.fasta | 6 + seqmagick/test/integration/data/output2.fasta | 6 + seqmagick/test/integration/data/output2.nex | 10 + seqmagick/test/integration/data/output2.phy | 4 + .../test/integration/data/output2_ungap_cut.fasta | 4 + seqmagick/test/integration/data/output3.fasta | 6 + seqmagick/test/integration/data/output3.nex | 10 + seqmagick/test/integration/data/output4.fasta | 4 + seqmagick/test/integration/data/output4.nex | 10 + seqmagick/test/integration/data/output5.fasta | 4 + seqmagick/test/integration/test_convert.py | 222 ++ seqmagick/test/integration/test_extract_ids.py | 47 + seqmagick/test/integration/test_info.py | 48 + seqmagick/test/integration/test_mogrify.py | 57 + seqmagick/test/test_primer_trim.py | 116 + seqmagick/test/test_subcommands_backtrans_align.py | 76 + seqmagick/test/test_subcommands_common.py | 201 ++ seqmagick/test/test_subcommands_convert.py | 139 ++ seqmagick/test/test_subcommands_quality_filter.py | 279 +++ seqmagick/test/test_transform.py | 673 ++++++ seqmagick/transform.py | 810 +++++++ setup.py | 48 + tox.ini | 15 + 94 files changed, 
10459 insertions(+) create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 CHANGELOG.rst create mode 100644 CONTRIB create mode 100644 DEVELOPING.rst create mode 100644 INSTALL create mode 100644 LICENSE create mode 100644 MANIFEST.in create mode 100644 README.rst create mode 100644 docs/.gitignore create mode 100644 docs/Makefile create mode 100644 docs/_static/.gitignore create mode 100644 docs/_static/fork.png create mode 100644 docs/backtrans_align.rst create mode 100644 docs/changelog.rst create mode 100644 docs/conf.py create mode 100644 docs/convert_mogrify.rst create mode 100644 docs/extensions.rst create mode 100644 docs/extract_ids.rst create mode 100644 docs/index.rst create mode 100644 docs/info.rst create mode 100755 docs/make_extensions.py create mode 100644 docs/primer_trim.rst create mode 100644 docs/quality_filter.rst create mode 100644 examples/aligned.fasta create mode 100644 examples/apply-function/myfunctions.py create mode 100755 examples/apply-function/run_filter.sh create mode 100644 examples/dewrapped.fasta create mode 100644 examples/dewrapped.phy create mode 100755 examples/include-from-file/do_filter.sh create mode 100644 examples/include-from-file/selection.txt create mode 100644 examples/quality-filter/sample.barcodes.csv create mode 100644 examples/quality-filter/sample.fna create mode 100644 examples/quality-filter/sample.qual create mode 100644 examples/range.fasta create mode 100644 examples/test.fasta create mode 100644 examples/wrapped.fasta create mode 100644 examples/wrapped.phy create mode 100644 logo/seqmagick_logo.png create mode 100644 logo/seqmagick_logo.psd create mode 100644 logo/seqmagick_logo_blue.png create mode 100644 logo/seqmagick_logo_dark_blue.png create mode 100644 logo/seqmagick_logo_red.png create mode 100644 logo/seqmagick_logo_sans_bg.png create mode 100644 logo/seqmagick_logo_small.png create mode 100644 requirements-rtd.txt create mode 100644 requirements.txt create mode 100755 
seqmagick.py create mode 100644 seqmagick/__init__.py create mode 100644 seqmagick/fileformat.py create mode 100644 seqmagick/scripts/__init__.py create mode 100644 seqmagick/scripts/cli.py create mode 100644 seqmagick/subcommands/__init__.py create mode 100644 seqmagick/subcommands/backtrans_align.py create mode 100644 seqmagick/subcommands/common.py create mode 100644 seqmagick/subcommands/convert.py create mode 100644 seqmagick/subcommands/extract_ids.py create mode 100644 seqmagick/subcommands/info.py create mode 100644 seqmagick/subcommands/mogrify.py create mode 100644 seqmagick/subcommands/primer_trim.py create mode 100644 seqmagick/subcommands/quality_filter.py create mode 100644 seqmagick/test/__init__.py create mode 100644 seqmagick/test/integration/__init__.py create mode 100644 seqmagick/test/integration/data/__init__.py create mode 100644 seqmagick/test/integration/data/input1.fasta create mode 100644 seqmagick/test/integration/data/input2.fasta create mode 100644 seqmagick/test/integration/data/input2.fasta.bz2 create mode 100644 seqmagick/test/integration/data/input2.fasta.gz create mode 100644 seqmagick/test/integration/data/input3.fasta create mode 100644 seqmagick/test/integration/data/input4_ambig.fasta create mode 100644 seqmagick/test/integration/data/input5.fasta create mode 100644 seqmagick/test/integration/data/input6.fasta create mode 100644 seqmagick/test/integration/data/output2.fasta create mode 100644 seqmagick/test/integration/data/output2.nex create mode 100644 seqmagick/test/integration/data/output2.phy create mode 100644 seqmagick/test/integration/data/output2_ungap_cut.fasta create mode 100644 seqmagick/test/integration/data/output3.fasta create mode 100644 seqmagick/test/integration/data/output3.nex create mode 100644 seqmagick/test/integration/data/output4.fasta create mode 100644 seqmagick/test/integration/data/output4.nex create mode 100644 seqmagick/test/integration/data/output5.fasta create mode 100644 
seqmagick/test/integration/test_convert.py create mode 100644 seqmagick/test/integration/test_extract_ids.py create mode 100644 seqmagick/test/integration/test_info.py create mode 100644 seqmagick/test/integration/test_mogrify.py create mode 100644 seqmagick/test/test_primer_trim.py create mode 100644 seqmagick/test/test_subcommands_backtrans_align.py create mode 100644 seqmagick/test/test_subcommands_common.py create mode 100644 seqmagick/test/test_subcommands_convert.py create mode 100644 seqmagick/test/test_subcommands_quality_filter.py create mode 100644 seqmagick/test/test_transform.py create mode 100644 seqmagick/transform.py create mode 100755 setup.py create mode 100644 tox.ini diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4dd73a4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +seqmagick/data/ver +*.pyc +.tox +build/ +dist/ +MANIFEST +doctrees/ +seqmagick.egg-info/ +docs/*.help diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..562f71b --- /dev/null +++ b/.travis.yml @@ -0,0 +1,24 @@ +language: python +python: + # - "2.7" + # - "pypy" + # - "3.4" + # - "3.5" + - "3.6" + - "3.7" + - "3.8" + +# Install numpy, then BioPython +# BioPython doesn't always play well with pip install. +install: + - "if [[ $TRAVIS_PYTHON_VERSION != 'pypy' ]]; then pip install -q numpy; fi" + - "pip install -q biopython nose pygtrie" + - "pip install ." 
+ +script: + - seqmagick --version + - nosetests + +notifications: + email: + - noah.hoffman@gmail.com diff --git a/CHANGELOG.rst b/CHANGELOG.rst new file mode 100644 index 0000000..4222705 --- /dev/null +++ b/CHANGELOG.rst @@ -0,0 +1,127 @@ +Changes for seqmagick +===================== + +0.8.0 +----- + +* Supports Python 3.5+ +* Drops support for Python 3.4 +* Fix issue: "seqmagick with no params gives KeyError:None" [GH-77] +* Fix for Biopython 1.71 dual coding support [GH-76]; also fixes issue: "Translation error with new BioPython" [GH-79] +* Send logging to stderr, not stdout [GH-75] + +0.7.0 +----- + +* Supports Python 3.4+ +* Drops support for python 2.7 +* requires biopython >= 1.70 +* Drops support for bz2 compression [see GH-66] +* New option ``convert --sample-seed`` to make ``--sample`` deterministic. + +0.6.2 +----- + +* New ``quality-filter --pct-ambiguous`` switch [GH-53] +* setup.py enforces biopython>=1.58,<=1.66 (1.67 is not compatible) [GH-59] +* This is the last release that will support Python 2! + +0.6.1 +----- + +* Allow string wrapping when input isn't FASTA. [GH-45] +* Fix ``--pattern-include``, ``--pattern-exclude``, and ``--pattern-replace`` + for sequences without descriptions (e.g., from NEXUS files). [GH-47] +* Fix mogrify example. [GH-52] + +0.6.0 +----- + +* Map ``.nex`` extension to NEXUS-format (--alphabet must be specified if writing) +* Use reservoir sampling in ``--sample`` selector (lower memory use) +* Support specifying negative indices to ``--cut`` [GH-33] +* Optionally allow invalid codons in ``backtrans-align`` [GH-34] +* Map ``.fq`` extension to FASTQ format +* Optional multithreaded I/O in ``info`` [GH-36] +* Print sequence name on length mismatch in ``backtrans-align`` [GH-37] +* Support for ``+`` and ``-`` in head and tail to mimic Linux head and tail commands. +* Fix scoring for mixed-case sequences in ``primer-trim``. +* Fix bug in ``primer-trim`` - failed when sequence had multiple 5' gaps compared to the primer. 
+* Clarify documentation and fix bug in convert/mogrify ``--pattern-replace`` [GH-39] +* Support for gzip files in ``seqmagick convert --sort`` + +0.5.0 +----- + +* Change ``seqmagick extract-ids --source-format`` to ``--input-format`` to match + other commands (GH-29) +* Support gzip- and bzip2-compressed inputs and outputs for most commands (GH-30) +* Change default input format for ``sff`` to ``sff-trim``, which respects the + clipping locations embedded in each sequence record. +* Add ``--details-out`` option to ``seqmagick quality-filter``, which writes + details on each read processed. +* Match barcode/primer ``seqmagick quality-filter`` against a trie; allows + per-specimen barcodes. +* Remove ``--failure-out`` option from ``seqmagick quality-filter``. See ``--details-out`` +* Raise an error if number of codons does not match number of amino acids in + ``seqmagick backtrans-align`` +* Add ``--sample`` subcommand (GH-31) + +0.4.0 +----- + +* Fix bug in ``--squeeze`` +* More informative messages in ``seqmagick primer-trim`` +* Added ``--alphabet`` flag to allow writing NEXUS (GH-23) +* Exiting without error on SIGPIPE in extract-ids, info (GH-17) +* Ambiguities are translated as 'X' in --translate (GH-16) +* Allowing '.' or '-' as gap character (GH-18) +* ``--name-prefix`` and ``--name-suffix`` no longer create a mangled description (GH-19) +* Files owned by another user can be mogrified, as long as they are group writeable (GH-14) +* Add ``backtrans-align`` subcommand, which maps unaligned nucleotides onto a + protein alignment (GH-20) +* Allow FASTQ as input to quality-filter +* Significantly expand functionality of quality-filter: identify and trim + barcodes/primers; report detailed failure information. 
+* Cleanup, additional tests +* Add ``--drop`` filter to convert and mogrify (GH-24) +* Apply current umask when creating files (GH-26) +* Support stdin in ``seqmagick info`` (GH-27) +* Support translating ambiguous nucleotides, if codon translation is unambiguous + +0.3.1 +----- + +* Fix bug in ``quality-filter`` MinLengthFilter +* Case consistency in seqmagick + +0.3.0 +----- + +* Internal reorganization - transformations are converted to partial functions, + then applied. +* Argument order now affects order of transformation application. +* Change default output format to 'align' for TTYs in seqmagick info +* Add BioPython as dependency (closes GH-7) +* Add ``primer-trim`` subcommand +* Add option to apply custom function(s) to sequences +* Add new filtering options: ``--squeeze-threshold``, ``--min-ungapped-length`` + ``--include-from-file`` ``--exclude-from-file`` +* Removed seqmagick muscle +* Added new subcommand ``quality-filter`` +* Added new subcommand ``extract-ids`` (closes GH-13) +* Allow use of '-' to indicate stdin / stdout (closes GH-11) +* Add mapping from .phyx to ``phylip-relaxed`` (targeted for BioPython 1.58) + +0.2.0 +----- + +* Refactoring +* Added hyphenation to multi-word command line options (e.g. + ``--deduplicatetaxa`` -> ``--deduplicate-taxa``) +* Add support for ``.needle``, ``.sff`` formats +* Close GH-4 + +0.1.0 +----- +Initial release diff --git a/CONTRIB b/CONTRIB new file mode 100644 index 0000000..a414031 --- /dev/null +++ b/CONTRIB @@ -0,0 +1 @@ +See https://github.com/fhcrc/seqmagick/graphs/contributors diff --git a/DEVELOPING.rst b/DEVELOPING.rst new file mode 100644 index 0000000..eada8dd --- /dev/null +++ b/DEVELOPING.rst @@ -0,0 +1,77 @@ +====================== + Developing seqmagick +====================== + +Requirements +============ + +Note that building docs, publishing to pypi, etc require some +additional dependencies. 
It's best to work in a virtualenv:: + + python3 -m venv py3-env + source py3-env/bin/activate + pip install -r requirements.txt + +Git workflow +============ + +We aspire to more or less use a `feature branch workflow +<https://www.atlassian.com/git/tutorials/comparing-workflows/feature-branch-workflow>`_ +for development. Briefly (for those working on the main fork): + +* Features or bugfixes should start life as a GitHub issue +* Work on the feature occurs in a "feature branch" named like + '%i-brief-description' % issue_number +* When completed (with tests passing) the feature branch is merged + into dev (a pull request at this point might be appropriate if you + want to request a code review). +* When it's time for a release, dev is merged into master (as a + result, the head of the `master` branch is always on a release + version). + +versioning +========== + +The package version is defined by the git tag (using +``git describe --tags --dirty``), in the form '<major>.<minor>.<patch>', +eg:: + + git tag -a -m 'version 0.7.0' 0.7.0 + +Because setup.py determines the package version at the time the +package tarball is created, the repo must be clean (ie, no uncommitted +changes to versioned files) and there must be no further commits after +adding the tag when preparing to upload a tarball to PyPi. + +preparing a release +=================== + +First, make sure you have committed all changes. + +Run tests, and make sure docs build without errors:: + + nosetests + (cd docs && make html) + +Push one last time to master to trigger tests on travis:: + + git push origin master + +Go to travis (https://travis-ci.org/fhcrc/seqmagick) and make sure the +tests have completed. + +Add a new tag (see above). 
Push the tag to GitHub:: + + git push --tags + +Build and upload a tarball to PyPi:: + + python setup.py clean + rm -r build dist + python setup.py sdist + twine upload dist/* + +Build and push docs to GitHub pages:: + + (cd docs && make html) + ghp-import --no-jekyll -p docs/_build/html diff --git a/INSTALL b/INSTALL new file mode 100644 index 0000000..7eeb38c --- /dev/null +++ b/INSTALL @@ -0,0 +1,19 @@ +The most convenient way to install is using pip (best to make sure you +are using the most recent version of pip): + + pip install -U pip + pip install seqmagick + +If the above doesn't work or you want to install from a clone of the +project: First, install BioPython (http://www.biopython.org). NumPy +(which parts of BioPython depend on) is not required for seqmagick to +function but will most likely be installed as a dependency of +BioPython. Once done, install system-wide with: + + sudo python setup.py install + +To install for a specific version of python: + /path/to/python setup.py install + +Yet another installation option: + python setup.py install --home=/home/username/local diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..10926e8 --- /dev/null +++ b/LICENSE @@ -0,0 +1,675 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. 
We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. 
The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. 
Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. 
A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. 
You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<http://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<http://www.gnu.org/philosophy/why-not-lgpl.html>.
+ diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..b7bb557 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,8 @@ +include seqmagick/test/integration/data/* + +global-exclude *.so +global-exclude *.pyd +global-exclude *.pyc +global-exclude .git* +global-exclude .DS_Store +global-exclude *.png diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..681d11b --- /dev/null +++ b/README.rst @@ -0,0 +1,71 @@ +========= +seqmagick +========= + +.. image:: https://travis-ci.org/fhcrc/seqmagick.svg?branch=master + :target: https://travis-ci.org/fhcrc/seqmagick + +We often have to convert sequence files between formats and do little +manipulations on them, and it's not worth writing scripts for that. +``seqmagick`` is a kickass little utility to expose the file format conversion +in BioPython in a convenient way. Instead of having a big mess of scripts, +there is one that takes arguments:: + + seqmagick convert a.fasta b.phy # convert from fasta to phylip + seqmagick mogrify --ungap a.fasta # remove all gaps from a.fasta, in place + seqmagick info *.{fasta,sto} # describe all FASTA and Stockholm + # files in the current directory + +Requirements +============ + +* Python >= 3.5 +* biopython >= 1.78 + +Installation +============ + +Use pip:: + + pip install seqmagick + +Note that as of version 0.8.0, this package requires Python 3.5+. 
If +you want to use the most recent version compatible with Python 2.7:: + + pip install seqmagick==0.6.2 + +Features +======== + +* Modifying sequences: Remove gaps, reverse complement, reverse, change case, + + - Remove gaps + - Reverse & reverse complement + - Trim to a range of residues + - Change case + - Sort by length or ID + - `more`_ + +* Displaying `information `_ about + sequence files +* Subsetting sequence files by: + + - Position + - ID + - Deduplication + - `more`_ + +* Filtering sequences by `quality score + `_ +* Trimming alignments to a `region of interest + `_ defined by the + forward and reverse primers + +Want to learn more? Head to the `Documentation`_. + +``seqmagick`` is free software under the GPL v3. + + +.. _`Documentation`: http://seqmagick.readthedocs.org/en/latest/ + +.. _`more`: http://seqmagick.readthedocs.org/en/latest/convert_mogrify.html diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..e35d885 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +_build diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..c77b783 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,130 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+ +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." 
+ +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/seqmagick.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/seqmagick.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/seqmagick" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/seqmagick" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + make -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." 
+ +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." diff --git a/docs/_static/.gitignore b/docs/_static/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/docs/_static/fork.png b/docs/_static/fork.png new file mode 100644 index 0000000..dc0763a Binary files /dev/null and b/docs/_static/fork.png differ diff --git a/docs/backtrans_align.rst b/docs/backtrans_align.rst new file mode 100644 index 0000000..4862cac --- /dev/null +++ b/docs/backtrans_align.rst @@ -0,0 +1,8 @@ +``backtrans-align`` +=================== + +Given a protein alignment and unaligned nucleotides, align the nucleotides +using the protein alignment. Protein and nucleotide sequence files must +contain the same number of sequences, in the same order, with the same IDs. + +.. literalinclude:: backtrans_align.help diff --git a/docs/changelog.rst b/docs/changelog.rst new file mode 100644 index 0000000..4222705 --- /dev/null +++ b/docs/changelog.rst @@ -0,0 +1,127 @@ +Changes for seqmagick +===================== + +0.8.0 +----- + +* Supports Python 3.5+ +* Drops support for Python 3.4 +* Fix issue: "seqmagick with no params gives KeyError:None" [GH-77] +* Fix for Biopython 1.71 dual coding support [GH-76]; also fixes issue: "Translation error with new BioPython" [GH-79] +* Send logging to stderr, not stdout [GH-75] + +0.7.0 +----- + +* Supports Python 3.4+ +* Drops support for python 2.7 +* requires biopython >= 1.70 +* Drops support for bz2 compression [see GH-66] +* New option ``convert --sample-seed`` to make ``--sample`` deterministic. 
+ +0.6.2 +----- + +* New ``quality-filter --pct-ambiguous`` switch [GH-53] +* setup.py enforces biopython>=1.58,<=1.66 (1.67 is not compatible) [GH-59] +* This is the last release that will support Python 2! + +0.6.1 +----- + +* Allow string wrapping when input isn't FASTA. [GH-45] +* Fix ``--pattern-include``, ``--pattern-exclude``, and ``--pattern-replace`` + for sequences without descriptions (e.g., from NEXUS files). [GH-47] +* Fix mogrify example. [GH-52] + +0.6.0 +----- + +* Map ``.nex`` extension to NEXUS-format (--alphabet must be specified if writing) +* Use reservoir sampling in ``--sample`` selector (lower memory use) +* Support specifying negative indices to ``--cut`` [GH-33] +* Optionally allow invalid codons in ``backtrans-align`` [GH-34] +* Map ``.fq`` extension to FASTQ format +* Optional multithreaded I/O in ``info`` [GH-36] +* Print sequence name on length mismatch in ``backtrans-align`` [GH-37] +* Support for ``+`` and ``-`` in head and tail to mimic Linux head and tail commands. +* Fix scoring for mixed-case sequences in ``primer-trim``. +* Fix bug in ``primer-trim`` - failed when sequence had multiple 5' gaps compared to the primer. +* Clarify documentation and fix bug in convert/mogrify ``--pattern-replace`` [GH-39] +* Support for gzip files in ``seqmagick convert --sort`` + +0.5.0 +----- + +* Change ``seqmagick extract-ids --source-format`` to ``--input-format`` to match + other commands (GH-29) +* Support gzip- and bzip2-compressed inputs and outputs for most commands (GH-30) +* Change default input format for ``sff`` to ``sff-trim``, which respects the + clipping locations embedded in each sequence record. +* Add ``--details-out`` option to ``seqmagick quality-filter``, which writes + details on each read processed. +* Match barcode/primer ``seqmagick quality-filter`` against a trie; allows + per-specimen barcodes. +* Remove ``--failure-out`` option from ``seqmagick quality-filter``.
See ``--details-out`` +* Raise an error if number of codons does not match number of amino acids in + ``seqmagick backtrans-align`` +* Add ``--sample`` subcommand (GH-31) + +0.4.0 +----- + +* Fix bug in ``--squeeze`` +* More informative messages in ``seqmagick primer-trim`` +* Added ``--alphabet`` flag to allow writing NEXUS (GH-23) +* Exiting without error on SIGPIPE in extract-ids, info (GH-17) +* Ambiguities are translated as 'X' in --translate (GH-16) +* Allowing '.' or '-' as gap character (GH-18) +* ``--name-prefix`` and ``--name-suffix`` no longer create a mangled description (GH-19) +* Files owned by another user can be mogrified, as long as they are group writeable (GH-14) +* Add ``backtrans-align`` subcommand, which maps unaligned nucleotides onto a + protein alignment (GH-20) +* Allow FASTQ as input to quality-filter +* Significantly expand functionality of quality-filter: identify and trim + barcodes/primers; report detailed failure information. +* Cleanup, additional tests +* Add ``--drop`` filter to convert and mogrify (GH-24) +* Apply current umask when creating files (GH-26) +* Support stdin in ``seqmagick info`` (GH-27) +* Support translating ambiguous nucleotides, if codon translation is unambiguous + +0.3.1 +----- + +* Fix bug in ``quality-filter`` MinLengthFilter +* Case consistency in seqmagick + +0.3.0 +----- + +* Internal reorganization - transformations are converted to partial functions, + then applied. +* Argument order now affects order of transformation application.
+* Change default output format to 'align' for TTYs in seqmagick info +* Add BioPython as dependency (closes GH-7) +* Add ``primer-trim`` subcommand +* Add option to apply custom function(s) to sequences +* Add new filtering options: ``--squeeze-threshold``, ``--min-ungapped-length`` + ``--include-from-file`` ``--exclude-from-file`` +* Removed seqmagick muscle +* Added new subcommand ``quality-filter`` +* Added new subcommand ``extract-ids`` (closes GH-13) +* Allow use of '-' to indicate stdin / stdout (closes GH-11) +* Add mapping from .phyx to ``phylip-relaxed`` (targeted for BioPython 1.58) + +0.2.0 +----- + +* Refactoring +* Added hyphenation to multi-word command line options (e.g. + ``--deduplicatetaxa`` -> ``--deduplicate-taxa``) +* Add support for ``.needle``, ``.sff`` formats +* Close GH-4 + +0.1.0 +----- +Initial release diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..c05ac71 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,239 @@ +# -*- coding: utf-8 -*- +# +# seqmagick documentation build configuration file, created by +# sphinx-quickstart on Thu May 19 16:18:13 2011. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os +import time +import subprocess + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +sys.path.insert(0, os.path.abspath('..')) +from seqmagick import __version__ + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. 
+#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'seqmagick' +copyright = u'2011-{}, The Matsen Group'.format(time.strftime('%Y')) + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# + +# The short X.Y version. +version = __version__.split('+')[0] + +# The full version, including alpha/beta/rc tags. +release = __version__ + +# Generate the <subcommand>.help files that the .rst pages literalinclude; fail the build if a subcommand errors +subcommands = [ + 'backtrans-align', + 'convert', + 'extract-ids', + 'info', + 'primer-trim', + 'quality-filter', +] + +for cmd in subcommands: + print('generating help text for {}'.format(cmd)) + p = subprocess.run([sys.executable, '../seqmagick.py', cmd, '-h'], + stdout=subprocess.PIPE, universal_newlines=True, check=True) + with open(cmd.replace('-', '_') + '.help', 'w') as f: + f.write(p.stdout) + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'sphinxdoc' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +html_logo = '../logo/seqmagick_logo_small.png' + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'seqmagickdoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +# The paper size ('letter' or 'a4'). +#latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +#latex_font_size = '10pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [ + ('index', 'seqmagick.tex', u'seqmagick Documentation', + u'Matsen Group', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Additional stuff for the LaTeX preamble. +#latex_preamble = '' + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'seqmagick', u'seqmagick Documentation', + [u'Matsen Group'], 1) +] diff --git a/docs/convert_mogrify.rst b/docs/convert_mogrify.rst new file mode 100644 index 0000000..2739f63 --- /dev/null +++ b/docs/convert_mogrify.rst @@ -0,0 +1,76 @@ +``convert`` and ``mogrify`` +=========================== + +Convert and mogrify achieve similar goals. ``convert`` performs some operation +on a file (from changing format to something more complicated) and writes to a +new file. ``mogrify`` modifies a file in place, and would not normally be used +to convert formats. + +The two have similar signatures:: + + seqmagick convert [options] infile outfile + +vs:: + + seqmagick mogrify [options] infile + +Options are shared between convert and mogrify. + +Examples +-------- + +Basic Conversion +^^^^^^^^^^^^^^^^ + +``convert`` can be used to convert between any file types BioPython supports +(which is many). For a full list of supported types, see the `BioPython SeqIO +wiki page`_. 
+ +By default, file type is inferred from file extension, so:: + + seqmagick convert a.fasta a.sto + +converts an existing file ``a.fasta`` from FASTA to Stockholm format. **Neat!** +But there's more. + +Sequence Modification +^^^^^^^^^^^^^^^^^^^^^ + +A wealth of options await you when you're ready to do something slightly more +complicated with your sequences. + +Let's say I just want a few of my sequences:: + + $ seqmagick convert --head 5 examples/test.fasta examples/test.head.fasta + $ seqmagick info examples/test*.fasta + name alignment min_len max_len avg_len num_seqs + examples/test.fasta FALSE 972 9719 1573.67 15 + examples/test.head.fasta FALSE 978 990 984.00 5 + +Or I want to remove any gaps, reverse complement, select the last 5 sequences, +and remove any duplicates from an alignment in place:: + + seqmagick mogrify --tail 5 --reverse-complement --ungap --deduplicate-sequences examples/test.fasta + +You can even define your own functions in python and use them via +``--apply-function``. + +.. note:: + To maximize flexibility, most transformations passed as options to + ``mogrify`` and ``convert`` are processed *in order*, so:: + + seqmagick convert --min-length 50 --cut 1:5 a.fasta b.fasta + + will work fine, but:: + + seqmagick convert --cut 1:5 --min-length 50 a.fasta b.fasta + + will never return records, since the cutting transformation happens before + the minimum length predicate is applied. + +Command-line Arguments +********************** + +.. literalinclude:: convert.help + +.. 
_`BioPython SeqIO wiki page`: http://www.biopython.org/wiki/SeqIO#File_Formats diff --git a/docs/extensions.rst b/docs/extensions.rst new file mode 100644 index 0000000..f9a596a --- /dev/null +++ b/docs/extensions.rst @@ -0,0 +1,27 @@ +========= ============== +Extension Format +========= ============== +.afa fasta +.aln clustal +.fa fasta +.faa fasta +.fas fasta +.fasta fasta +.fastq fastq +.ffn fasta +.fna fasta +.fq fastq +.frn fasta +.gb genbank +.gbk genbank +.needle emboss +.nex nexus +.phy phylip +.phylip phylip +.phyx phylip-relaxed +.qual qual +.sff sff-trim +.sth stockholm +.sto stockholm +========= ============== + diff --git a/docs/extract_ids.rst b/docs/extract_ids.rst new file mode 100644 index 0000000..b557402 --- /dev/null +++ b/docs/extract_ids.rst @@ -0,0 +1,7 @@ +``extract-ids`` +=============== + +``seqmagick extract-ids`` is extremely simple - all the IDs from a sequence file +are printed to stdout (by default) or the file of your choosing: + +.. literalinclude:: extract_ids.help diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..c686e89 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,135 @@ +.. seqmagick documentation master file, created by + sphinx-quickstart on Thu May 19 16:18:13 2011. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + + +.. Contents: + + +.. "Fork me on github" + + +.. raw:: html + + Fork me on GitHub + + +========= +seqmagick +========= + +.. contents:: + :depth: 4 + :class: new + +.. toctree:: + :maxdepth: 1 + + changelog + + +Motivation +========== + +We often have to convert between sequence formats and do little tasks on them, +and it's not worth writing scripts for that. Seqmagick is a kickass little +utility built in the spirit of imagemagick_ to expose the file format +conversion in Biopython in a convenient way. 
Instead of having a big mess of +scripts, there is one that takes arguments:: + + seqmagick convert a.fasta b.phy # convert from fasta to phylip + seqmagick mogrify --ungap a.fasta # remove all gaps from a.fasta, in place + seqmagick info *.fasta # describe all FASTA files in the current directory + +And more. + +Installation +============ + +Install the latest release with:: + + pip install seqmagick + +This should also install `BioPython`_. NumPy (which parts of BioPython +depend on) is not required for ``seqmagick`` to function, but may be +installed as a dependency of ``BioPython``. + +To install the bleeding edge version:: + + pip install git+https://github.com/fhcrc/seqmagick.git@master#egg=seqmagick + +Note that as of version 0.8.0, this package requires Python 3.5+. If +you want to use the most recent version compatible with Python 2.7:: + + pip install seqmagick==0.6.2 + +Use +=== + +Seqmagick can be used to query information about sequence files, convert +between types, and modify sequence files. All functions are accessed through +subcommands:: + + seqmagick <subcommand> [options] arguments + +List of Subcommands +=================== + +.. toctree:: + :maxdepth: 2 + + convert_mogrify + backtrans_align + extract_ids + info + quality_filter + primer_trim + +Supported File Extensions +========================= + +By default, ``seqmagick`` infers the file type from extension. Currently mapped +extensions are: + +.. include:: extensions.rst + +.. note:: + + NEXUS-format output requires the ``--alphabet`` flag. + +Default Format +-------------- + +When reading from stdin or writing to stdout, ``seqmagick`` defaults to fasta +format. This behavior may be overridden with the ``--input-format`` and +``--output-format`` flags. + +If an extension is not listed, you can either rename the file to a supported +extension, or specify it manually via ``--input-format`` or ``--output-format``.
+ +Compressed file support +----------------------- + +Most commands support gzip (files ending in ``.gz``) and bzip2 (files ending in +``.bz2`` or ``.bz``) compressed inputs and outputs. File types for these files +are inferred using the extension of the file after stripping the file extension +indicating that the file is compressed, so ``input.fasta.gz`` would be inferred +to be in FASTA format. + +Acknowledgements +================ + +seqmagick is written and maintained by the `Matsen Group`_ at the Fred +Hutchinson Cancer Research Center. + + +Contributing +============ + +We welcome contributions! Simply fork the repository `on GitHub`_ and send a pull request. + +.. _`on GitHub`: http://github.com/fhcrc/seqmagick/ +.. _`Matsen Group`: http://matsen.fhcrc.org/ +.. _imagemagick: http://www.imagemagick.org/script/command-line-tools.php +.. _`BioPython`: http://www.biopython.org/ diff --git a/docs/info.rst b/docs/info.rst new file mode 100644 index 0000000..efbe656 --- /dev/null +++ b/docs/info.rst @@ -0,0 +1,24 @@ +``info`` +======== + +``seqmagick info`` describes one or more sequence files. + +Example +******* +:: + + seqmagick info examples/*.fasta + + name alignment min_len max_len avg_len num_seqs + examples/aligned.fasta TRUE 9797 9797 9797.00 15 + examples/dewrapped.fasta TRUE 240 240 240.00 148 + examples/range.fasta TRUE 119 119 119.00 2 + examples/test.fasta FALSE 972 9719 1573.67 15 + examples/wrapped.fasta FALSE 120 237 178.50 2 + +Output can be in comma-separated, tab-separated, or aligned formats. See +``seqmagick info -h`` for details. + +Usage: + +.. 
literalinclude:: info.help diff --git a/docs/make_extensions.py b/docs/make_extensions.py new file mode 100755 index 0000000..e5a464a --- /dev/null +++ b/docs/make_extensions.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +"""Regenerate extensions.rst, the reST table mapping file extensions to formats.""" + +from seqmagick import fileformat + +extension_map = fileformat.EXTENSION_TO_TYPE +# dict.items() is a view on Python 3 and has no sort() method; sorted() builds the ordered list. +items = sorted(extension_map.items()) + +max_key_length = max((len('Extension'), max(len(k) for k in extension_map.keys()))) +max_val_length = max((len('Format'), max(len(v) for v in extension_map.values()))) +format_string = '{0:' + str(max_key_length) + 's} {1:' + str(max_val_length) + 's}' + +with open('extensions.rst', 'w') as fp: + def print_row(k, v): + """Write one two-column row of the table to extensions.rst.""" + print(format_string.format(k, v), file=fp) + print_row('=' * max_key_length, '=' * max_val_length) + print_row('Extension', 'Format') + print_row('=' * max_key_length, '=' * max_val_length) + for k, v in items: + print_row(k, v) + print_row('=' * max_key_length, '=' * max_val_length) + print('', file=fp) diff --git a/docs/primer_trim.rst b/docs/primer_trim.rst new file mode 100644 index 0000000..b212d9b --- /dev/null +++ b/docs/primer_trim.rst @@ -0,0 +1,7 @@ +``primer-trim`` +--------------- + +``primer-trim`` trims an alignment to a region defined by a set of forward and +reverse primers. Usage is as follows: + +.. literalinclude:: primer_trim.help diff --git a/docs/quality_filter.rst b/docs/quality_filter.rst new file mode 100644 index 0000000..625a1e7 --- /dev/null +++ b/docs/quality_filter.rst @@ -0,0 +1,8 @@ +``quality-filter`` +------------------ + +``quality-filter`` truncates and removes sequences that don't match a set of +quality criteria. The subcommand takes a FASTA and quality score file, and +writes the results to an output file: + +.. 
literalinclude:: quality_filter.help diff --git a/examples/aligned.fasta b/examples/aligned.fasta new file mode 100644 index 0000000..bfd200c --- /dev/null +++ b/examples/aligned.fasta @@ -0,0 +1,2475 @@ +>gi|66864576|gb|DQ027786.1| HIV-1 isolate QA013_2282M_C6 from Kenya envelope glycoprotein (env) gene, partial cdsgi|66864574|gb|DQ027785.1| HIV-1 isolate QA013_1790M_B6 from Kenya envelope glycoprotein (env) gene, partial cds +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ 
+------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ 
+------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ 
+------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +-------------------------------------------------------ACTCT +GTGTCACGTTAAACTGCATTGAATGGATAAATAATAATAGCACTA--------------- +-------------------ACGCCACTGGT--------------------------CAGG +ACATGAATGAAAAACTGCTCTTTCAATATAGCCACAGAAGTAAGAGATAAGAAAAAGCAA +GTACAGGCAC--TTTTATAAACTTGATGTAGTACAGATAGATAATA---GTAATACTAGC +TATAGATTAATAAATTGTAATACCT--GCCATTACACAGGCATGTCCAAAGGTAACCTTT +GAGCCAATTCCCATACATTATTGTGCCCCAGCTGG--TTGCAATTCTAAAATGTAATAAT +AAGAAATTCAATGGGACGGGTCCATGCAAAAACGTCAGCACAGT--AGTGTACACATGGG +ATTAGGCCAGTAGTGTCAACTCAGCTGTTGTTGAATGGCAGTCTAGCAGAAGAAGA--TA +ATAATTAGATCTGAAAATTTAACAAATAATGCAAAAATTATAATAGTACAGCTTAATGAG +TCTGT--CAATTAGTTGCACAAGGCCCTACAACAATACAAGAAAAGGTGAACATAT---- +--GGGACCAGGGCGAGCACTCT-----ACAGAAAAAATAGTTGGAGATATAAGACAAGCA 
+CATTGTAACATTAGTGGAAGGGAATGGAATAA--CGTTACAGCAGGTAGCTGACAAATTA +AGAAACCTTC------------TTAATAAAACAACAATAATTTTTAAACCACCTGCGAGG +AGACCTAGAAATTACAACACACAGCTTTAATTGTCTAGGGGAATTTTTCTACTGCAACAC +ATCAA--CTGTTTAATAATAGTAAATGGGAATCAAATAGT---AGTACAGGGGA---AAA +TAA---------AAATGAAGATATAATCACTCCCATGCAGAATAAAACAAATTATAAACA +TGTGGCAGGGAGTAGGAAAAGCAATGTATGCCCCTCC--TTGAAGGACACATCAGTTGTT +CGTCAATTATTACAGGACTATTGTTGACAAGAGATGGTGGTG-TAACT--TCGGAG---- +-------------TTCGGAGACCTTCAGACCTGGGGGAGGAGAC---------------- +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ 
+------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +----------------- +>gi|66864568|gb|DQ027782.1| HIV-1 isolate QA013_385M_C3 from Kenya envelope glycoprotein (env) gene, partial cdsgi|66864564|gb|DQ027780.1| HIV-1 isolate QA013_105M_C2_1 from Kenya envelope glycoprotein (env) gene, partial cdsgi|66864562|gb|DQ027779.1| HIV-1 isolate QA013_70M_B1 from Kenya envelope glycoprotein (env) gene, partial cdsgi|66864566|gb|DQ027781.1| HIV-1 isolate QA013_264M_A2 from Kenya envelope glycoprotein (env) gene, partial cds +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ 
+------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ 
+------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ 
+------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +-------------------------------------------------------ACTCT 
+GTGTCACTTTAAACTGCACTGAATGGAAGAATAATGGTAGCACTA--------------- +-------------------ACGTCACTGAT--------------------------CAGG +ACATGAATGAAAAACTGCTCTTTCAATATAACCACAGAAGTAAGAGATAAGAAGAAGCAA +GTACAGGCAC--TTTTATAAACTTGATGTAGTACAGATAGATAATA---GTA---CCAGC +TATAGATTAATAAATTGTAATACCTCTG--ATTACACAGGCATGTCCAAAGGTAACCTTT +GAGCCAATTCCCATACATTATTGTGCCCCAGCTGG--ATTCAATTCTAAAATGTAATGAT +AAGAAGTTCAATGGGACGGGTCCATGCAAAAACGTCAGCACAGTACA--GTACACATGGG +ATTAGGCCAGTAGTGTCAACTCAGCTGTTGTTGAATGGCAGTCTAGCAGAAGAAGA--GA +ATAATTAGATCTGAAAATCTAACAAATAATGCAAAAATTATAATAGTACAGCTTAATGAG +TCTGT--ACCTTAATTGCACAAGGCCCTACAACAATACAAGAAAAGGTGAACATAT---- +--GGGACCAGGGCGAGCACTCT-----TTAGAAAGAATAGTTGGAGATATAAGACAAGCA +TATTGTAGCATTAGTGGAATGGGATGGAATAAAACTT--CAGCAGGTAGCTGACAAATTA +AGAAACCTTC------------TTAATAAAACAACAATAATTTTTAAACCACCCGCGGGG +GGACCTAGAAATTACAACACACAGCTTTAATTGTGGAGGGGAATTTTTCTACTGCAACAC +ATCAAGA--GTTTAATAATAGTGAATGGAAATCAAATAGT---AGTACAGGGGG------ +------------AAATGAAAGTATAATCATACTCCCATAGAATAAAACAAATTATAAACA +TGTGGCAGGGAGTAGGAAAAGCAATGTATGCCCCTCCCATTGAAGGAC--ATCAATTGTT +CATCAAATATTACAGGACTATTGTTGACAAGAGATGGTGGTG-TAACTAATCAGAG---- +------------TGAATGAGACCTTCAGACCTGGGGGAGGAGAC---------------- +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ 
+------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +----------------- +>gi|66864572|gb|DQ027784.1| HIV-1 isolate QA013_987M_C4 from Kenya envelope glycoprotein (env) gene, partial cdsgi|66864570|gb|DQ027783.1| HIV-1 isolate QA013_559M_C11 from Kenya envelope glycoprotein (env) gene, partial cdshxb2 +TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACA +CACAAGGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGATCAGATATCCAC +TGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGAGAAGTTAGAAGAAGCCA +ACAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGAATGGATGACCCGG +AGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAG +AGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCG +CTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGAT 
+CCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGA +GCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCT +TGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTC +AGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACCTGAAAG +CGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCGGCTTGCTGAAGCGCGCACGG +CAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAATTTTGACTAGCGGAGGCTAGA +AGGAGAGAGATGGGTGCGAGAGCGTCAGTATTAAGCGGGGGAGAATTAGATCGATGGGAA +AAAATTCGGTTAAGGCCAGGGGGAAAGAAAAAATATAAATTAAAACATATAGTATGGGCA +AGCAGGGAGCTAGAACGATTCGCAGTTAATCCTGGCCTGTTAGAAACATCAGAAGGCTGT +AGACAAATACTGGGACAGCTACAACCATCCCTTCAGACAGGATCAGAAGAACTTAGATCA +TTATATAATACAGTAGCAACCCTCTATTGTGTGCATCAAAGGATAGAGATAAAAGACACC +AAGGAAGCTTTAGACAAGATAGAGGAAGAGCAAAACAAAAGTAAGAAAAAAGCACAGCAA +GCAGCAGCTGACACAGGACACAGCAATCAGGTCAGCCAAAATTACCCTATAGTGCAGAAC +ATCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGGGTAAAA +GTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTGATACCCATGTTTTCAGCATTATCAGAA +GGAGCCACCCCACAAGATTTAAACACCATGCTAAACACAGTGGGGGGACATCAAGCAGCC +ATGCAAATGTTAAAAGAGACCATCAATGAGGAAGCTGCAGAATGGGATAGAGTGCATCCA +GTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGACATAGCA +GGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGACAAATAATCCACCTATCCCA +GTAGGAGAAATTTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGTAAGAATGTAT +AGCCCTACCAGCATTCTGGACATAAGACAAGGACCAAAGGAACCCTTTAGAGACTATGTA +GACCGGTTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAGGAGGTAAAAAATTGGATG +ACAGAAACCTTGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTG +GGACCAGCGGCTACACTAGAAGAAATGATGACAGCATGTCAGGGAGTAGGAGGACCCGGC +CATAAGGCAAGAGTTTTGGCTGAAGCAATGAGCCAAGTAACAAATTCAGCTACCATAATG +ATGCAGAGAGGCAATTTTAGGAACCAAAGAAAGATTGTTAAGTGTTTCAATTGTGGCAAA +GAAGGGCACACAGCCAGAAATTGCAGGGCCCCTAGGAAAAAGGGCTGTTGGAAATGTGGA +AAGGAAGGACACCAAATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATC +TGGCCTTCCTACAAGGGAAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCAACAGCC +CCACCAGAAGAGAGCTTCAGGTCTGGGGTAGAGACAACAACTCCCCCTCAGAAGCAGGAG +CCGATAGACAAGGAACTGTATCCTTTAACTTCCCTCAGGTCACTCTTTGGCAACGACCCC +TCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATG 
+ATACAGTATTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAA +TTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATA +AAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGT +TGACTCAGATTGGTTGCACTTTAAATTTTCCCATTAGCCCTATTGAGACTGTACCAGTAA +AATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAA +TAAAAGCATTAGTAGAAATTTGTACAGAGATGGAAAAGGAAGGGAAAATTTCAAAAATTG +GGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAAT +GGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTC +AATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATG +TGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTA +CCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCAC +AGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTT +TTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGAT +CTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGA +GGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGG +GTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACA +GCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAATTGAATTGGGCAAGTCAGATTT +ACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAGAGGAACCAAAGCACTAACAG +AAGTAATACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGAGAGATTCTAA +AAGAACCAGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGA +AGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAA +CAGGAAAATATGCAAGAATGAGGGGTGCCCACACTAATGATGTAAAACAATTAACAGAGG +CAGTGCAAAAAATAACCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAAC +TGCCCATACAAAAGGAAACATGGGAAACATGGTGGACAGAGTATTGGCAAGCCACCTGGA +TTCCTGAGTGGGAGTTTGTTAATACCCCTCCCTTAGTGAAATTATGGTACCAGTTAGAGA +AAGAACCCATAGTAGGAGCAGAAACCTTCTATGTAGATGGGGCAGCTAACAGGGAGACTA +AATTAGGAAAAGCAGGATATGTTACTAATAGAGGAAGACAAAAAGTTGTCACCCTAACTG +ACACAACAAATCAGAAGACTGAGTTACAAGCAATTTATCTAGCTTTGCAGGATTCGGGAT +TAGAAGTAAACATAGTAACAGACTCACAATATGCATTAGGAATCATTCAAGCACAACCAG +ATCAAAGTGAATCAGAGTTAGTCAATCAAATAATAGAGCAGTTAATAAAAAAGGAAAAGG +TCTATCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAAT +TAGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGATG 
+AACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTGCCACCTG +TAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGAGAAGCCATGC +ATGGACAAGTAGACTGTAGTCCAGGAATATGGCAACTAGATTGTACACATTTAGAAGGAA +AAGTTATCCTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTTATTCCAG +CAGAAACAGGGCAGGAAACAGCATATTTTCTTTTAAAATTAGCAGGAAGATGGCCAGTAA +AAACAATACATACTGACAATGGCAGCAATTTCACCGGTGCTACGGTTAGGGCCGCCTGTT +GGTGGGCGGGAATCAAGCAGGAATTTGGAATTCCCTACAATCCCCAAAGTCAAGGAGTAG +TAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAAC +ATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGA +TTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTA +AAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCA +GAAATCCACTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAA +TACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATTAGGGATT +ATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATTAGAACA +TGGAAAAGTTTAGTAAAACACCATATGTATGTTTCAGGGAAAGCTAGGGGATGGTTTTAT +AGACATCACTATGAAAGCCCTCATCCAAGAATAAGTTCAGAAGTACACATCCCACTAGGG +GATGCTAGATTGGTAATAACAACATATTGGGGTCTGCATACAGGAGAAAGAGACTGGCAT +TTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGATATAGCACACAAGTAGACCCT +GAACTAGCAGACCAACTAATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATA +AGAAAGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAAC +AAGGTAGGATCTCTACAATACTTGGCACTAGCAGCATTAATAACACCAAAAAAGATAAAG +CCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGATGGAACAAGCCCCAGAAGACC +AAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAGAGCTTTTAGAGGAGCTTAAGA +ATGAAGCTGTTAGACATTTTCCTAGGATTTGGCTCCATGGCTTAGGGCAACATATCTATG +AAACTTATGGGGATACTTGGGCAGGAGTGGAAGCCATAATAAGAATTCTGCAACAACTGC +TGTTTATCCATTTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACTCGACAGAGG +AGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCA +GCCTAAAACTGCTTGTACCAATTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTG +TTTCATAACAAAAGCCTTAGGCATCTCCTATGGCAGGAAGAAGCGGAGACAGCGACGAAG +AGCTCATCAGAACAGTCAGACTCATCAAGCTTCTCTATCAAAGCAGTAAGTAGTACATGT +AACGCAACCTATACCAATAGTAGCAATAGTAGCATTAGTAGTAGCAATAATAATAGCAAT +AGTTGTGTGGTCCATAGTAATCATAGAATATAGGAAAATATTAAGACAAAGAAAAATAGA 
+CAGGTTAATTGATAGACTAATAGAAAGAGCAGAAGACAGTGGCAATGAGAGTGAAGGAGA +AATATCAGCACTTGTGGAGATGGGGGTGGAGATGGGGCACCATGCTCCTTGGGATGTTGA +TGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGA +AGGAAGCAACCACCACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTAC +ATAATGTTTGGGCCACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAGTAGTAT +TGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGATGCATG +AGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCT +GTGTTAGTTTAAAGTGCACTGATTTGAAG---AATGATACTAATA--------------- +-------------------CCAATAGTAGT------AGCGGGAGAATGATAATGGAGAAA +GGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAAGAGGTAAGGTGCAGAAA +GAATATGCATTTTTTTATAAACTTGATATAATACCAATAGATAATG---ATACTACCAGC +TATAAGTTGACAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTT +GAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAAT +AAGACGTTCAATGGAACAGGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGA +ATTAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAGAGGTA +GTAATTAGATCTGTCAATTTCACGGACAATGCTAAAACCATAATAGTACAGCTGAACACA +TCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGAATCCGTATCCAG +AGAGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATA---GGAAATATGAGACAAGCA +CATTGTAACATTAGTAGAGCAAAATGGAATAACACTTTAAAACAGATAGCTAGCAAATTA +AGAGAACAATTTGGAA------ATAATAAAACAATAATCTTTAAGCAATCCTCAG-GAGG +GGACCCAGAAATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTC +AACACAACTGTTTAATAGTACT---TGGTTTAATAGTACTTGGAGTACTGAAGGGTCAAA +TAACACTGAAGGAAGTGACACAATCACCCT-CCCATGCAGAATAAAACAAATTATAAACA +TGTGGCAGAAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTT +CATCAAATATTACAGGGCTGCTATTAACAAGAGATGGTGGTAATAGCA--ACAATG---- +------------AGTCCGAGATCTTCAGACCTGGAGGAGGAGATATGAGGGACAATTGGA +GAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAGTAGCACCCACCA +AGGCAAAGAGAAGAGTGGTGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTTCC +TTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCGCAGCCTCAATGACGCTGACGGTAC +AGGCCAGACAATTATTGTCTGGTATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTG +AGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATCAAGCAGCTCCAGGCAAGAA +TCCTGGCTGTGGAAAGATACCTAAAGGATCAACAGCTCCTGGGGATTTGGGGTTGCTCTG 
+GAAAACTCATTTGCACCACTGCTGTGCCTTGGAATGCTAGTTGGAGTAATAAATCTCTGG +AACAGATTTGGAATCACACGACCTGGATGGAGTGGGACAGAGAAATTAACAATTACACAA +GCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAAT +TATTGGAATTAGATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGT +GGTATATAAAATTATTCATAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTG +CTGTACTTTCTATAGTGAATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCC +ACCTCCCAACCCCGAGGGGACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGA +GAGACAGAGACAGATCCATTCGATTAGTGAACGGATCCTTGGCACTTATCTGGGACGATC +TGCGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTACTCTTGATTGTAACGA +GGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCTCC +TACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAGCTTGCTCAATGCCACAGCCA +TAGCAGTAGCTGAGGGGACAGATAGGGTTATAGAAGTAGTACAAGGAGCTTGTAGAGCTA +TTCGCCACATACCTAGAAGAATAAGACAGGGCTTGGAAAGGATTTTGCTATAAGATGGGT +GGCAAGTGGTCAAAAAGTAGTGTGATTGGATGGCCTACTGTAAGGGAAAGAATGAGACGA +GCTGAGCCAGCAGCAGATAGGGTGGGAGCAGCATCTCGAGACCTGGAAAAACATGGAGCA +ATCACAAGTAGCAATACAGCAGCTACCAATGCTGCTTGTGCCTGGCTAGAAGCACAAGAG +GAGGAGGAGGTGGGTTTTCCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACAAG +GCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCAC +TCCCAAAGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCT +GATTAGCAGAACTACACACCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGC +TACAAGCTAGTACCAGTTGAGCCAGATAAGATAGAAGAGGCCAATAAAGGAGAGAACACC +AGCTTGTTACACCCTGTGAGCCTGCATGGGATGGATGACCCGGAGAGAGAAGTGTTAGAG +TGGAGGTTTGACAGCCGCCTAGCATTTCATCACGTGGCCCGAGAGCTGCATCCGGAGTAC +TTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGG +AGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCT +GCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGG +CTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGT +GTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGT +GTGGAAAATCTCTAGCA +>gi|66864561|gb|DQ027778.1| HIV-1 isolate QA013_2282M_A10 from Kenya nonfunctional envelope glycoprotein (env) gene, partial sequencegi|66864559|gb|DQ027777.1| HIV-1 isolate QA013_1790M_C1 from Kenya envelope glycoprotein (env) gene, partial 
cds +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ 
+------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ 
+------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ 
+------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +-------------------------------------------------------TCTCT +GCGTTACCTTAGATTGTCATCAT---------AATGTCACCACTGTCACCACTAACAATA +CCAATGACACTGCATATCACCAATGCCAAT---------------ATCACCAATGACAAG +GCCGACATGAAAAACTGCTCTTACAATGTGACCACA--AATAAGGGATAAGCAACAGAAA +GTATACTCACTTTTTTATAGACTTGATCTAGTACCAACTGAAAGTA---G-----ATAGT +TATAGATTAATAAATTGTAACACCTCCGTCATTAAACAGGCTTGTCCAAAAGTAACCTTT +GA--CAATTCCCATACATTATTGTGCCCCAGCTAGTTTTGCGATTCTAAAGTGTAAAGAT +AAGGAGTTCAAT--AACAGGGCCATGCAAGAATGTCAGCACAGTACAATGCACACATGGG +ATCAAGCCAGTAGTATCAACTC--CTGCTGTTAAATGGCAGCCTAGCAGAAGGAGAGGTA +AGAATTAGATCTGAAAATATCACAAACAATGC--AAAACATAATAGTACAATTGCCCACA +CCTATAACAATTACTTGTACCAGACCCAACAACAATACAAGAAA--GTGTACATAT---- +--AGGACCAGGACAAGCATTCTATGCAACAGGTGCAATAATAGGGGATATAAGACGAGC- +-ATTGTAATGTTAATAAAACACAATGGAATACAGCTTTGCAAAAGGTAGCTAACAAATTA +AAAACATA--------------TTAACAAAACAATAATCTTTACTCACTCATCAG-GAGG +GGATATAGAAATTACAACACATAGTTTTAAT--TAGAGGAGAGTTTTTCTATTGTAATAC +ATCAGGCCTGTTTAATAGCACT---TGGGTAAACAATACC---AACACAGGGGA--CAAC +TAACACAGAATCAAATGACAATATAACGAT-CCCATGCAGAATAAAGCAAATTATAA--A +TGTGGCAGAGAACAGGACAAGCAATATATGCCCCTCCCATACAAGGAATAATAAGGTGTG +AATCAA--ATTACAGGACTAATATTAACAAAAGATAGTAAGAATAATG--CTAATA---- 
+------------GAACTGAAACCTTCAGACCTAGAG--AGAGAT---------------- +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ 
+------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +----------------- +>gi|66864557|gb|DQ027776.1| HIV-1 isolate QA013_987M_B12 from Kenya envelope glycoprotein (env) gene, partial cdsgi|66864555|gb|DQ027775.1| HIV-1 isolate QA013_765M_B1 from Kenya envelope glycoprotein (env) gene, partial cds +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ 
+------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ 
+------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ 
+------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +-------------------------------------------------------TCTCT +GCGTTACTTTAGATTGTCAT------------AATGTCACCAATG--------------- +-------------------ACAATGCCAAT------------------------------ +---ATCATGAAAAACTGCTCTT--AATGTGACCACAGTAATAAGGGATAAGCAACAGAAA +GTATACTCACTTTTTTATAGACTTGATATAGTA--AACTGAAAGTA---ATA---CTAGT +TATAGATTAATAAATTGTAACACCTCCGTCATTAAACAGGCTTGTCCAA--GTAACCTTT +GAGCCAATTCCCATACATTATTGTGCCCCAGCTGGTTTTGCGACTCTAAAGTGTAAAG-- +AAGGAGTTCAATGGAACAGGGCCATGCAAGAATGTCAGCACAGTACAATGCGCACATGGG +ATCAAGCCA--AGTATCAACTCAGCTGCTGTTAAATGGCAGCCTAGCAGAAGGAGAGGTA +AGAATTAGATCTGAAAATA--ACAAACAATGCCAAAAACCTAATAGTACAATTGACCACA +CCTATAAAAATTAATTGTACCAGACCTA--AACAATACAAGAAAAAGTGTACATAT---- +--AGGACCAGGACAAGCATTCTATGCAACAGGTGAAATAATAGGG--TATAAGACAAGCA 
+CATTGTAATGTTAGTGAAACACAATGGCATAAAACTTTGCAAGAGGTAGCTAAC--ATTA +AAAACATACT------------TTAACAAAACAATAATCTTTGATCACTCATCAG-GAGG +GGATATAGAAATTACA--ACATAGTTTTAATTGTGGAGGAGAGTTTTTCTATTGTAATAC +ATCAGGCCTGTTTAATAGCACT---TGGGAA--CGATGCC---AGCACACAGGAGTCAAA +TAACACAGAATCAAATACCACTATAACGAT-CCCATGCAGAATAA--CAAGTTATAAATA +TGTGGCAGAGAACAGGACAAGCAATATATGCCCCTCCCATACAAGGAATAATAAG--GTG +AATCAAATATTACAGGACTAATATTAACAAGAGATGGTGGGGATAATT--CTAGGG---- +------------AAAATGAAACC--CAGACCTGGAGGAGGAGAT---------------- +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ 
+------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +------------------------------------------------------------ +----------------- +>gi|66864553|gb|DQ027774.1| HIV-1 isolate QA013_559M_A1 from Kenya envelope glycoprotein (env) gene, partial cdsgi|66864614|gb|DQ027773.1| HIV-1 isolate QA013_385M_B5 from Kenya envelope glycoprotein (env) gene, partial cdsdiff --git a/examples/apply-function/myfunctions.py b/examples/apply-function/myfunctions.py new file mode 100644 index 0000000..e0f2a50 --- /dev/null +++ b/examples/apply-function/myfunctions.py @@ -0,0 +1,19 @@ +""" +A collection of functions to apply +""" +import hashlib + +def no_gaps(records): + for record in records: + if not '-' in str(record.seq): + yield record + +def hash_starts_numeric(records): + """ + Very useful function that only accepts records with a numeric start to + their sha-1 hash. 
+ """ + for record in records: + seq_hash = hashlib.sha1(str(record.seq).encode('utf-8')).hexdigest() + if seq_hash[0].isdigit(): + yield record diff --git a/examples/apply-function/run_filter.sh b/examples/apply-function/run_filter.sh new file mode 100755 index 0000000..0c27bf2 --- /dev/null +++ b/examples/apply-function/run_filter.sh @@ -0,0 +1,8 @@ +#!/bin/bash + + +seqmagick convert --apply-function myfunctions.py:no_gaps \ + ../aligned.fasta empty.fasta + +seqmagick convert --apply-function myfunctions.py:hash_starts_numeric \ + ../aligned.fasta hashed.fasta diff --git a/examples/dewrapped.fasta b/examples/dewrapped.fasta new file mode 100644 index 0000000..1379bd7 --- /dev/null +++ b/examples/dewrapped.fasta @@ -0,0 +1,296 @@ +>Ref_1_700 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01BFKMA_3 +AGTAGCATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01BESA1_5 +AGTAGTACCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01BTN0L_2 +AGTAGTATCAACTCAACTGCTGCTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01CMG0I_4 
+AGTAGTATCAACTCAACTGCTGTTAAACGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01AKCZB_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGCCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01D21W0_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCGAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01EJ82H_6 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAAAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGCATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01DHBXA_5 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAAAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01EGZ72_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAAAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAGATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGCATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01CR15A_6 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAAA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01EOH19_1 
+AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAAA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACGACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01EEG9D_2 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---ATAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01D0V98_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCCCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01BVKYZ_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATACCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01EURR1_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGCGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01EC9GP_3 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTAAAAATTAATTGTACAAGGCCTAATAATAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01B9A6N_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTAAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGAGTATAAG +>FTWCYXX01AH60X_10 
+AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTAAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01CDGOW_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAACTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01A3035_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTATACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01BOFEE_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGCACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGACATAAG +>FTWCYXX01D527F_6 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGCACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01CEP6E_2 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAACAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01B9M38_11 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAAAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01ATQR4_2 
+AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAAAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01CEBF5_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGCATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01APBT8_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGAACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01B1JXC_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAAGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01B3EQU_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGAGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01CGWAF_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATCCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01B0APB_7 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATAGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01B6UVG_1 
+AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACACAATAGGGGATATAAG +>FTWCYXX01EDI76_2 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGAATATAAG +>FTWCYXX01C5103_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGAGTATAAG +>FTWCYXX01E5NP5_3 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAA +>FTWCYXX01BR0C5_2 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAGTAGGGGATATAAG +>FTWCYXX01AOC8L_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACGTAATAGGGGATATAAG +>FTWCYXX01BK1NQ_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGATATAATAGGGGATATAAG +>FTWCYXX01D30DM_1 
+AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGG---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01ANS4X_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCGGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01DLI1O_3 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAGT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01D3HR3_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGGGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01CWOZU_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACGATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01CLMY5_2 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAGCAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01EGLQ5_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAGTAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01C0BPR_1 
+AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGTCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01AYW4L_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACGAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01BWTVE_23 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTATAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01B15KV_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAGAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGAGGATATAAG +>FTWCYXX01E4U9P_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATGGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01BZ7KG_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATATTAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01DJM6W_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATGTAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01ERSZV_1 
+AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCGCAAATAACGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01EE0XE_2 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATGATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAGCAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATAGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01CO8A3_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAGAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01EEVDM_3 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGGAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01BJF3C_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAGGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01EN1H7_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGTAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01BXB8X_1 +AGTAATATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01D577V_1 
+GGTAGTATCAACTCAACTGCTGTTAAATGGCAGCCTAGCAGAAAAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGGCGAGCATTCTATGGA---------------AGTGACATAATAGGGGATATAAG +>FTWCYXX01CKIMU_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAACAATGCCAAAATTATAATAGTACAACTTGTCCAACCTGTGACAATTAAGTGTATCAGACCTAACAACAATACAAGAAAAAGTATACGT------ATAGGACCAGGACAAGCATTCTATGCAACA------------GGTGACATAATAGGGGATATAAG +>FTWCYXX01EFLKV_1 +AGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAAGA---GTAATAATTAGATCTCAAAATATCACAAATAATGCCAAAAATATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAGAGTATAAAT------ATAGGACCAGGACAAGCATTCTATGCAACA------------GGTGACATAATAGGGGATATAAG +>FTWCYXX01AJ9L4_11 +AGTAGTATCAACTCAATTGCTGTTGAATGGCAGTCTAGCAGAAAAAAAT---ATAACAATTAGATCTGAAAATATCACAAACAATGCCAAAATTATAATAGTACAACTTGTCCAACCTGTGACAATTAAGTGTATCAGACCTAACAACAATACAAGAAAAAGTATACGT------ATAGGACCAGGACAAGCATTCTATGCAACA------------GGTGACATAATAGGGGATATAAG +>FTWCYXX01ATH85_1 +AGTAGTATCAACTCAATTGCTGTTGAATGGCAGTCTAGCAGAAAAAAAT---ATAACAATTAGATCTGAAAATATCACAAACAATGCCAAAATTATAATAGTACAACTTGTCCAACCTGTGACAATTAAGTGTATCAGACCTAACAACAATACAAGAAAATGTATACGT------ATAGGACCAGGACAAGCATTCTATGCAACA------------GGTGACATAATAGGGGATATAAG +>A1.AU.x.PS1044_Day0.DQ676872 +-GTAGTAACAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAAATAGG---ACAATGATTAGATCTAAAAACATTACAGACAATAAAGAAAACATAATAGTACAGCTTACCGAGCCTGTAAACATTACTTGTATCAGACCTAACAACAATACAAGAAAAAGTGTACGT------ATAGGACCAGGACAAACATTCTATGCA------------ACAGGTGAAATAATAGGGGATATTAG +>A1.KE.94.Q23_17.AF004885 +-GTAGTATCAACTCAATTGCTGTTGAATGGCAGTCTAGCAGAAAAAAAT---ATAACAATTAGATCTGAAAATATCACAAACAATGCCAAAATTATAATAGTACAACTTGTCCAACCTGTGACAATTAAGTGTATCAGACCTAACAACAATACAAGAAAAAGTATACGT------ATAGGACCAGGACAAGCATTCTATGCA------------ACAGGTGACATAATAGGGGATATAAG +>A1.RW.92.92RW008.AB253421 
+-GTAGTGTCAACTCAACTGTTGTTAAATGGCAGTCTAGCAAAAGAAAAG---GTAATAATTAGATCTGAAAATATCACAAACAATGTCAAAACCATAATAGTACAACTTGTCAAGCCTGTGAAAATTAATTGTACCAGACCTAACAACAACACAAGAACAAGTATACGT------ATAGGACCAGGACAATCATTCCATGCA------------ACAGGTGACATAATAGGGGATATCAG +>A1.UG.92.92UG037.AB253429 +-GTAGTATCAACTCAACTGCTGTTAAATGGCAGTTTAGCAGAAGGAAAG---GTAATGATTAGATCTGAAAATATCACAAACAATGTCAAAAACATAATAGTACAACTTAACGAGTCTGTAACAATTAATTGTACCAGACCTAACAATAATACAAGAAGAAGTGTACGT------ATAGGACCAGGACAAACATTCTATGCA------------ACAGGTGATATAATAGGGGATATAAG +>A2.CD.97.97CDKS10.AF286241 +-GTAGCATCAACTCAACTGCTGTTGAATGGCAGTCTAGCAGAAAAAGAG---GTGATGATTAGATCTGAAAATATTACAAACAATGCCAAAAACATAATAGTACAGTTTAATGAATCGGTACCAATTACTTGTATCAGACCCAACAACAATACGAGAAAAGGTATACCT------ATTGGACCAGGACAAGTCTTCTAT---------------ACAAGTGACATAATAGGGGATATAAG +>A2.CD.97.97CDKTB48.AF286238 +-GTAGCATCAACTCAACTGCTGCTGAATGGCAGTCTAGCAGAAGGAAAG---GTAATGATTAGATCTGAAAATATTACAGACAATGCCAAAAACATAATAGTACAGTTTAATAAACCTGTACCAATTAATTGTACCAGACCCAACAACAATACAAGAAAAAGTATACGC------TTTGGACCAGGACAGGCCTTCTATACA------------AATAATAACATAATAGGGGATATAAG +>A2.CY.94.94CY017_41.AF286237 +-GTAGCATCAACTCAACTGCTGTTGAATGGCAGTCTAGCAGAAGGAGGGAAAATAATGATTAGATCTGAAAATATTACAAACAATGCCAAAAACATAATAGTTCAGTTTACTAAGCCTGTACTAATTACTTGTATCAGACCCAACAACAATACAAGAAAAAGTATACGC------TTTGGACCAGGACAAGCCTTCTAT---------------ACAAATGAAATAATAGGGGACATAAG +>B.FR.83.HXB2_LAI_IIIB_BRU.K034 +-GTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAGAG---GTAGTAATTAGATCTGTCAATTTCACGGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGAATCCGTATCCAGAGAGGACCAGGGAGAGCATTTGTTACA------------ATAGGA---AAAATAGGAAATATGAG +>B.NL.00.671_00T36.AY423387 +-GTGGTGTCAACCCAATTACTGTTAAATGGCAGTCTAGCAGAAGAAGAT---GTAGTAATTAGATCTAAAAATTTCACAGACAATACCAAAACCATAATAGTACAGCTGAAGGAATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGTATACAT------ATAGGACCAGGGAGAGCATTTTATGCA------------ACAGGAGAAATAATAGGAGATATAAG +>B.TH.90.BK132.AY173951 
+-GTAGTATCAACTCAACTGCTGCTAAATGGCAGTCTAGCAGAAGAAGAG---GTAGTAATTAGATCTGAAAATTTTACAGACAATGCTAAAACCATAATAGTACAGCTGAAAGAACCTGTAGAAATTAATTGTACAAGACCTAACAACTATACAAGGAAAAGAATAACT------ATGGGACCAGGGAGAGTATATTATACA------------ACAGGAGAAATAATAGGAGATATAAG +>B.US.98.1058_11.AY331295 +-GTAGTATCTACTCAATTGCTGTTAAATGGCAGTCTAGCAGAAGAAGAT---GTAATAATCAGATCTGACAATTTCACGGACAATGCTAAAACCATAATAGTACAACTGAACGAAACTGTAGATATTCACTGTATAAGGCCCAACAATAATACAAGAAAACGTATAACT------ATGGGACCAGGGAAAGTATATTATACA------------ACAGGACAAATAATAGGAGATATAAG +>B.US.98.15384_1.DQ853463 +-GTAGTGTCAACTCAACTGCTGTTGAATGGCAGTCTAGCAGAAGAAGAG---ATAGTAATTAGATCACAAAATTTTACGGACAATGTTAAATCCATAATAGTACAGCTGAATGAAACTGTAAAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAGCTATACGT------ATAGGACGAGGGAGAGCAATTTATGCA------------ACAGATAGAATAATAGGAGATATAAG +>C.BR.92.BR025_d.U52953 +-GTAGTGTCAACTCAACTACTGTTAAATGGTAGTCTAGCAGAAGAAGAG---ATAATAATTAGATCTAAAAATCTAACAGACAATGTCAAAACAATAATAGTACATCTTAACGAATCTGTAGAGATTAATTGTACGAGACCCAACAATAATACAAGAAAAAGTATAAGG------ATAGGACCAGGACAAGCATTCTATGCA------------ACAGGAGAAATAATAGGAGATATAAG +>C.ET.86.ETH2220.U46016 +-GTGGTATCAACTCAACTATTGTTAAATGGTAGTATAGCAGAAGGTGAG---ACAATAATTAGATTTGAAAATCTGACAAACAATGCCAAAATAATAATAGTACAGCTTAATGAATCTGTAGAAATTACTTGTACGAGACCCAGCAATAATACAAGAGAAAGTATAAGG------ATAGGACCAGGACAAACATTCTATGCA------------ACAGGAGACATAATAGGAGATATAAG +>C.IN.95.95IN21068.AF067155 +-GTGGTATCAACTCAACTACTGTTAAATGGTAGCCTAGCAGAAGGAGGG---ATAATAATTAGATCTGAAAATCTGACAAACAATGTCAAAACAATAATAGTACATCTTAATCAACCTGTAGAAATTATGTGTACAAGACCCGACAATAATACAAGAAAAAGTATAAGG------ATAGGACCAGGACAAACATTCTATGCA------------ACAGGAGACATAATAGGAGACATAAG +>C.ZA.04.SK164B1.AY772699 +-GTGGTATCAACTCAACTACTGTTAAATGGTAGCCTAGCAGAAGGGGAG---ATAATAATTAGATCTGAAAATCTGACAGACAATGCCAAAACAATAATAGTACATCTTAATAAATCTGTAGCAATTGTGTGTACAAGACCCAACAATAATACAAGGAAAAGTATAAGG------ATAGGACCAGGACAAGTATTCTAT---------------ACAAATGAAATAATAGGAAACATAAG +>D.CD.83.ELI.K03454 
+-GTGGTGTCAACTCAACTGCTGTTGAATGGCAGTCTAGCAGAAGAAGAG---GTCATAATTAGATCCGAAAATCTCACAAACAATGCTAAAAACATAATAGCACATCTTAATGAATCTGTAAAAATTACCTGTGCAAGGCCCTATCAAAATACAAGACAAAGAACACCT------ATAGGACTAGGGCAATCACTCTATACT------------ACAAGATCA---AGATCAATAATAGG +>D.CM.01.01CM_4412HAL.AY371157 +-GTGGTATCAACTCAATTGCTGTTGAATGGCAGTCTAGCAGAAGAG------GTCATGGTTAGATCTGAAAATCTCACAGACAATGCTAAAAACATAATAGTCCAGCTTAATAATACTATAAACATTACTTGTGTAAGGCCGAACAGCAATACAAGAAAAAGTATAAAT------CTAGGACCAGGGCAGGCATTCTATGCAACATAT------GCAACAAATATAATAGGAAACATAAG +>D.TZ.01.A280.AY253311 +-GTGGTGTCAACTCAACTGCTGTTGAATGGCAGTTTAGCAGGAGAAGAG---ATAATAATTAGATCTGAAAATCTCACAAACAATGTTAAAACCATAATAGTACAGTTAAATGAGACTGTAAAAATTAATTGTACAAGGCCTAATAACAATACAAGAAAAGGTATACGT------ATAGGACCAGGGCAAACATTCTTTACA---------------GCAGAGGTAACAGGAGATATAAG +>D.UG.94.94UG114.U88824 +-GTAGTGTCAACTCAACTGTTGTTGAATGGCAGTCTAGCAGAAGAAGAA---ATAATAATTAGATCTGAAAATCTCACAAACAATGCTAAAATCATAATAGTACAGCTTAATGAGTCTGTACCAATTAATTGCATAAGGCCCTATAACAATACAAGACAAAGTACACGT------ATAGGACCAGGGCAAGCACTCTTTACA------------ACA---AAAGTAATAGGAGATATAAG +>F1.BE.93.VI850.AF077336 +-GTGGTATCAACTCAATTGTTGTTAAATGGCAGCCTAGCAGAAGAAGGT---ATAGTAATCAGATCTCAAAATATCTCAAATAATGCAAAAACCATAATAGTACACCTTAATGAATCTGTACAGATTAATTGTACAAGACCCAACAACAATACAAGGAAAGGTATACAT------TTAGGACCAGGACAAACATTCTATGCA------------ACAGGAGCAATAATAGGAGACATAAG +>F1.BR.93.93BR020_1.AF005494 +-GTGGTATCCACTCAATTGTTGTTAAATGGCAGCCTAGCAGAAGGAGAG---ATAGTAATCAGATCTCAAAATATCTCAGATAATGCAAAAACCATAATAGTGCACCTTAATGAATCTGTACAGATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGAATATCT------TTAGGACCAGGACGAGTATTTTATACA------------ACAGGAGAAATAATAGGAGACATCAG +>F1.FI.93.FIN9363.AF075703 +-GTGGTATCAACTCAATTGTTGTTAAATGGCAGCCTATCAGAAGGAGGT---ATAATAATCAGATCTCAAAATCTCTCAGATAATGCAAAAACTATAATAGTACACCTTAATGAATCTGTACAGATCAATTGTACAAGACCCAACAACAATACAAGAAAAAGTATACGT------ATAGGACCAGGACAATCATTTTATGCA------------ACAGGAGAAATAATAGGAGACATAAG +>F1.FR.96.MP411.AJ249238 
+-GTGGTATCAACTCAATTGCTGTTAAATGGCAGCCTGGCAGAAGAAGAT---ATAATAATCAGATCTCAAAATATCTCAGATAATGCAAAAACCATAATAGTACACCTTAATGAAAGTGTACAGATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGTATACAT------TTAGGACCAGGACAAGCATTCTATGCA------------ACAGGTGATATAATAGGAGATATAAA +>F2.CM.02.02CM_0016BBY.AY371158 +-GTGGTATCAACTCAACTACTGTTAAATGGCAGCTTAGCAGAAAAAAAT---ATAATAATTAGATCTGAAAATATCACAGATAATGCAAAAACCATAATAGTACAGTTTAATGAATCAGTAAAAATTAACTGTACAAGACCCAACAACAATACAAGAAAAAGTATACGT------ATAGGACCAGGACAAGTATTTTATGCA------------ACAGGTGAGATAATAGGAGACATAAG +>F2.CM.95.MP255.AJ249236 +-GTGGTATCAACTCAACTACTGCTAAATGGCAGCCTAGCACAAGAAGAT---ATAATAATTAGATCTAAAAATATCACAGATAATACAAAAAACATAATAGTACAGTTTAATAGATCTGTAATAATTGATTGTAGAAGACCCAACAACAATACAAGAAAAGGGATACGT------ATAGGACCAGGACAAACATTCTTTGCA------------ACAGGTGAAATAATAGGGGATATAAG +>F2.CM.95.MP257.AJ249237 +-GTGGTATCAACTCAACTACTGTTAAATGGCAGCCTAGCAGAAGAAAAG---ATGATAATTAGATCTGAAAATATCTCAGATAATACAAAAACCATAATAGTACAGTTTAAAAATCCTGTAAAAATTAATTGTACAAGACCCAACAACAATACAAGAAGAAGTATACAT------ATAGGACCAGGACGAGCATTCTATGCA------------ACAGGTGAGATAATAGGAGATACAAG +>F2.CM.97.CM53657.AF377956 +-GTAGTGTCAACTCAACTACTGTTAAATGGCAGCCTAGCAGAAGGAGAT---ATAGTAATTAGATCTGAAAATATCTCAGATAATGCAAAAACCATAATAGTACAGTTTAATAGATCTGTAGCAATTAACTGTACAAGACCCACCAACATTACAAGAAGAAGTATGCGT------ATAGGACCAGGACGAGTATTTTATGCA------------ACAGGTACCGTACTAGGAGATATAAG +>G.BE.96.DRCBL.AF084936 +-GTAGTATCAACTCAACTACTGCTGAATGGCAGTTTAGCAGAAAAAGAT---ATCATAATTAGTTCTGAAAATATCTCAGACAATGCCAAAGTCATAATAGTGCACCTTAATAGGTCTGTAGAAATTAATTGTACCAGACCCAACAACAATACAAGAAGAAGTGTAGCA------ATTGGACCAGGACAAGCATTCTATACA------------ACAGGAGAAGTAATAGGAGACATAAG +>G.KE.93.HH8793_12_1.AF061641 +-GTGGTATCAACTCAACTACTGCTGAATGGCAGTTTAGCAGAAGGAGAA---ATAATAATTAAATCAGAAAACATCACAGACAATACCAAAGTCATAATAGTGCAGCTTAATGAAACTGTAGAAATTACGTGTGTCAGACCCAACAACAATACAAGAAAAAGTATACAC------CTCGGGCCAGGACAAGCGCTCTATGCA------------ACAGGGGACATAATAGGAAATATAAG +>G.NG.92.92NG083.U88826 
+-GTGGTATCAACTCAACTACTGCTGAATGGCAGTTTAGCAGAAGAAGAT---ATAAGAATTAGATCTGAAAATTTCACAGACAATACCAAAGTCATAATAGTGCAGCTTAATAATAGTATAGAAATTAATTGTATCAGACCCAATAACAATACAAGAAAAAGTATACCA------ATCGGACCAGGACAAGCGTTCTATGCA------------ACAGGTGATATAATAGGAGACATAAG +>G.PT.x.PT2695.AY612637 +-GTGGTATCAACTCAATTGCTGCTGAACGGCAGTTTAGCAGAAGGGGAA---ATAATGATTAGATCTGAAAACATCACGAACAATGCCAAAAACATAATAGTACAGCTCAATGAAACTGTACCCATTACGTGTGCCAGACCCAGCAATAATACAAGAAAAAGTATAAGA------TTTGGACCAGGACAAGCGTTCTATGCA------------ACAGATGCCATAATAGGAGATATAAG +>H.BE.93.VI991.AF190127 +-GTGGTATCAACTCAACTGTTGTTAAATGGAAGCCTAGCAGAAGTGGAGGAGGTAATAATTAGATCTAAAAATATTACAGATAATACCAAAAACATAATAGTACAGTTAAATGAACCTGTACAAATTAACTGTACCAGAACAGGCAATAATACGAGAAAAAGTATACGT------ATAGGGCCAGGACAAGCATTCTATGCA------------ACAGGTGACATCATAGGAGATATAAG +>H.BE.93.VI997.AF190128 +-GTGGTATCAACTCAACTGCTATTAAATGGAAGCCTAGCAGAAGGACAG---GTCATAATTAGATCTAAAAATATCTCAGACAATACCAAAAACATAATAGTACAGCTTGATAGTCCTATAGAAATTACCTGTACCAGACCTAACAATAATACAAGAAAAGGTATACAT------TTCGGGCCAGGGCAAGCATTCTATGCA------------ACAGGTGATATCATAGGAAACATAAG +>H.CF.90.056.AF005496 +-GTGGTATCAACTCAACTGCTATTAAATGGAAGCCTAGCAGAAGAACAG---ATCATAATTAGAACTAAAAATATCTCAGACAATACCAAAAACATAATAGTACAGCTTAAGACACCAGTAAACATTACATGTACCAGGCCTAACAATAATACGAGAACAAGTATACAT------TTAGGGCCAGGACGAGCATTCTATGCA------------ACAGGTGACATCATAGGAGATATAAG +>J.CD.97.J_97DC_KTB147.EF614151 +-GTGGTGTCAACTCAACTACTGCTAAATGGTAGTATAGCGGAAAAAGAA---GTAATAATTAAATCCAAAAATATCTCAGACAATGCTAAGACCATAATAGTACAGCTTAACCAAACTGTAGAAATCAATTGTACCAGACCCGCCAACAATACAAGAAAGGGTATACCC------ATAGGACCAGGGCAAGTGCTATATGCA------------ACAGGTGCAGTAATAGGAAACATAAG +>J.SE.93.SE7887.AF082394 +-GTGGTATCAACTCAACTACTGCTAAATGGCAGTATAGCAGAAGGAGAC---ATAATAATTAGATCTGAAAACATCTCAGACAATGCCAAAAACATAATAGTACAACTTAATAAAACTGTAGAAATTGTGTGTTACAGACCTAATAACAATACAAGGAAAGGTATACAC------ATGGGACCAGGACAAGTGCTCTACGCA------------ACAGGAGAAATAATAGGAAATATAAG +>J.SE.94.SE7022.AF082395 
+-GTGGTATCAACTCAACTACTGCTAAATGGCAGTGTAGCAGAAGGAGAC---ATAATAATTAGATCTGAAAATATCTCAGACAATGCTAAAAACATAATAGTACAACTTAATGACACTGTAGAAATTGTGTGTACCAGACCTAATAACAATACAAGAAAAGGTATACAC------ATGGGACCAGGACAAGTGCTCTACGCA------------ACAGGGGAAATAATAGGAGATATAAG +>K.CD.97.EQTB11C.AJ249235 +-GTGGTATCAACTCAATTGCTATTAAATGGCAGCCTAGCAGAAGAAGAG---ATAATTATTAGGTCTGAAGATATTACAAAGAATACAAAAAACATAATAGTACAGCTTAATGAAGCTGTAGAAATTAATTGTACAAGGCCAAGCAACAATACAAGAAAAAGTATACAT------ATAGGACCAGGAAGAGCATTCTATGCA------------ACAGGTGACATAATAGGAGATATAAG +>K.CM.96.MP535.AJ249239 +-GTGGTATCAACTCAACTGCTGTTAAATGGCAGCCTAGCAGAAGAAGAG---ATAATAATTAGGTCTGAAAATATTACAGATAATACAAAAAACATAATAGTACAGCTTAATGAAACTGTACAAATTAATTGTACAAGGCCAAACAACAATACAAGAAAAAGTATACAT------ATGGGACCAGGAAAAGCATTCTATACA------------ACAGGTGATATAATAGGAGATATAAG +>01_AE.TH.90.CM240.U54771 +-GTGGTATCAACTCAATTGCTGTTAAATGGCAGTCTAGCAGAAGAAGAG---ATAATAATCAGATCTGAAGATCTCACAAACAATGCCAAAACCATAATAGTGCACCTTAATAAATCTGTAGAAATCAATTGTACCAGACCCTCCAACAATACAAGAACAAGTATAACT------ATAGGACCAGGACGAGTATTCTATAGA------------ACAGGAGATATAATAGGAAATATAAG +>01_AE.TH.93.93TH051.AB220944 +-GTGGTATCAACACAATTGCTGTTAAATGGCAGTCTAGCAGAAGAAGAA---ATAATAATCAGATCTGAAAATCTCACAAACAATGCCAAAACCATAATAGTGCACCTTAATAAATCTGTAGAAATCAATTGTACCAGACCCTCCAACAATATAAGAACAAGTATAAGA------ATAGGACCAGGACGAGTATTCTATAAA------------ACAGGAGCTATAACAGGAGATATAAG +>02_AG.CM.99.pBD6_15.AY271690 +-GTAGTGTCAACTCAACTGCTACTAAATGGCAGTCTAGCAGAAGAAGAG---GTAATGATTAGATCTGAAAATATCACAAACAATGCCAAAACCATAATAGTACAGCTGGTTACTCCTGTAAAAATTAATTGTACCAGACCTGGCAATCCTATAAGAAAAAGGGTAGGT------ATAGGACCAGGACAAGCATTCCATGCA------------ACAGGTAATATAATAGGAGACATAAG +>02_AG.NG.x.IBNG.L39106 +-GTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGGAGAA---GTAGTGATCAGATCTGAAAATATCACAAACAATGCCAAAACCATAATAGTACAGTTGGCTAATCCTGTAAAAATTAATTGTACCAGACCTAACAACAATACAAGAAAAGGTGTACAT------ATAGGACCAGGGCAAGCATTCTATGCA------------ACAGGTGACATAATAGGGGATATAAG +>03_AB.BY.00.98BY10443.AF414006 
+-GTAGTGTCAACTCAACTGCTGTTAAATGGTAGCCTAGCAGAAAAAGAG---GTAGTAATTAGATCTGTCAATTTCACGGACAATACTAAAACCATAATAGTACAGCTGAAAGAACCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAGGTATTCAT------ATGGGACCAGGGAGAGCATTTTATGCA------------ACAGGAGACATAATAGGAGACATAAG +>04_cpx.GR.91.97PVCH.AF119820 +-GTGGTATCAACTCAATTGCTGTTAAATGGAAGCTTAGCAACAGGAGGG---GTAGTAATTAGATCTAAAAATTTCACAGACAATCCCAAAAATATAATAGTACAGCTTGACAAGGCTGTAAAAATTAATTGTACCGGCCTTAACAACAATACAGGAGGAAGTGAACGTATCGGTATAGGGCCAGGACACACATGGTATGCA------------ACAGGTAACATAGTAGGAGATATAAG +>04_cpx.GR.97.97PVMY.AF119819 +-GTAGTATCAACTCAATTGCTGTTAAATGGAAGCTTATCAACAGAAGGG---GTAGTACTTAGATCTAAAAACTTCACAGACAATACCAAAAATATAATAGTACAGCTTGCAGAGGCTGTAAAAATTAATTGTACCAGACCTAACAACAATACAAGAAAAGGTGTACAT------ATAGGACCAGGAAAAACATGGTTTGCA------------ACAGGGGAAGTAATAGGAGACATAAG +>05_DF.BE.x.VI1310.AF193253 +-GTGGTATCAACTCAGTTGCTGTTAAATGGCAGCCTAGCAAAAGAAGGT---ATAATAATCAGATCTCAAAATATCTCAGATAATGCAAAAAACATAATAGTACACCTTAATGAATCTGTACATATTAATTGTACAAGGCCCAACAACAATACAAGAAAAAGTATACAT------TTAGGACCAGGACAAGCATTCTATGCA------------ACAGGTGACATAATAGGAGACATAAG +>06_cpx.EE.01.EE0359.AY535659 +-GTGGTATCAACTCGATTACTGCTAAATGGTAGTTTAGCAGAAGAAAGT---ATAATAATTAGATCTGAAAACATCACAGACAATGCCAAAAACATAATAGTGCAGCTTAATAAATCTGTAGAAATTATATGTACCAGACCCTATAATAATACAAGAAAAAGTATACAC------TTTGGACCAGGGCAAGCGCTTTTTGCA------------ACAGGTGAAATAATAGGAGATATAAG +>06_cpx.GH.03.03GH173_06.AB2868 +-GTGGTGTCAACTCAATTACTGCTGAATGGTAGTTTAGCAGAAAAAGAA---ATAATAATTAGATCTGAAAATCTCACAGACAATACCAAAAACATAATAGTGCAGCTTAATAGCACTGTACAAATTACATGCAATAGACCCAATAACAATACAAGAAGAGGTATACAC------CTTGGACCAGGGCAAGTGTTCTTTGCA------------ACAGGTGACATAATAGGAGATATAAG +>06_cpx.ML.95.95ML127.AJ288982 +-GTGGTATCAACTCAGTTACTGCTGAATGGCAGTTTAGCAGAAGAAGAA---ATAATAATTAAATCTAAAAACCTCACAGACAATACCAAAATCATAATAGTGCAGCTTAATAAATCTGTAGAAATTAGTTGTTCCAGACCCAATAACAATACAAGAAAAAGTATACAC------ATTGGACCAGGGCAAGCGTTCTATGCA------------ACAGGTGAAATAATAGGAAATATAAG +>06_cpx.ML.95.95ML84.AJ245481 
+-GTGGTATCAACTCAGTTACTGCTGAATGGCAGTTTAGCAGAAGATGAA---ATAATAATTAAATCTGAAAACCACACAAACAATGCCAAGATCATAATAGTGCAGCTTAATAAAACTGTACAAATTAGGTGTACCAGACCCAGTAACAATACAAGGAAAAGTATACCC------CTTGGACCAGGGCAAGCGTTCTATGCA------------ACAGGTGACATAATAGGAGATATAAG +>06_cpx.RU.05.04RU001.DQ400856 +-GTAGTATCAACTCAATTACTGCTAAATGGCAGTTTAGCAGAAGAAAAT---GTAATAATTAGATCTGAAAACATCACAGACAATACCAAAACCATAATAGTGCATCTTAATAAATCTGTAGAAATTACATGTACCAGACCCAATAATAATACAAGAAAAGGTATACAC------TTTGGACCAGGGCAAGTGTTTTTTGCA------------ACAGGTGACATAATAGGAGATATAAG +>06_cpx.SN.97.97SE1078.AJ288981 +-GTGGTATCAACTCAATTACTACTGAATGGCAGTTTAGCAGAAGAGGAA---ATAATAATTAAAACTGAAAACCTCACAGACAATAGCAAGAACATAATAGTACAGCTTAATAAATCTATAGAAATTAAGTGTACCAGACCCAATAACAATACAAGAAAAAGTATATCC------TTTGCACCAGGGCAAGCGTTCTATGCA------------ACAGGTGACATAATAGGAGATATAAG +>07_BC.CN.97.97CN001.AF286226 +-GTGGTATCAACTCAACTACTTGTTAATGGTAGCCTAGCAGAAGGAGAA---ATAATAATTAGGTCTGAAAATCTGACAAACAATGTCAAAACAATAATAGTACATCTTAATCAATCTGTAGAAATTGTATGTACAAGACCCGGCAATAATACAAGAAAAAGTATAAGG------ATAGGACCAGGACAAACATTCTATGCA------------ACAGGAGACATAATAGGAGACATAAG +>08_BC.CN.98.98CN006.AF286229 +-GTGGTATCAACTCAACTACTGTTAAATGGTAGCCTAGCAGAAGGAGAG---ATAATAATTAGATCTGAAAATCTGACAAACAATGTCAAAACAATAATAGTACATCTTAATCAATCTGTAGAAATTGTATGTACAAGACCCAACAATAATACAAGAAAGAGTATAAGG------ATAGGACCAGGACAAACATTCTATGCA------------ACAGGAGACATCATAGGAGACATAAG +>09_cpx.CI.00.00IC_10092.AJ8665 +-GTAGTATCAACTCAATTACTGCTGAATGGCAGCTTAGCAGAAGGAGAT---ATAATGATTAGATCTGAGAATATCACAGACAATGCCAAAGACATAATAGTACAATTGAAGGAGCCTGTAAATATTACTTGTATCAGACCTAGCAACAATACAAGGGAAAGTGTACGT------ATAGGACCAGGACAAACTTTCTTTGCA------------ACAGGTGATATAATAGGAGACATAAG +>10_CD.TZ.96.96TZ_BF061.AF28954 +-GTAGTGTCAACTCAACTGTTATTGAATGGCAGTCTAGCAGAAGAAGAG---ATAATAATTAGATCTGAAAATCTCACAGATAATGCTAAAAATATAATAGTGCAGCTTAATGAGTCTGTAACAATTAATTGCGTGAGACCCAACAATAATACAAGAAGAAGTATAAGG------ATAGGACCAGGACAAACATTCTATGCA------------ACA---GACATAATAGGAAACATAAG +>11_cpx.CM.95.1816.AF492624 
+-GTAGTATCAACTCAATTACTGTTAAATGGCAGTCTAGCAGAAGGAGAG---GTAAGAATTAGATCTAAAAACATCACAGACAATGCTAAAACCATAATAGTACAATTTGACAGTCCTGTACACATTAATTGTACCAGACCTAACAACAATACAAGAAAAAGTATACGG------ATAGGACCAGGACAAGCTTTCTATGCA------------ACAGGTGCTATAATAGGGGATATAAG +>12_BF.AR.97.A32879.AF408629 +-GTGGTATCAACTCAATTGTTGTTAAATGGCAGCCTAGCAGAAGAAGAG---ATAATAATCAGCTCTCAAAATATCTCAGATAATGCAAAAAACATAATAGTACACCTTAAGGAACCTGTATGGATTAATTGTACAAGACCCAACAATAATACAAGAAAAAGTATACAT------TTAGGACCAGGACAAGCATTTTATGCA------------ACAGGAGATATAATAGGAGACATCAG +>12_BF.AR.97.A32989.AF408630 +-GTGGTATCAACTCAATTGTTGTTAAATGGCAGTCTACCAGAAGGAGAG---ATAATAATCAGATCTCAAAATATTTCACATAATACAAAAACCATAATAGTACACCTTAATGAATCTGTACAGATTACTTGTATAAGACCCAACAATAATACAAGAAAAAGTATACAT------ATAGGACCAGGAAGAGCATTTTATGCA------------ACAGGAGACATAATAGGAGACATGAG +>12_BF.AR.99.ARMA159.AF385936 +-GTGGTATCAACTCAATTGTTGTTAAATGGCAGCCTAGCAGAAGAAGAT---ATAATAATTAGATCTCAAAACATCTCAGATAATACAAAAACCATAATAGTTCATCTTAATGAATCTGTACAGATTAACTGTACAAGACCCAACAATAATACAAGAAAAAGTATACAG------TTAGGACCAGGACGGGCATTTTATGCA------------ACAGGAGACATAATAGGGGACATTAG +>13_cpx.CM.02.02CM_A1394.DQ8453 +-GTAGTATCAACTCAACTACTGTTAAATGGCAGCCTAGCAGAAGGAGGAGAGATAAAGATTAGATCTGAAAACATCTCAAACAATGCTAAAACCATAATAGTACAGCTTAAAGATCCTGTGAGAATTAATTGTACTAGACCTAACAACAATACAAGAAAAAGTATAAGG------ATAGGACCAGGACAAACATTCTATGCA------------ACAGGTGATATAATAGGGGATATAAG +>14_BG.DE.01.9196_01.AY882421 +-GTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAAAAGAG---GTAATAATTAGGTCTGAAAATTTCACGGACAATGCTAAAACCATAATAGTACAGCTAAATAAATCTGTAGTAATTAATTGYTCAAGACCCAACAACAATACAAGAAAAAGTATACCT------ATAGGACCAGGGAGAGCATTTTATGCA------------ACAGGACAAATAATAGGAGACATAAG +>15_01B.TH.99.99TH_MU2079.AF516 +-GTAGTATCAACTCAACTGCTGTTAAATGGCAGTTTAGCAGAAGAAGAG---GTAATAATCAGATCTAGCAACTTCACGAACAATGCTAAAATCATAATAGTACAGCTAAATGAATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGTATACAT------ATAGGTCCAGGGAGATCATGGTATACA------------ACAGGAGAAATAATAGGAGATATAAG +>16_A2D.KR.97.97KR004.AF286239 
+-GTGGCATCTACTCAATTGCTGCTGAATGGCAGTCTAGCAGAAGGAAATAAAACAATAATTAGATCTGCAAATATTACAGACAATACTAAAAACATAATAGTACAGTTTACTAAGCCTGTACAAATTAATTGTACCAGACCCGACGTCGGTCAAAGAAGGAGTGTACGC------ATTGGACCAGGACGAGCCTTCTATACAAGGCAAACATATACAAGGCAAGCAAAAGGGGATATAAG +>18_cpx.CU.99.CU76.AY586540 +-GTGGTATCAACTCAATTGCTGTTAAATGGAAGTCTAGCAACAGAAAAG---GTAATGATCAGATCTGAAAATATCACAGACAATACCAAGAACATAATAGTACAGCTTACGACGCCTGTAAACATTACCTGTACCAGACCCAGCAATAACACGAGAAGAAGTATACAT------ATAGGACCAGGACAAGCATTCTATGCA------------ACAGGTGACATAATAGGAGATATAAG +>19_cpx.CU.99.CU29.AY588971 +-GTAGTGTCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAGGAG---ATAGTAATTAGTTCTGAAAATATCACAAACAATGCCAAAATCATAATAGTACAGCTTAAGAAGCCTGTACAAATTAATTGTACCCGACCTAACAACAATACAAGACAAGGTGTACAT------ATAGGACCAGGAGCAGTATTCTATAGA------------ACAGGTGATATAATAGGGGATATAAG +>20_BG.ES.99.R77.AY586544 +-GTGGTATCAACTCAATTACTGCTGAATGGCAGCTTAGCAGAAAAGGAA---ATAATGATTAGATCTGAAAACCTCACAAACAATGCCAAAGTCATAATAGTGCAGCTTAATAAAAGTATAGAAATTAATTGTACCAGACCCAACAACAATACAAGAAAGAGTATAAGA------TTTGGACCAGGACAAGCGTTCTATGCA------------ACAGGTGACATAATAGGAGATATAAG +>21_A2D.KE.91.KNH1254.AY945737 +-GTGGTGTCAACTCAACTGCTGTTGAATGGCAGTTTAGCAGAGGAAGAG---ATCATAATTAGAACTGAAAATATCAGCAGGAATGAAAAAAGCATAATAGTACAGCTTAATGAGTCTGTAACAATTAATTGTACAAGGCCCAATAACAATACAAGAAGAGGTATACAT------ATAGGACCAGGGCAAGCAATATATGCA------------ACCGGTAAGATAATAGGAAACATAAG +>23_BG.CU.03.CB118.AY900571 +-GTGGTATCAACTCAATTACTGCTGAATGGCAGCTTAGCAGAAAAGGAA---ATAATGATTAGATCTGAAAACCTCACAAACAATGCCAAAGTTATAATAGTGCAGCTTAATAGCAGTATAACAATTAATTGTACCAGACCCAACAACAATACAAGAAAGAGTATAACA------TTTGGACCAGGACAAGCATTCTATGCA------------ACAGGTGACATAATAGGAAATATAAG +>24_BG.CU.03.CB378.AY900574 +-GTGGTATCAACTCAATTACTGCTGAATGGCAGCTTAGCAGACAAGGGA---ATAATGATTAGATCTGAAAACCTCACAAACAATGCCAAAGTCATAATAGTGCAGTTTAATAAAAGTATAGATATTAATTGTACCAGACCTGGCAACAATACAAGAAAGAGTATAAGA------TTTGGACCAGGACAAGCGTTCTATGCA------------ACAGGTGCCATAATAGGCGATATAAG +>25_cpx.CM.01.101BA.DQ826726 
+-GTAGTGTCAACACAACTGTTGCTAAATGGCAGCTTAGCAGAAGAAGAG---ATAGTAATTAGATCTGAAAATATTTCAAACAATGCCAAAAGCATAATAGTGCATTTTAATGAGACTGTAACAATTAATTGTATCAGACCTAACAACAATACAAGAAAAAGTGTACGT------ATAGGGCCAGGACAAACATTCTATGCA------------ACAGGTGATATAATAGGAGATATAAG +>27_cpx.FR.04.04CD_FR_KZS.AM851 +-GTGGTATCAACTCACTTGCTGTTAAATGGCAGTCTAGCAGAAGAAGAG---ATAATAATTAAATCTGAAAATCTCACAGACAATACCAAAACCATAATAGTACACCTTAATGAATCTGTAAAAATTAATTGTACCAGACCCTTCAACAATACAAGGAAAAGTGTAAGT------CTAGGACCAGGATCCGTATGGTATAGA------------CCAGGAGAAGTAATAGGAGATATAAG +>28_BF.BR.99.BREPM12313.DQ08587 +-GTAGTGTCAACTCAACTGCTATTAAATGGCAGTCTGGCAGAAGAAGAA---ATAATAATTAGATCTGAAAATTTCACAAACAATGCTAAAATTATAATAGTACAGCTGAATAGGACTGTAGACATTAATTGTACAAGACCCGGCAACAATACAAGAAAAAGTATACAT------ATAGCACCGGGAAGAGCATTTTATGCA------------ACAGGAGACATAATAGGAGATATAAG +>29_BF.BR.99.BREPM11948.DQ08587 +-GTAGTGTCAACACAACTACTGTTAAATGGCAGTCTAGCAGAAGAAGAT---ATAGTAATTAGATCTGAAAATTTCACGAACAATGCTAAAACCATAATAGTACAGTTGAACGAAACTGTACGAATTAATTGTACAAGACCCAACAATAATACAAGAAGAAGTATGCAT------CTAGGACCAGGGAGAGCATTATTTCAT------------GCAACAGATATAATAGGAGATATAAG +>31_BC.BR.02.110PA.EF091932 +-GTAATATCAACTCAACTACTGTTAAATGGTAGTCTAGCAGAAGATGAG---ATAATAATTAGATCTAAAAATATAACAGACAATGTCCAAACAATAATAGTACAGCTTAAGGAACCTGTAGAAATTAATTGTACGAGACCCAACAATAATACAAGAGAAAGTATAAGA------ATAGGTCCAGGACAAACATTCTATGCA------------ACAGGAGACATAATAGGAGATATAAG +>33_01B.MY.05.05MYKL045_1.DQ366 +-GCGGTATCAACTCAATTGCTGTTAAATGGTAGTTTAGCAGAAGAAGAG---ATAATAATCAGATCTGCAAATCTCACAGACAATACCAAAACCATAATAGTGAATCTTAATACACCTGTAGAAATCAATTGTACCAGACCCTCTGAAAAGAAAAGAATGCGTATGACT------ACGGGACCAGGACACGTATTCTATAAA------------ACAAACGAAATAACAGGAGATATAAG +>34_01B.TH.99.OUR2478P.EF165541 +-GTGGTATCAACTCAATTGCTGTTAAATGGCAGTCTAGCAGAAGAAGAG---ATAATAATCAGATCTGAAAATATCACAAACAATGCAAAAACCATAATAGTGCACCTTAATAAATCTGTAGAAATCAATTGTACCAGACCCTCCAACAATACAAGAACAAGTGTACAT------ATAGGACCAGGACAAGCATTCTATAAA------------ATAGGAGACATAATAGGAGATATAAG +>35_AD.AF.05.05AF026.EF158043 
+-GTGGTATCAACTCAACTATTGTTGAATGGCAGTCTAGCAGAAGGAGAG---ATAATAATTAGATCTGAAAATATCTCAGACAATGCTAAAACCATAATAGTACAGCTTAATGAGACTGTAACAATTAATTGTACCAGACCTGGCAACAATACAAGAAAAAGTATACAT------ATAGGACCAGGACAAGCATTCTATGCA------------ACAGGTGATATAATAGGGGATATAAG +>36_cpx.CM.00.00CMNYU830.EF0879 +-GTGGTATCAACTCAATTACTGTTAAATGGCAGTCTAGCAAAAAATGAA---ACAATAATCAGATCTGAAAATATCACAAACAATGCCAAGACCATAATAATACAGTTGGTCAAGCCTGTAAATATTACTTGTGTCAGGCCTAACAACAATACAAGAAAAGGTACACCT------ATAGGACCAGGACAAGCTTTCTATGCA------------AGAAATGATGTAATAGGGGACATAAG +>37_cpx.CM.00.00CMNYU926.EF1165 +-GTGGTATCAACTCAATTGCTGTTAAATGGCAGTCTAGCAGAAGGAAAG---GTAATGATTAGATCTGAAAATATCACAAACAACGCCAAAACCATAATAGTACAATTTAATGAGACTGTAGAAATTAATTGTACCAGGTTTAACAACAACACAAGAAAAAGTGTACGC------ATAGGACCAGGACAAACATTCTTTGCA------------ACAGGTGACATAATAGGGGACATAAG +>42_BF.LU.03.luBF_05_03.EU17015 +-GTAGTGTCAACTCAACTGCTGTTAAATGGCAGCYTAGCAGAAGAAGAT---ATAATAATCAGATCTCAAAATATYTCAGACAATGCAAAAACCATAATAGTACACCTTAATGAATCTGTACAGATTAATTGCACGAGACCCAACAATAATACAAGAAAAAGTATACAT------ATAGCACCAGGACGAGCATTTTATGCA------------ACAGGAGACATAATAGGAGACATCAG diff --git a/examples/dewrapped.phy b/examples/dewrapped.phy new file mode 100644 index 0000000..f5f5c9a --- /dev/null +++ b/examples/dewrapped.phy @@ -0,0 +1,135 @@ + 134 500 +Ref.A1.KE.94.Q23_17.AF004885 GGGTTATGCAAGAATGTCAGCACAGTCCAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAATTGCTGTTG---AATGGCAGTCTAGCAGAAAAAA---ATATAACAATTAGATCTGAAAATATCACAAACAATGCCAAAATTATAATAGTACAACTTGTCCAACCTGTGACAATTAAGTGTATCAGACCTA------ACAACAATACAAGAAAAAGTATAC----------GTATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGACATAATAGGGGATATAAGACAAGCACATTGTAATGTCACTAGATCAAG-ATGGAATAAGACTTTACAAGAGGTAGCTGAAAAATTAAGAACATACT------------TTGGG------------AACAAAACAATAA---------T---ATTTGCTAAC-----------TCCTCAGGAGGGGATCTAGAGATCACAACACATAGT-TTT +Ref.A1.SE.94.SE7253.AF069670 
GGGCCATGCAACAATGTAAGCACAGTACAGTGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGAAA---AGATAATGATTAGATCTGAAAATATCTCAGACAATGCCAAAACCATAATAGTACAACTTACCGAGCCTGTAACAATTAATTGTACCAGACCTA------GCAACAATACAAGAACAAGTATAC----------GTATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGACATAACAGGGGATATAAGACAAGCACATTGTAATGTCAGTAGATCAAG-CTGGAATAAAACTCTACAAGACATAGTTACACAATTAAGAGTATACT------------GGAAT-A--------------GAACAATAA---------T---CTTTAATAGC-----------TCCTCAGGAGGGGATTTAGAAATTACAACACATAGT-TTT +Ref.A1.UG.85.U455.M62320 GGGCCATGCAGGAATGTCAGCACAGTACAATGTACACATGGAATCAAGCCAGTGGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAAGAG---AAATAAGGATTAGATCTGAAAATTTCACAAACAATGCAAAAACCATAATAGTACAGCTTGTCAATCCTGTAAAAATCAATTGTTCCAGACCTT---------ACAATACAAGAAAAAATATACGTAGGTATAG-TATAGGATCAGGACAAGCATTCTATGTAACA---------------GGTAAAATAATAGGGGATATAAGACAAGCACATTGTAATGTCAGTAGAAGGGA-CTGGAATAGAACTATACAACAGGTAGCTGAACAATTAAAGAAAAAGT------------TTAAT------------AACAAAACAATAA---------T---CTTTGCTAGC-----------TCCTCAGGAGGGGATATAGAAATTACAACACATAGC-TTT +Ref.A1.UG.92.92UG037.U51190 GGGCTATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAGGCCAGTAGTGTCAACTCAACTGCTGTTA---AATGGCAGTTTAGCAGAAGGAA---AGGTAATGATTAGATCTGAAAATATCACAAACAATGTCAAAAACATAATAGTACAACTTAACGAGTCTGTAACAATTAATTGTACCAGACCTA------ACAATAATACAAGAAGAAGTGTAC----------GTATAGGACCAGGACAAACATTCTATGCAACA---------------GGTGATATAATAGGGGATATAAGACAAGCACATTGTAATGTCAGTGGGTCACA-ATGGAATAAAACTTTACACCAGGTAGTTGAACAATTAAGAAAATATT------------GGAAC------------AACAATACAATAA---------T---CTTTAATAGC-----------TCCTCAGGAGGGGATTTAGAAATTACAACACATAGT-TTT +Ref.A2.CD.97.97CDKS10.AF286241 
GGACCATGCAATAATGTTAGCTCAGTACAATGTACACATGGAATTATGCCAGTAGCATCAACTCAACTGCTGTTG---AATGGCAGTCTAGCAGAAAAAG---AGGTGATGATTAGATCTGAAAATATTACAAACAATGCCAAAAACATAATAGTACAGTTTAATGAATCGGTACCAATTACTTGTATCAGACCCA------ACAACAATACGAGAAAAGGTATAC----------CTATTGGACCAGGACAAGTCTTCTATACAAGT---------------G---ACATAATAGGGGATATAAGACAAGCATATTGTAGTATCAACAAAACAAA-ATGGGATGCCTCTTTACAAAAGGTAGCTGAACAATTAAGAAAACACT------------TCCCT------------AATAAAACAATAA------------ATTTTACCAAA-----------CCCTCAGGAGGGGATCTAGAAATTACAACACATAGT-TTT +Ref.A2.CD.97.97CDKTB48.AF286238 GGGTCATGCAGTAATGTTAGCTCAGTACAATGTGCACATGGAATTAGGCCAGTAGCATCAACTCAACTGCTGCTG---AATGGCAGTCTAGCAGAAGGAA---AGGTAATGATTAGATCTGAAAATATTACAGACAATGCCAAAAACATAATAGTACAGTTTAATAAACCTGTACCAATTAATTGTACCAGACCCA------ACAACAATACAAGAAAAAGTATAC----------GCTTTGGACCAGGACAGGCCTTCTATACAAAT---------------AATAACATAATAGGGGATATAAGACAAGCACATTGTAATATCAGCATAACAGA-ATGGAATGCCACTCTAAAAAAGGTAGTTGAACAATTAAGAGAACACT------------TCCCT------------AATAAAACAATAA---------T---CTTTAATAGC-----------TCCTCAGGAGGGGACCTAGAAATTACAACACATAGT-TTT +Ref.A2.CY.94.94CY017_41.AF286237 GGGTCATGCAAGAATGTTAGCTCAGTACAATGTACACATGGAATTAAACCAGTAGCATCAACTCAACTGCTGTTG---AATGGCAGTCTAGCAGAAGGAGGGAAAATAATGATTAGATCTGAAAATATTACAAACAATGCCAAAAACATAATAGTTCAGTTTACTAAGCCTGTACTAATTACTTGTATCAGACCCA------ACAACAATACAAGAAAAAGTATAC----------GCTTTGGACCAGGACAAGCCTTCTATACAAAT---------------G---AAATAATAGGGGACATAAGACAAGCACATTGTAATATCAACAAAACATT-ATGGAATGACACTTTACAAAAGGTAGCTGAACAATTAAGAGAGAAAT------------TCCCT------------AAGAAAACCATAA---------T---CTTTACTAAC-----------TCCTCAGGAGGGGACCCAGAAATTACAACACTTAGT-TTT +Ref.B.FR.83.HXB2-LAI-IIIB-BRU.K03455 
GGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGAAG---AGGTAGTAATTAGATCTGTCAATTTCACGGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACA------ATACAAGAAAAAGAATCCGTATC---C-AGAGAGGACCAGGGAGAGCATTTGTTACAATA---------------G---GAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAA-ATGGAATAACACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAAT------------TTGGAA-AT---A-----ATAAAACAATAA---------T---CTTTAAGCAA-----------TCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGT-TTT +Ref.B.US.83.RF.M17451 GGACCATGTAAAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTGTCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGAAG---AGGTAGTAATTAGATCTGAAAATTTCACGGACAATGTTAAAACCATAATAGTACAGCTGAATGCATCTGTACAAATTAATTGTACAAGACCCA------ACAACAATACAAGAAAAAGTATAA----------CTAAGGGACCAGGGAGAGTAATTTATGCAACA---------------GGACAAATAATAGGAGATATAAGAAAAGCACATTGTAACCTTAGTAGAGCACA-ATGGAATAACACTTTAAAACAGGTAGTTACAAAATTAAGAGAACAAT------------TTGAC------------AATAAAACAATAG---------T---CTTTACGTCA-----------TCCTCAGGAGGGGACCCAGAAATTGTACTTCACAGT-TTT +Ref.B.US.86.JRFL.U63632 GGACCATGTAAAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGCTA---AATGGCAGTCTAGCAGAAGAAG---AGGTAGTAATTAGATCTGACAATTTCACGAACAATGCTAAAACCATAATAGTACAGCTGAAAGAATCTGTAGAAATTAATTGTACAAGACCCA------ACAACAATACAAGAAAAAGTATAC----------ATATAGGACCAGGGAGAGCATTTTATACTACA---------------GGAGAAATAATAGGAGATATAAGACAAGCACATTGTAACATTAGTAGAGCAAA-ATGGAATGACACTTTAAAACAGATAGTTATAAAATTAAGAGAACAAT------------TTGAG------------AATAAAACAATAG---------T---CTTTAATCAC-----------TCCTCAGGAGGGGACCCAGAAATTGTAATGCACAGT-TTT +Ref.B.US.90.WEAU160.U21135 
GGACCCTGTAAAAATGTCAGCACAGTACAATGTACACACGGAATTAGACCAGTAGTGTCAACCCAATTGCTGTTA---AATGGCAGTCTAGCAGAAGAAG---ACATAGTAATTAGATCTGAAAATTTCACGGACAATGCTAAAAACATAATAGTACAGCTGAATGTATCCATAGAAATTAATTGTACAAGACCCA------ACAACAATACAAGAAAAAAAATAA----------CTTTAGGACCAGGGAGAGTACTTTATACAACA---------------GGAGAAATAATAGGAGATATAAGACGAGCACATTGTAACCTTAGTAGAACAAG-TTGGAATAACACTTTAAAACAGATAGTTGAAAAATTAAGAG------AAATAAAACAATTTAAG------------AATAAAACAATAG---------T---TTTTAAACAA-----------TCCTCAGGAGGGGACCCAGAAATTGTAATGCACAGT-TTT +Ref.C.BR.92.BR025-d.U52953 GGACCATGCAATAATGTCAGCACAATACAGTGTACACATGGAACTAAGCCAGTAGTGTCAACTCAACTACTGTTA---AATGGTAGTCTAGCAGAAGAAG---AGATAATAATTAGATCTAAAAATCTAACAGACAATGTCAAAACAATAATAGTACATCTTAACGAATCTGTAGAGATTAATTGTACGAGACCCA------ACAATAATACAAGAAAAAGTATAA----------GGATAGGACCAGGACAAGCATTCTATGCAACA---------------GGAGAAATAATAGGAGATATAAGACAAGCACATTGTAACATTAGTAGAACAGC-ATGGAACAAAACTTTACAAGAGGTAGGTAAAAAATTAGCAGAGCACT------------TCCCT------------AATAAAGCAATAA---------A---ATTTGCAAAG-----------CACTCAGGAGGAGACCTAGAAATTACAACACATAGC-TTT +Ref.C.BW.96.96BW0502.AF110967 GGACCATGCAATAATGTCAGCTCAGTACAATGTGCACATGGAATTAAGCCAGTGGTATCAACTCAGCTACTGTTA---AATGGTAGCGTAGCAAAAGGAG---AGATAATAATTAGATCTGAAAATCTGACAAACAATGCCAAAATAATAATAGTACAACTTAATAAACCTGTAAAAATTGTGTGTGTAAGGCCTA------ACAATAATACAAGAAAAAGTGTAA----------GGATAGGACCAGGACAAACATTCTATGCAACA---------------GGAGAAATAATAGGAGACATAAGACAAGCATATTGTATCATTAATAAAACTGA-ATGGAATAGCACTTTACAAGGGGTAAGTAAAAAATTAGAAGAACACT------------TCTCT------------AAAAAAGCAATAA---------A---ATGTGAACCG-----------TCATCAGGAGGGGACCTAGAAATTACAACACATAGC-TTT +Ref.C.ET.86.ETH2220.U46016 
GGACCATGCCATAATGTCAGCACAGTACAATGCACACATGGAATTAAACCAGTGGTATCAACTCAACTATTGTTA---AATGGTAGTATAGCAGAAGGTG---AGACAATAATTAGATTTGAAAATCTGACAAACAATGCCAAAATAATAATAGTACAGCTTAATGAATCTGTAGAAATTACTTGTACGAGACCCA------GCAATAATACAAGAGAAAGTATAA----------GGATAGGACCAGGACAAACATTCTATGCAACA---------------GGAGACATAATAGGAGATATAAGACAAGCACATTGTAACATTAGTGAAGAAAA-ATGGAATAAAACTCTACAAAAGGTAAAGGAAAAATTACAAAAGCACT------------TCCCT------------AATAAAACAATAG---------A---ATTTAAGCCA-----------TCCTCAGGAGGGGACCTAGAAATTACAACACATAGC-TTT +Ref.C.IN.95.95IN21068.AF067155 GGACCATGCCATAATGTCAGCACAGTACAATGTACACATGGAATTAAGCCAGTGGTATCAACTCAACTACTGTTA---AATGGTAGCCTAGCAGAAGGAG---GGATAATAATTAGATCTGAAAATCTGACAAACAATGTCAAAACAATAATAGTACATCTTAATCAACCTGTAGAAATTATGTGTACAAGACCCG------ACAATAATACAAGAAAAAGTATAA----------GGATAGGACCAGGACAAACATTCTATGCAACA---------------GGAGACATAATAGGAGACATAAGACAAGCACATTGTAACATTAGTGAAGATAA-GTGGAATGAAACTTTACAAAATGTAAGTAAAAAGTTAGCAGAACACT------------TCCCT------------AATAAAACAATAA---------T---ATTTAATTCA-----------TCATCAGGAGGGGACCTAGAAATTACAACACATAGC-TTT +Ref.D.CD.83.ELI.K03454 GGCCCATGCACAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTGGTGTCAACTCAACTGCTGTTG---AATGGCAGTCTAGCAGAAGAAG---AGGTCATAATTAGATCCGAAAATCTCACAAACAATGCTAAAAACATAATAGCACATCTTAATGAATCTGTAAAAATTACCTGTGCAAGGCCCT------ATCAAAATACAAGACAAAGAACAC----------CTATAGGACTAGGGCAATCACTCTATACTACA---------------A---GATCAAGATCAATAATAGGACAAGCACATTGTAATATTAGTAGAGCACA-ATGGAGTAAAACTTTACAACAAGTAGCTAGAAAATTAGGAACCCTTC------------TTAAC------------AAAACAATAAT------------AAAGTTTAAACCA-----------TCCTCAGGAGGGGACCCAGAAATTACAACACACAGT-TTT +Ref.D.CD.83.NDK.M27323 
GGCCCATGCTCAAATGTCAGCACAGTACAATGTACACATGGAATTAGACCAGTGGTGTCAACTCAACTGCTGTTG---AATGGCAGTCTAGCAGAAGAAG---AGATCATAATTAGATCTGAAAATCTCACAAACAATGTTAAAACCATAATAGTACAGCTTAATGCATCTATAGTAATTAATTGTACAAGGCCCT------ACAAATATACAAGGCAAAGGACAT----------CGATAGGACTAAGGCAATCACTCTATACAATAACAGGAAAAAAGAAG------AAAACAGGATACATAGGACAAGCACATTGTAAAATTAGCAGAGCAGA-ATGGAATAAAGCTTTACAACAGGTAGCTACAAAACTAGGAAACCTTC------------TTAAC------------AAAACAACAATAA---------C---TTTTAAGCCA-----------TCCTCAGGAGGGGACCCAGAAATTACATCACACATG-CTT +Ref.D.CD.84.84ZR085.U88822 GGCCCATGCAAAAATGTCAGCTCAGTACAATGTACACATGGAATTAGGCCAGTGGTGTCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGAAG---AGATCGTAATAAGATCTGAAAATCTCACAAACAATGCTAAAATCATAATAGTACATCTTAATCAATCTGTAGAAATTAATTGTACAAGGCCCT------ACAAAAAGGAAAGACAAAGGACAC----------CTATAGGACAAGGGCAAGCGCTCTATACAACA---------------AGGTATACAACAAGGATAATAGGACAAGCATATTGTAATATTAGTGGAGTAAA-ATGGAATAACACCTTACGACAGGTAGCTAGAAAATTAGGAAACCTTT------------TGAAC------------CAAACAAAAATAA---------T---TTTTAAACCA-----------TCCTCAGGAGGGGATCCAGAAATTACAACACACAGT-TTT +Ref.D.UG.94.94UG114.U88824 GGTCCATGCAAAAATGTCAGCACAGTACAGTGTACACATGGGATTAAGCCAGTAGTGTCAACTCAACTGTTGTTG---AATGGCAGTCTAGCAGAAGAAG---AAATAATAATTAGATCTGAAAATCTCACAAACAATGCTAAAATCATAATAGTACAGCTTAATGAGTCTGTACCAATTAATTGCATAAGGCCCT------ATAACAATACAAGACAAAGTACAC----------GTATAGGACCAGGGCAAGCACTCTTTACAACA---------------A---AAGTAATAGGAGATATAAGACAAGCACATTGTAACATTAGTGGAGCAGG-ATGGAATAAAACTTTACAACAGGTAGCTGAAAAATTAGGAAACCTTC------------TTAAC------------CAGACAACAATAA---------T---TTTTAAACCA-----------TCCTCGGGAGGGGACCCAGAAATTACAACACACAGC-TTT +Ref.F1.BE.93.VI850.AF077336 
GGGCCATGCAAGAATGTCAGCACAGTACAATGTACACATGGAATTAAACCAGTGGTATCAACTCAATTGTTGTTA---AATGGCAGCCTAGCAGAAGAAG---GTATAGTAATCAGATCTCAAAATATCTCAAATAATGCAAAAACCATAATAGTACACCTTAATGAATCTGTACAGATTAATTGTACAAGACCCA------ACAACAATACAAGGAAAGGTATAC----------ATTTAGGACCAGGACAAACATTCTATGCAACA---------------GGAGCAATAATAGGAGACATAAGAAAGGCACATTGTAACATTAGTGGAACACA-ATGGAATAACACTCTGGAGTATGTAAAGGCAGAATTAAAGTCGCATT------------TCCCTA-AT---A-----ATACAGCAATAA---------A---ATTTAACCAA-----------TCCTCAGGAGGGGACCTAGAAATTACAATGCATAGT-TTT +Ref.F1.BR.93.93BR020_1.AF005494 GGGTCATGCAAGAATGTCAGTACAGTACAATGTACACATGGAATTAAACCAGTGGTATCCACTCAATTGTTGTTA---AATGGCAGCCTAGCAGAAGGAG---AGATAGTAATCAGATCTCAAAATATCTCAGATAATGCAAAAACCATAATAGTGCACCTTAATGAATCTGTACAGATTAATTGTACAAGACCCA------ACAACAATACAAGAAAAAGAATAT----------CTTTAGGACCAGGACGAGTATTTTATACAACA---------------GGAGAAATAATAGGAGACATCAGAAAGGCACATTGTAACGTTAGTGGAACACA-ATGGAGGAACACGTTAGCAAAGGTAAAGGCAAAGTTAGGGTCTTATT------------TCCCT------------AATGCAACAATAA---------A---ATTTAACTCA-----------TCCTCAGGAGGGGACCTAGAAATTACAAGGCATAAT-TTT +Ref.F1.FI.93.FIN9363.AF075703 GGGCCATGTAGGAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTGGTATCAACTCAATTGTTGTTA---AATGGCAGCCTATCAGAAGGAG---GTATAATAATCAGATCTCAAAATCTCTCAGATAATGCAAAAACTATAATAGTACACCTTAATGAATCTGTACAGATCAATTGTACAAGACCCA------ACAACAATACAAGAAAAAGTATAC----------GTATAGGACCAGGACAATCATTTTATGCAACA---------------GGAGAAATAATAGGAGACATAAGAAAGGCACATTGTAACATTAGTGGAGAACA-ATGGAATAAAACTTTAGACCGAGTAAAGGCAGAGTTAAAGCTTCATT------------TTAAT-A--------------AAACAATAC---------A---ATTTAACTCA-----------TCCTCAGGAGGGGACCTAGAAATTACAATGCATAGT-TTT +Ref.F1.FR.96.MP411.AJ249238 
GGGCCATGCAAGAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTGGTATCAACTCAATTGCTGTTA---AATGGCAGCCTGGCAGAAGAAG---ATATAATAATCAGATCTCAAAATATCTCAGATAATGCAAAAACCATAATAGTACACCTTAATGAAAGTGTACAGATTAATTGTACAAGACCCA------ACAACAATACAAGAAAAAGTATAC----------ATTTAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGATATAATAGGAGATATAAAAAAGGCATATTGTGAAATTAATGGAACACA-ATGGAGTAAAACTAAAACACAGGTACAAGAAAAGTTACGGGCTCTTT------------TTAAT-A--------------AAACAATAA---------A---ATTTAACCAA-----------TCCTCAGGAGGGGACCTAGAAATTACAATGCATAGT-TTT +Ref.F2.CM.95.MP255.AJ249236 GGGCCATGCGAAAAGGTCAGCACAGTACAATGTACACATGGAATTAGACCAGTGGTATCAACTCAACTACTGCTA---AATGGCAGCCTAGCACAAGAAG---ATATAATAATTAGATCTAAAAATATCACAGATAATACAAAAAACATAATAGTACAGTTTAATAGATCTGTAATAATTGATTGTAGAAGACCCA------ACAACAATACAAGAAAAGGGATAC----------GTATAGGACCAGGACAAACATTCTTTGCAACA---------------GGTGAAATAATAGGGGATATAAGAAAGGCATATTGTAACATTAATAGAACACT-GTGGAATGAAACTTTAAAAAATGTAAGTGGAGAGTTCAAAAAACACT------------TCAAC-T--------------TCTCAGTAG---------C---CTTTAATTCA-----------TCCTCAGGAGGGGATGTAGAAATTACAACGCATAGT-TTT +Ref.F2.CM.95.MP257.AJ249237 GGATTATGCAGGAATGTTAGCACAGTACAATGTACACATGGAATTAAACCAGTGGTATCAACTCAACTACTGTTA---AATGGCAGCCTAGCAGAAGAAA---AGATGATAATTAGATCTGAAAATATCTCAGATAATACAAAAACCATAATAGTACAGTTTAAAAATCCTGTAAAAATTAATTGTACAAGACCCA------ACAACAATACAAGAAGAAGTATAC----------ATATAGGACCAGGACGAGCATTCTATGCAACA---------------GGTGAGATAATAGGAGATACAAGAAAGGCACATTGTAACATTAGTGAAAAACA-GTGGTATGACACTTTAATAAAGATAGCTACAGAGTTCAAAGACCAAT------------ATAAT-A--------------AAACAGTAG---------G---GTTTCAACCA-----------TCAGCAGGAGGGGATCTAGAAATTACAACACATAGT-TTT +Ref.G.BE.96.DRCBL.AF084936 
GGAACATGTAACAATGTCAGTACAGTACAATGTACACATGGAATTAAGCCAGTAGTATCAACTCAACTACTGCTG---AATGGCAGTTTAGCAGAAAAAG---ATATCATAATTAGTTCTGAAAATATCTCAGACAATGCCAAAGTCATAATAGTGCACCTTAATAGGTCTGTAGAAATTAATTGTACCAGACCCA------ACAACAATACAAGAAGAAGTGTAG----------CAATTGGACCAGGACAAGCATTCTATACAACA---------------GGAGAAGTAATAGGAGACATAAGAAAGGCACATTGTAATGTTAGTTGGACAAA-ATGGAATGAGACGCTAAGGGATGTCCAGGCAAAACTACAGGAATACT------------TTATT------------AATAAGAGCATAG---------A---ATTTAACTCA-----------TCTTCAGGAGGGGACCTAGAAATTACAACACATAGT-TTC +Ref.G.KE.93.HH8793_12_1.AF061641 GGACCATGTCAAAATGTCAGTACAGTACAATGTACACATGGAATTAAGCCAGTGGTATCAACTCAACTACTGCTG---AATGGCAGTTTAGCAGAAGGAG---AAATAATAATTAAATCAGAAAACATCACAGACAATACCAAAGTCATAATAGTGCAGCTTAATGAAACTGTAGAAATTACGTGTGTCAGACCCA------ACAACAATACAAGAAAAAGTATAC----------ACCTCGGGCCAGGACAAGCGCTCTATGCAACA---------------GGGGACATAATAGGAAATATAAGACAGGCACATTGTGATGTTAGTGGAAGAAA-TTGGAGTAACATGATAGAGAAGGTAAAAGCACAACTAAGAAAAATCT------------TTAAC-A--------------AGACCATAA---------C---CTTTGACTCA-----------TCTGCAGGAGGGGACCTAGAAATTACAACACATAGT-TTT +Ref.G.NG.92.92NG083.U88826 GGACCATGTAAAAATGTCAGTACAGTACAATGTACACATGGAATTAAGCCAGTGGTATCAACTCAACTACTGCTG---AATGGCAGTTTAGCAGAAGAAG---ATATAAGAATTAGATCTGAAAATTTCACAGACAATACCAAAGTCATAATAGTGCAGCTTAATAATAGTATAGAAATTAATTGTATCAGACCCA------ATAACAATACAAGAAAAAGTATAC----------CAATCGGACCAGGACAAGCGTTCTATGCAACA---------------GGTGATATAATAGGAGACATAAGACAAGCACATTGTAATGTTAGTAGAATAAA-ATGGAGGGAGATGTTAAAGAATGTCACAGCACAGCTAAGGAAAATCT------------ATAAT------------AATAAGAACATAA---------C---CTTTAACTCA-----------TCTGCAGGAGGGGACCTAGAAATTACAACACATAGT-TTC +Ref.G.SE.93.SE6165.AF061642 
GGACCATGTAAAAATGTCAGTACAGTACAATGTACACATGGAATTAAGCCAGTGGTATCAACTCAACTACTACTG---AATGGCAGTTTAGCAGAAGGAA---AAATAAAAGTTAGATCTGAAAATTTCACAGACAATACCAAAGTCATAATAGTACAGCTTAATAAAACTGTAGAAATTAATTGTACCAGACCCA------ACAACAATACAATGAAAAGGATAAGAATG---G-GAATTGGACCAGGACAAACGTTCTATGCAACA---------------GGTGCCATAATAGGAGACATAAGACAAGCACATTGTAATGTTACTAAAAGAAA-ATGGAAAGAGGCCTTACAGAATGTCGCTGCAGAACTAGGGAAAATCT------------TTAAT----AAGAGCAG--CGAGAACATAA---------C---CTTTAACTCA-----------TCTGCAGGAGGGGACCTAGAAATTACAACACATAGT-TTC +Ref.H.BE.93.VI991.AF190127 GGACCATGCACAAATGTCAGCACAGTACAATGCACACATGGAATTAGGCCAGTGGTATCAACTCAACTGTTGTTA---AATGGAAGCCTAGCAGAAGTGGAGGAGGTAATAATTAGATCTAAAAATATTACAGATAATACCAAAAACATAATAGTACAGTTAAATGAACCTGTACAAATTAACTGTACCAGAACAG------GCAATAATACGAGAAAAAGTATAC----------GTATAGGGCCAGGACAAGCATTCTATGCAACA---------------GGTGACATCATAGGAGATATAAGACGAGCATATTGTAATATTAGTGGAAAACA-ATGGAATGAGACCTTACACAAGGTAATCACCAAATTAGGAAGCTACT------------TTGAC------------AATAAAACAATAA---------T---TTTACAACCA-----------CCCGCAGGAGGGGATATAGAGATTATAACACATAGT-TTT +Ref.H.BE.93.VI997.AF190128 GGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATTAAGCCAGTGGTATCAACTCAACTGCTATTA---AATGGAAGCCTAGCAGAAGGAC---AGGTCATAATTAGATCTAAAAATATCTCAGACAATACCAAAAACATAATAGTACAGCTTGATAGTCCTATAGAAATTACCTGTACCAGACCTA------ACAATAATACAAGAAAAGGTATAC----------ATTTCGGGCCAGGGCAAGCATTCTATGCAACA---------------GGTGATATCATAGGAAACATAAGACAAGCACATTGTAATGTTAGTGAAGAGAA-ATGGAATAAGACTTTACAACAGATAGCTACACAATTAAGTAAATACT------------TTGTC------------AATAGAACATTAA---------T---TTTTAAACCA-----------CACTCAGGAGGGGACCTGGAAGTTACAACACATAGT-TTT +Ref.H.CF.90.056.AF005496 
GGATTATGTACAAATGTCAGTACAGTACAATGTACACATGGAATTAGACCAGTGGTATCAACTCAACTGCTATTA---AATGGAAGCCTAGCAGAAGAAC---AGATCATAATTAGAACTAAAAATATCTCAGACAATACCAAAAACATAATAGTACAGCTTAAGACACCAGTAAACATTACATGTACCAGGCCTA------ACAATAATACGAGAACAAGTATAC----------ATTTAGGGCCAGGACGAGCATTCTATGCAACA---------------GGTGACATCATAGGAGATATAAGACAAGCACATTGTAATATTAGTAGAACAGA-CTGGAATAAGACTTTACACCAGGTAGTTACACAATTAGGAATACACT------------TGAAC------------AATAGAACAATAA---------G---CTTTAAGCCA-----------AACTCAGGAGGGGACATGGAAGTTAGAACACATAGT-TTT +Ref.J.SE.93.SE7887.AF082394 GGATCATGCAAAAATGTCAGTACAGTACAATGTACACACGGAATTAAGCCAGTGGTATCAACTCAACTACTGCTA---AATGGCAGTATAGCAGAAGGAG---ACATAATAATTAGATCTGAAAACATCTCAGACAATGCCAAAAACATAATAGTACAACTTAATAAAACTGTAGAAATTGTGTGTTACAGACCTA------ATAACAATACAAGGAAAGGTATAC----------ACATGGGACCAGGACAAGTGCTCTACGCAACA---------------GGAGAAATAATAGGAAATATAAGAGAAACACATTGTAACATTAGTGAAAGAGA-TTGGAGTAACACTTTACGTAGAGTAGCTACAAAACTAAGAGAACACT------------TTAAT-A--------------AAACAATAA---------A---CTTTACATCA-----------CCCTCAGGAGGGGATATAGAAATTGTGACACATAGT-TTT +Ref.J.SE.94.SE7022.AF082395 GGACCATGCAAAAATGTCAGTACAGTACAATGTACACACGGAATTAAGCCAGTGGTATCAACTCAACTACTGCTA---AATGGCAGTGTAGCAGAAGGAG---ACATAATAATTAGATCTGAAAATATCTCAGACAATGCTAAAAACATAATAGTACAACTTAATGACACTGTAGAAATTGTGTGTACCAGACCTA------ATAACAATACAAGAAAAGGTATAC----------ACATGGGACCAGGACAAGTGCTCTACGCAACA---------------GGGGAAATAATAGGAGATATAAGGAAAGCATATTGTAACATTAGTAGAAAAGA-TTGGAATAACACTTTACGTAGAGTAGCTAAAAAACTAAGAGAACACT------------TTAAT-A--------------AAACAATAG---------A---CTTTACATCA-----------CCCTCAGGAGGGGACATAGAAATTACAACACATAGT-TTT +Ref.K.CD.97.EQTB11C.AJ249235 
GGGCCATGCACAAATGTCAGCACAGTACAATGTACACATGGAATTAAACCAGTGGTATCAACTCAATTGCTATTA---AATGGCAGCCTAGCAGAAGAAG---AGATAATTATTAGGTCTGAAGATATTACAAAGAATACAAAAAACATAATAGTACAGCTTAATGAAGCTGTAGAAATTAATTGTACAAGGCCAA------GCAACAATACAAGAAAAAGTATAC----------ATATAGGACCAGGAAGAGCATTCTATGCAACA---------------GGTGACATAATAGGAGATATAAGGCAGGCGCATTGTAACATTAGCGGAGGACA-GTGGAATAAAACTGTAAACCAAGTAAAGAAAGAATTAGGAAAACACT------------TTAAC-A--------------AAACAATAA---------TA---TTTCAACCA-----------TCCTCAGGAGGGGACCCACAAGTCACAAGGCACATT-TTT +Ref.K.CM.96.MP535.AJ249239 GGACCATGTAAAAATGTCAGCACAGTACAATGTACACATGGAATTAAACCAGTGGTATCAACTCAACTGCTGTTA---AATGGCAGCCTAGCAGAAGAAG---AGATAATAATTAGGTCTGAAAATATTACAGATAATACAAAAAACATAATAGTACAGCTTAATGAAACTGTACAAATTAATTGTACAAGGCCAA------ACAACAATACAAGAAAAAGTATAC----------ATATGGGACCAGGAAAAGCATTCTATACAACA---------------GGTGATATAATAGGAGATATAAGACAGGCACATTGCAACATTAGTGGAGAAAA-ATGGAACATGACTTTAAGCAGAGTAAAGGAAAAGCTAAAAGAACATT------------TTAAG------------AATGGAACAATA------------ACATTTAAACCA-CCA-------AACCCAGGAGGAGACCCAGAAATTCTAACGCACATG-TTT +Ref.U.CD.83.83CD003.AF286236 GGACCATGCAAAAATGTCAGCACAGTACAATGCACACATGGAATTAGGCCAGTGGTGTCAACTCAATTACTATTA---AATGGCAGTTTATCAGAAGAAG---AGGTAATAATTAGATCTGAAAATATCACAAACAATGCCAAAACCATAATAGTACAGCTTAATGAGACTGTAAAAATTAATTGTACCAGACCCGGATCCGACAAGAAGATAAGACAAAGTATA---------C-GTATAGGACCAGGAAAAGTATTCTATGCAAAA---------------GGTGGAATAACAG---------GACAAGCACATTGTAACATTACAGATGGGGA-ATGGAGGAATACTTTACAACAGGTAGCTATCGCATTAAGAAGACAAT------------TTAAT------------AATAAATCAATAA---------T---ATTTAACTCA-----------TCCTCAGGAGGGGACATAGAGATTACAACACATACT-TTT +Ref.U.CD.90.90CD121E12.AF457101 
GGACCATGCAGAAATGTCAGCACAGTACAATGCACACATGGAATTAGGCCAGTGGTGTCAACTCAATTACTATTA---AATGGCAGTTTATCAGAAAAAG---AGGTAATAATTAGATCTGAAAATATCACAAACAATGCCAAAACCATAATAGTACAGTTTAATGAGACTGTAAAAATTAATTGTAGCAGACCCA------ATAATAATACAAGGCAAGGTATAC----------ATATAGGACCAGGACGAGTATTCTATGCAACA---------------AGTAACATAATAGGAGATATAAGACAAGCACATTGTAACCTTACAAAATGGCA-ATGGAAGAGAGCTTTACAACAGGTAGCTGATGCATTAAGAAGACACT------------TTAAT-AAA---A-----------CAATAG---------T---ATTTAACTCA-----------TCCTCAGGAGGGGACTTAGAAATTACAACACATAGC-TTT +Ref.U.GR.99.GR303.AY046058 GGGCCATGCAAAAATGTCAGTTCAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGCTA---AATGGCAGTCTAGCAAAAGGAG---AGGTAATAATTAGATCTGAAAATTTCACAGACAATGCTAAAACCATAATAGTACAGCTTAATGAGTCTGTATCAATTAATTGTACCAGACCTA------GCAACAATACAAGAAAAAGTATAC----------ATATAGGACCAGGACAAGCGTTCTTTGCAACA---------------G---ACATAATAGGAGATATAAGACAAGCACATTGTAATGTTTCAGGAGCAAC-ATGGAACAACACTCTAAAAAGGGTAGTTGCAAAACTAAGAGAACAGT------------TTGGG------------AACAAAACAATAA---------T---CTTTAATTCA-----------TCTACAGGAGGGGATTTAGAAATTACAACACATAGT-TTA +Ref.01_AE.CF.90.90CF11697.AF197340 GGGCCATGCAAAAATGTCAGCTCAGTACAATGCACACATGGAATTAAGCCAGTGGTGTCAACTCAATTGTTGTTA---AATGGCAGTCTAGCAGAAGAAG---ACATAATAATCAGATCTGAAAATCTCACAAACAATGCCAAAACCATAATAGTGCACCTTAATAAATCTGTAGAAATCAATTGTACCAGACCCT------TCAAAAATATGAGAACAAGTGCAC----------GTATAGGACCAGGACAAGTATTCTATAAAACA---------------GGAAGCATAACAGGAGACATAAGAAAAGCATATTGTGAGATTAATGGAACAAA-ATGGAATGAAACTTTAAAACAGGTAACAAAAAAATTAAGAGAGCACT------------TTAAG------------AATAAGACAATAA---------T---CTTTCAACCA-----------TCCTCAGGAGGAGATCCAGAAATTACAATGCATCAT-TTT +Ref.01_AE.CF.90.90CF402.U51188 
GGGCCATGCAAAAATGTCAGCTCAGTACAATGCACACATGGAATTAAGCCAGTGGTATCAACTCAATTGCTGTTA---AATGGCAGTCTAGCAGAAGAAG---AGATAATAATCAGATCTGAAGATCTCACAGACAATGCCAAAACCATAATAGTGCACCTTAATAAATCTATAGAAATCAATTGTACCAGACCCT------TCAAGAAAGTAAGAATAAGTGCAA----------GGATAGGACCAGGACGAGTATTCCATACAACA---------------GGAAACATAAATGGTGATATAAGAAAAGCATATTGTGAAATTAATAAAACAAA-ATGGAAAGAAACTTTAAAACAGGTAACAAGAAAATTAAGAGAGCACC------------TTAATG-GG---A-----CAATGACAATAA---------G---CTTTCGACCA-----------TCCTCAGGAGGAGATCCAGAAATCACAATGCATCAT-TTT +Ref.01_AE.CF.90.90CF4071.AF197341 GGGCCATGTAAAAAGGTAAGCTCAGTACAATGCACACATGGAATTAAGCCAGTGGTATCAACTCAATTGCTGTTA---AACGGTAGTCTAGCAGAAGAAG---AAATAATAATTAGATCTGAAAATCTCACAAACAATGCCAAAACCATAATAGTGCACCTTAATAAATCTGTAGAAATCAATTGTACCAGACCTT------TCAAAAAGATGAGAACAAGTGTAA----------GAATAGGACCAGGGCGAGTATTCTATAAAACA---------------GGATCCATAACAGGAGATATAAGAAAAGCATATTGTGAGATTAATGGAACAAA-ATGGAATGAAACTTTACAACAGATAATTAGAAAATTAGAAGAGCACT------------TTAAT------------AATAAGACAATA------------CAATTTAAACCA-CCA-------TACTCAGGAGGAGATCTAGAAATTACAATGCACCAT-TTT +Ref.01_AE.TH.90.CM240.U54771 GGGCCATGTAAAAATGTCAGCTCAGTACAATGCACACATGGAATTAAGCCAGTGGTATCAACTCAATTGCTGTTA---AATGGCAGTCTAGCAGAAGAAG---AGATAATAATCAGATCTGAAGATCTCACAAACAATGCCAAAACCATAATAGTGCACCTTAATAAATCTGTAGAAATCAATTGTACCAGACCCT------CCAACAATACAAGAACAAGTATAA----------CTATAGGACCAGGACGAGTATTCTATAGAACA---------------GGAGATATAATAGGAAATATAAGAAAAGCATATTGTGAGATTAATGGAACAAA-ATGGAATAAAGTTTTAAAACAGGTAACTGAAAAATTAAAAGAGCACT------------TTAAT-A--------------AGACAATAA---------T---CTTTCAACCA-----------CCCTCAGGAGGAGATCTAGAAATTACAATGCATCAT-TTT +Ref.02_AG.CM.97.97CM_MP807.AJ286133 
GGGCCATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAACCAGTAGTATCAACCCAACTGCTACTA---AATGGCAGTCTAGCAGAAGAAA---AGGTAAGGATTAGATCTGAAAATATCACAAACAATGCCAAAACCATAATAGTACAGTTGGTTAAGCCTGTAAAAATTAATTGTACCAGACCTA------ACAACAATACAAGGAAAGGTGTAC----------GTATAGGACCAGGACAAACATTCTATGCAACA---------------GGGGAAATAATAGGGGATATAAGACAAGCATATTGTAATGTCAGTAGAACAAA-ATGGAATAACACTTTACAACAGGTAGTTACACAATTAAGGGAGTACT------------TTGGGA-AA---A-----ACAAAACAATAA---------A---ATTTGCTAAC-----------TCCTCAGGAGGGGATTTAGAAATTACAACACATAGT-TTT +Ref.02_AG.FR.91.DJ264.AF063224 GGGCCATGCAAGAATGTTAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTACTGTTA---AATGGCAGTCTAGCAGAAGAAG---AGGTAGTGATTAGATCTGAAAATATCACAAACAATGCCAAAAACATAATAGTGCAGTTGGTGACGCCTGTAAGAATCAATTGTACTAGACCTA------ACAACAATACAAGAAAGAGTGTGC----------GTATAGGACCAGGGCAAACATTCTATGCAACA---------------GGTGACATAATAGGGGATATAAGAAAAGCACATTGTAATGTCAGTAGATCAAA-ATGGAATAACACTTTACAACAGGTGGCTATACAATTAAGGAAGCACT------------TTAAC-A--------------CAACAATAA---------T---CTTTGCTAAC-----------CCCTCAGGAGGGGATATAGAAATTACAACACATAGT-TTT +Ref.02_AG.NG.-.IBNG.L39106 GGGCCATGCAAAAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGGAG---AAGTAGTGATCAGATCTGAAAATATCACAAACAATGCCAAAACCATAATAGTACAGTTGGCTAATCCTGTAAAAATTAATTGTACCAGACCTA------ACAACAATACAAGAAAAGGTGTAC----------ATATAGGACCAGGGCAAGCATTCTATGCAACA---------------GGTGACATAATAGGGGATATAAGACAAGCACATTGTAATGTCAGTAAAACAGA-ATGGAATAAAACTTTACATCAGGTAGTTACACAATTAAAGACGTACT------------TTAAG------------AACACCACAATAA---------T---CTTTGCTAAC-----------CCCTTAGGAGGGGATGTAGAAATTACAACACATAGT-TTT +Ref.02_AG.SE.94.SE7812.AF107770 
GGGCCATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGTTATTA---AATGGCAGCCTAGCAGAAGAAG---AGATAGTGATTAGATCTGAAAATTTTACAAACAATGCCAAAATCATAATAGTACAGTTGCATGAATCTGTAAAAATTAATTGTACCAGACCTG------GCAACAATACAAGAAAAAGTGTAC----------GTATAGGACCAGGGCAAACATTCTATGCAACA---------------GGTGACATAATAGGGGATATAAGACAAGCACATTGTAATGTCAGCTGGCAACA-ATGGAACAAAACTTTACACGATGTGGCTACAAAATTAAGGGAGTATT------------TTAAT------------AATACCACAATAA---------T---CTTTGATGAA-----------CCCTCAGGAGGGGATTTAGAAATTACAACACATAGT-TTT +Ref.03_AB.RU.97.KAL153_2.AF193276 GGGCCATGTACAAATGTCAGTACAGTACAATGTACACATGGAATTAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGTAGTCTAGCAGAAGAAG---AGGTAGTAATTAGATCTGTCAATTTCACGGACAATACTAAAACCATAATAGTACAGCTGAAAGAACCTGTGGAAATTAATTGTACAAGACCCA------ACAACAATACAAGAAAAGGTATTC----------ATATAGGACCAGGGAGAGCATTTTATGCAACA---------------GGAGACATAACAGGAGATATAAGACAAGCACATTGTAACATTAGTATAACAAA-ATGGAATAACACATTAAAACAGATAGTTATCAAATTAAGAAAACAAT------------TTGGG------------AATAAAACAATAG---------T---CTTTAATCAA-----------TCCTCAGGAGGGGACCCAGAAATTGTAATGCACAGT-TTT +Ref.03_AB.RU.98.RU98001.AF193277 GGGCCATGTACAAATGTCAGTACAGTACAATGTACACATGGAATTAAGCCAGTAGTGTCAACTCAACTGCTGTTA---AATGGTAGCCTAGCAGAAGAAG---AGGTAGTAATTAGATCTGTCAATTTCACGGACAATACTAAAACCATAATAGTACAGCTGAAAGAACCTGTAGAAATTAATTGTACGAGACCCA------ACAACAATACAAGAAAAGGTATTC----------ATATAGGACCAGGGAGAGCATTTTATGCAACA---------------GGAGACATAATAGGAGATATAAGACAAGCATATTGTAACATTAGTAGAACAAA-ATGGAATAACACATTAGAACAGATAGTTAGCAAATTAAGAAAACAAT------------TTAGG------------AATAAAACAATAG---------T---CTTTAATCAA-----------TCCTCAGGAGGGGACCCAGAAATTGTAATGCACAGT-TTT +Ref.04_CPX.CY.94.CY032.AF049337 
GGGCCATGCACAAATGTCAGCTCGGTACGATGCACTCATGGAATTAAGCCAGTGGTATCAACTCAATTGCTGTTA---AATGGAAGCTTAGCAACGGAAG---AGGTAGTAATTAGATCTAAAAATATCACAGACAATACCAAAAATATAATAGTACAGCTTGCAAAGGCTGTAAAAATTAATTGTACCAGACCTG------GCAACAATACAAGAAAAAGTGTAC----------ATATAGGGCCAGGACTAACATGGTATGCAACA---------------GGTGAAATAATAGGAGATATAAGACAAGCACATTGTAACATTAGTGGAAATGA-TTGGAATGACACCTTAAAAGTGATAAGTGAAGAATTGAAAAGACTCT------------TCCCT------------AATAAAACAATAA---------A---ATTTGCTCCA-----------CCCGTAGGAGGGGACCTAGAAATTACAACACATAGC-TTT +Ref.04_CPX.GR.91.97PVCH.AF119820 GGGTCATGCACAAATGTCAGCCCGGTACAATGCACTCATGGAATTAAGCCAGTGGTATCAACTCAATTGCTGTTA---AATGGAAGCTTAGCAACAGGAG---GGGTAGTAATTAGATCTAAAAATTTCACAGACAATCCCAAAAATATAATAGTACAGCTTGACAAGGCTGTAAAAATTAATTGTACCGGCCTTA------ACAACAATACAGGAGGAAGTGAACGTATC---G-GTATAGGGCCAGGACACACATGGTATGCAACA---------------GGTAACATAGTAGGAGATATAAGACAAGCACACTGTAACATTAGTGGAAGTGA-TTGGAATGAAGCTTTACAGAAGGTAGTTGTAAAATTAAGAGAACACT------------TCCCT------------AATAAAACAATAA---------T---ATTTAATCAA-----------TCCTCAGGAGGGGACCTAGAGATTACAACACATAGC-TTT +Ref.04_CPX.GR.97.97PVMY.AF119819 GGTCCATGCAAAAATGTCAGCTCGGTGCAATGCACTCATGGAATTAAGCCAGTAGTATCAACTCAATTGCTGTTA---AATGGAAGCTTATCAACAGAAG---GGGTAGTACTTAGATCTAAAAACTTCACAGACAATACCAAAAATATAATAGTACAGCTTGCAGAGGCTGTAAAAATTAATTGTACCAGACCTA------ACAACAATACAAGAAAAGGTGTAC----------ATATAGGACCAGGAAAAACATGGTTTGCAACA---------------GGGGAAGTAATAGGAGACATAAGAAAAGCACATTGTAACATTAGTGAAAAAGA-TTGGAATACAACTTTACAAAAGATAGTTGATGAATTAAGAAAACACT------------TCCCT------------AATAAAAACATAA---------C---ATTTGCTCCA-----------TCAGCAGGAGGGGACGTAGAAATTACAACACATAGT-TTT +Ref.05_DF.BE.-.VI1310.AF193253 
GGACCATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATTAAGCCAGTGGTATCAACTCAGTTGCTGTTA---AATGGCAGCCTAGCAAAAGAAG---GTATAATAATCAGATCTCAAAATATCTCAGATAATGCAAAAAACATAATAGTACACCTTAATGAATCTGTACATATTAATTGTACAAGGCCCA------ACAACAATACAAGAAAAAGTATAC----------ATTTAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGACATAATAGGAGACATAAGAAAGGCACATTGTAACGTTAGTAGGGAACA-ATGGAATAAAACCTTAATCCAGGTAGCGAAAGAGTTACAGTCTCACT------------TCCCT------------AATAAAACAATAA---------A---ATTTAACTCA-----------TCCTCAGGAGGGGACCTAGAAATTACAATGCATAGT-TTT +Ref.05_DF.BE.93.VI961.AF076998 GGGCCATGCAAGAATGTCAGCACAGTACAATGTACACATGGAATTAAGCCAGTGGTATCAACTCAATTGCTATTA---AATGGCAGTCTAGCAGAAGAAA---GTATAATAATCAGATCTCAAAATATCTTAGATAATACAAAAACCATAATAGTACACCTTAATGAATCTGTACAGATTAATTGTACAAGGCCCA------ACAACAATACAAGAAAAAGTATAC----------CTTTAGGACCAGGACAAGCATTCTATACAACA---------------GGGGACATAATAGGAGACATAAGAAAGGCACATTGTAACGTTAGTGGAGCACA-ATGGAATAAAACTTTAGAACAGGTAAAGGAAGAGCTAAGGGCTC---ATATAAAGGACA----------TAGGC--AATAAAACAATAG---------T---ATTTAACTCA-----------TCCGCAGGAGGGGACCTAGAAATTACAAGCCATATT-TTT +Ref.06_CPX.AU.96.BFP90.AF064699 GGACCATGTAAAAATGTCAGTACAGTACAATGTACACATGGGATTAAGCCAGTGGTATCAACTCAATTACTGCTG---AATGGCAGTTTAGCAGAAGGAA---ACATAACAATTAAAACTGAAAACATCACAGACAATACCAAAAATATAATAGTGCAGCTTAATCAGCCTGTAGAAATTAGGTGTACCAGACCCG------GTAACAATACAAGAAAAAGTATAT----------CCTTTGGACCAGGACAAGCGTTCATTGCAACA---------------GGTGACATAATAGGAGATATAAGACAAGCTCATTGTAATGTTAGTAGAGCAAA-TTGGACAGATATACTAGGAGAAGTGAAGGTAAAACTAGAAGAAGTTT------------TTAAT------------AACACACACATAA---------C---CTTTAAGTCA-----------TCTGCAGGAGGGGACCTAGAAATTACAACACATAGT-TTT +Ref.06_CPX.ML.95.95ML127.AJ288982 
GGACCATGTAAAAATGTTAGTACAGTACAATGTACACATGGAATTAAGCCAGTGGTATCAACTCAGTTACTGCTG---AATGGCAGTTTAGCAGAAGAAG---AAATAATAATTAAATCTAAAAACCTCACAGACAATACCAAAATCATAATAGTGCAGCTTAATAAATCTGTAGAAATTAGTTGTTCCAGACCCA------ATAACAATACAAGAAAAAGTATAC----------ACATTGGACCAGGGCAAGCGTTCTATGCAACA---------------GGTGAAATAATAGGAAATATAAGAAAAGCTCATTGTAATGTTAGTAGAAAAGC-TTGGAATAGTATGTTACAGAATGTGACTGCAAAACTAAAAGAACTCT------------TTAAT------------AATAAGAATATAA---------C---CTTCAATTCA-----------TCAGCAGGAGGGGACCTAGAAGTTACAACACATAGT-TTT +Ref.06_CPX.ML.95.95ML84.AJ245481 GGACCATGTAAAAATGTCAGTACAGTACAATGTACACATGGAATTAAGCCAGTGGTATCAACTCAGTTACTGCTG---AATGGCAGTTTAGCAGAAGATG---AAATAATAATTAAATCTGAAAACCACACAAACAATGCCAAGATCATAATAGTGCAGCTTAATAAAACTGTACAAATTAGGTGTACCAGACCCA------GTAACAATACAAGGAAAAGTATAC----------CCCTTGGACCAGGGCAAGCGTTCTATGCAACA---------------GGTGACATAATAGGAGATATAAGACAAGCTCATTGTAATGTTAGTAGAACAGC-ATGGAAGGAGACGTTACAGAATGTGACTGAAAAACTAAAACAACTCC------------TTAAT-A--------------CGAACATAA---------C---CTTTAATCCA-----------TCTGCAGGAGGGGACCTAGAAATTACAACACATAGT-TTT +Ref.06_CPX.SN.97.97SE1078.AJ288981 GGAGCATGTAAAAATGTCAGTACAGTACAATGTACACATGGAATTAAGCCAGTGGTATCAACTCAATTACTACTG---AATGGCAGTTTAGCAGAAGAGG---AAATAATAATTAAAACTGAAAACCTCACAGACAATAGCAAGAACATAATAGTACAGCTTAATAAATCTATAGAAATTAAGTGTACCAGACCCA------ATAACAATACAAGAAAAAGTATAT----------CCTTTGCACCAGGGCAAGCGTTCTATGCAACA---------------GGTGACATAATAGGAGATATAAGACAAGCTCATTGTAATGTTAGTAGAACAGA-TTGGAATAATATGTTAAAGAATGTGACTACAAAACTAATAGAAGTCT------------TTAAA-A--------------AGAACATAA---------C---CTTTAATTCA-----------TCTGCAGGAGGGGATCTAGAAATTACAACACATAGT-TTT +Ref.07_BC.CN.97.97CN001.AF286226 
GGACCATGCCATAATGTTAGCACAGTACAATGTACACATGGGATTAAGCCAGTGGTATCAACTCAACTACTTGTT---AATGGTAGCCTAGCAGAAGGAG---AAATAATAATTAGGTCTGAAAATCTGACAAACAATGTCAAAACAATAATAGTACATCTTAATCAATCTGTAGAAATTGTATGTACAAGACCCG------GCAATAATACAAGAAAAAGTATAA----------GGATAGGACCAGGACAAACATTCTATGCAACA---------------GGAGACATAATAGGAGACATAAGACAAGCACATTGTAACATTAGTGAAGATAA-ATGGAATGAAACTTTACAAAGGGTAAGTAAAAAATTAGCAGAACACT------------TCCAG------------AATAAAACAATAA---------A---ATTTGCATCA-----------TCCTCAGGAGGGGACCTAGAAGTTACAACACATAGC-TTT +Ref.07_BC.CN.97.C54A.AX149647 GGACCATGCCATAATGTTAGCACAGTACAATGTACACATGGGATTAAGCCAGTGGTATCAACTCAACTACTGTTA---AATGGTAGCCTAGCAGAAGGAG---AAATAATAATTAGATCTGAAAATCTGACAAACAATGTCAAAACAATAATAGTACATCTTAATCAATCTGTAGAAATTGTATGTACAAGACCCG------GCAATAATACAAGAAAAAGTATAA----------GGATAGGACCAGGACAAACATTCTATGCAACA---------------GGAGACATAATAGGAGACATAAGACAAGCACATTGTAACATTAGTGAAGATAA-ATGGAATGAAACTTTACAAAGGGTAAGTAAAAAATTAGCAGAACACT------------TCCAG------------AATAAAACAATAA---------A---ATTTGCATCA-----------TCCTCAGGAGGGGACCTAGAAGTTACAACACATAGC-TTT +Ref.07_BC.CN.98.98CN009.AF286230 GGACCATGCCATAATGTTAGCACAGTACAATGTACACATGGGATTAAGCCAGTGGTATCAACTCAACTACTGTTA---AATGGTAGCCTAGCAGAAGAAG---AAATAATAATTAGATCTGAAAATCTGACAAACAATGTCAAAACAATAATAGTACATCTTAATCAATCTGTAGAAATTGTATGTACAAGACCCG------GCAATAATACAAGAAAAAGTATAA----------GGATAGGACCAGGACAAACATTCTATGCAACA---------------GGAGAAATAATAGGAGACATAAGACAAGCACATTGTAACATTAGTGAAGATAA-ATGGAATAAAACTCTACAAGGGGTAAGTGAAAAATTAGCAGAACACT------------TCCAG------------AATAAAACAATAA---------A---ATTTGCATCA-----------TCCTCAGGAGGGGACCTAGAAATTACAACACATAGC-TTT +Ref.08_BC.CN.97.97CNGX_6F.AY008715 
GGACAATGCCATAATGTTAGCACGGTACAATGTACACATGGAATTAAGCCAGTGGTATCAACTCAACTACTGTTA---AATGGTAGCCTAGCAGAAAGAG---AGATAATAATTAGATCTGAAAATCTGACAAACAATGTCAAAACAATAATAGTACATCTTAATCAATCTGTAGAAATTGTATGTACAAGACCCA------ACAATAATACAAGAAAAAGTATAA----------GGATAGGACCAGGACAAACATTCTATGCAACA---------------GGAGACATCATAGGAGACATAAGACAAGCACATTGTAACATTAGTAAAGATGC-ATGGTATGAAACTTTACAGAGGGTAAGTAAAAAATTAGCAGAACACT------------TCCCT------------AATAAAACAATAA---------A---ATTCGCATCA-----------TCCTCAGGAGGGGACCTAGAAATTACAACACATAGC-TTT +Ref.08_BC.CN.97.97CNGX_7F.AY008716 GGACAATGCCATAATGTTAGCACGGTACAATGTACACATGGAATCAAGCCAGTGGTATCAACTCAACTACTGTTA---AATGGTAGCCTAGCAGAAAGAG---AGATAATAATTAAATCTGAAAATCTGACAAACAATGTCAAAACAATAATAGTACATCTTAATCAATCTGTAGAAATTGTATGTACAAGACCCA------ACAATAATACAAGAAAAAGTATAA----------GGATAGGACCAGGACAAACATTCTATGCAACA---------------GGAGACATCATAGGAGACATAAGACAAGCACATTGTAACATTAGTAAAGATGA-ATGGTATGAAACTTTACAGAGGGTAAGTAAAAAATTAGCAGAACACT------------TCCCT------------AATAAAACAATAA---------A---ATTTGCATCA-----------TCCTCAGGAGGGGACCTAGAAATTACAACACATAGC-TTT +Ref.08_BC.CN.97.97CNGX_9F.AY008717 GGACAATGCCATAATGTTAGCACGGTACAATGTACACATGGAATTAAGCCAGTGGTATCAACTCAACTACTGTTA---AATGGTAGCCTAGCAGAAAGAG---AGATAATAATTAGATCTGAAAATCTGACAAACAATGTCAAAACAATAATAGTACATCTTAATCAATCTGTAGAAATTGTATGTACAAGACCCA------ACAATAATACAAGAAAAAGTATAA----------GGATAGGACCAGGACAAACATTCTATGCAACA---------------GGAGACATCATAGGAGACATAAGACAAGCACATTGTAACATTAGTAAAGATAA-ATGGTATGAAACTTTACAGGGGGTAAGTAAAAAATTAGCAGAACACT------------TCCCT------------AATAAAACAATAA---------A---ATTTGCATCA-----------TCCTCAGGAGGGGACCTAGAAATTACAACACATAGC-TTT +Ref.08_BC.CN.98.98CN006.AF286229 
GGACAATGCAATAATGTTAGCACGGTACAATGTACACATGGAATTAAGCCAGTGGTATCAACTCAACTACTGTTA---AATGGTAGCCTAGCAGAAGGAG---AGATAATAATTAGATCTGAAAATCTGACAAACAATGTCAAAACAATAATAGTACATCTTAATCAATCTGTAGAAATTGTATGTACAAGACCCA------ACAATAATACAAGAAAGAGTATAA----------GGATAGGACCAGGACAAACATTCTATGCAACA---------------GGAGACATCATAGGAGACATAAGACAAGCACATTGTAACATTAGTAAAAATAA-ATGGAACGAAACTATACAGAGGGTAAGTGAAAAATTAGCAGAACACT------------TCCCT------------AATAAAACAATAA---------C---ATTTGCACCA-----------TCCTCAGGAGGGGACCTAGAAATTACAACACATAGC-TTT +Ref.10_CD.TZ.96.96TZ_BF061.AF289548 GGACCATGCAAAAACATCAGTACAGTACAGTGTACACATGGGATTAAACCAGTAGTGTCAACTCAACTGTTATTG---AATGGCAGTCTAGCAGAAGAAG---AGATAATAATTAGATCTGAAAATCTCACAGATAATGCTAAAAATATAATAGTGCAGCTTAATGAGTCTGTAACAATTAATTGCGTGAGACCCA------ACAATAATACAAGAAGAAGTATAA----------GGATAGGACCAGGACAAACATTCTATGCAACA---------------G---ACATAATAGGAAACATAAGACAAGCATATTATAACATTAGTGGAACAGA-ATGGAATAAAACTTTACAACAAGTAGCTAAAAAATTAGGAGACCTTT------------TTAAC------------CGACCAACAATAA---------T---TTTTAAACCA-----------TCCTCAGGAGGGGACCCAGAAATTACAACACACACC-TTT +Ref.10_CD.TZ.96.96TZ_BF071.AF289549 GGTCCATGCGACAATGTCAGTACAGTACAATGTATACATGGGATTAAGCCAGTAGTGTCAACTCAACTGTTATTG---AATGGCAGTCTAGCAGAAGAAG---AGATAATAATTAGATCTGAAAATATCACAAGTAGTCTTAAAACTATAATAGTACAGCTTAATGAGTCTGTAACAATTAATTGCACGAGACCCA------ACAATAATACAAGAAAAAGTATAA----------GGATAGGACCAGGACAAACATTCTATGCAACA---------------GGAGACATAATAGGAGATATAAGACAAGCACATTGTAACATTAGTGGAACAGA-ATGGAATAAAACTTTACAACAGGTAGCTAAAAAACTAAGAGACCTTC------------TTAAC------------AAAACAAAAATAA---------T---TTTTAAACCT-----------TCTTCGGGAGGGGACCCAGAAGTTACAACACACACC-TTT +Ref.10_CD.TZ.96.96TZ_BF110.AF289550 
GGACCCTGTAAAAACGTCAGTACAGTACAGTGCACACATGGGATTAGGCCAGTAGTGTCAACTCAGCTGTTATTG---AATGGCAGTCTAGCAGAAGAAG---AGATAATAATTAGATCTGAGAATCTCACAGATAATGCTAAAACTATAATAGTACATCTTAATGAGTCTGTAACAATTAATTGCACGAGACCCA------ACAATAATACAAGAAAAAGTATAA----------GGATAGGACCAGGACAAACATTCTATGCAACA---------------GGAAACATAATAGGCAATATAAGGCAAGCATATTGTGAAATTAATGGGACAGA-ATGGAATAAAACTTTACAACAGGTAGCTAACAAATTAGGAAAACTTC------------TTAAC-A-----------AAACAACAATAC------------ATTTTCGACCA-----------TCCTCGGGAGGAGACCCAGAGATTACAGAACATACC-TTT +Ref.11_CPX.CM.97.MP818.AJ291718 GGACCATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTACTGTTA---AATGGCAGTCTAGCAGAAGAAA---AGGTAAAGATTAGATCTGAAAACTTCACAAACAATGCTAAAACCATAATAGTACAGTTTAACAATACTGTAAGAATTAATTGTACCAGACCTG------GCAACAATACAAGAAAAAGTATAC----------ATTTAGGACCAGGACACGCTTTCTATGCCACA---------------GGTGCTATAATAGGGGATATAAGACAAGCACATTGTAAAGTCAGTAAAGCAGA-ATGGCTCAACACTTTACAACAGGTAGCTACACAATTACGAGGAAAGT------------TTAAC-A--------------AAACAATAA---------T---CTTTGATAAC-CCC-------TCCCCAGGAGGGGATATAGAAATTACATCACATAGC-TTT +Ref.11_CPX.FR.99.MP1298.AJ291719 GAGCCATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAATAGTATCAACTCAACTACTGATA---AATGGCAGTCTAGCAAAAGGGG---AGGTAAAGATTAGATCTGAAAACTTCACAGACAATGCTAAAACCATAATAGTTCAGCTAAACAGTTCTGTAATGATTAATTGTACCAGACCTA------ACAACAATACAAGAAAAGGGATAC----------AGATAGGACCAGGACGAACTGTCTATGCAACA---------------GGGGCTATAATAGGGGACATAAGGAAAGCACATTGTAATATCAGTG-------GATGGAAAAACACGTTAGAACAGGTAGCTATGCAATTAAGAAAACAGT------------TTAAC-A-----------AAACAAACATAA---------T---CTTTAATAGT-----------ACCTCAGGAGGGGATATAGAAATTACAACACATAGC-TTT +Ref.11_CPX.FR.99.MP1307.AJ291720 
GGACCGTGCAACAATGTCAGCACAGTACAATGCACACATGGGATTAAACCAGTAGTATCAACTAAACTACTGCTAAATAATGGCAGTCTAGCAGAAGGAG---AGGTAAAGATTAGATCTGACAACTTCACAGACAATGCTAAAACCATAATAGTACAGCTTAACGAAACTGTAAAAATTAATTGTACCAGACCTA------ACAACAATACAAGAAAAGGTATAC----------ATACAGGACCAGGACAAGCTCTCTATACAACA---------------GGTGCTATAATAGGAGATATAAGACAAGCATATTGTAACATCAGTGGAAAAGC-CTGGAATAACACACTAAAACAGGTAGCTACACAATTATGGAGGAAGT------------TTAAC-A--------------AAACAATAG---------T---CTTTACTAAC-----------TCCTCAGGAGGGGATTTAGAAATTACAACACATAGC-TTT +Ref.11_CPX.GR.-.GR17.AF179368 GGACCATGCAAAAATGTCAGCACAGTACAATGTACACATGGAATCAAGCCAGTAGTATCAACTCAACTACTGTTA---AATGGCAGTCTAGCAGAAGGAG---AGGTAAGGATTAGATCTGAAAACCTCACAAACAATGCTAAAACCATAATAGTACAGCTTAACAGTACTGTAAGAATTAATTGTACCAGACCTA------ACAACAATACAAGAAAGGGTATAC----------ATATAGGACCAGGACAAGCTTTTTATGCAACA---------------GGTGATATAATAGGGGATATAAAACAAGCACATTGTAACGTCAGTAGAGCAGA-CTGGAATAACACTTTACAACAGGTAGCTGAACAATTACATAATAACT------------TTAAC-A--------------AAACAATAG---------T---ATTTAATGAG-----------CATTCAGGAGGGGATTTAGAAGTTACAACACATAGT-TTT +Ref.12_BF.AR.97.A32989.AF408630 GGGCCATGCAAGAATGTCAGCACAGTACAATGTACACATGGAATTAAACCAGTGGTATCAACTCAATTGTTGTTA---AATGGCAGTCTACCAGAAGGAG---AGATAATAATCAGATCTCAAAATATTTCACATAATACAAAAACCATAATAGTACACCTTAATGAATCTGTACAGATTACTTGTATAAGACCCA------ACAATAATACAAGAAAAAGTATAC----------ATATAGGACCAGGAAGAGCATTTTATGCAACA---------------GGAGACATAATAGGAGACATGAGAAAGGCATATTGTAGCGTTAATGGAACACA-ATGGAATAAAACGTTAGAACGGGTAAGGGAAAAGCTAAGGTCTTATT------------TGCCT------------AATACAACAATAA---------A---ATTTAACTCA-----------TCCTCAGGAGGAGACCCAGAAATTACAACACATAGT-TTC +Ref.12_BF.AR.99.ARMA159.AF385936 
GGGCCATGCCAGAATGTCAGCACAGTACAATGTACACATGGGATTAAACCAGTGGTATCAACTCAATTGTTGTTA---AATGGCAGCCTAGCAGAAGAAG---ATATAATAATTAGATCTCAAAACATCTCAGATAATACAAAAACCATAATAGTTCATCTTAATGAATCTGTACAGATTAACTGTACAAGACCCA------ACAATAATACAAGAAAAAGTATAC----------AGTTAGGACCAGGACGGGCATTTTATGCAACA---------------GGAGACATAATAGGGGACATTAGAAAGGCACATTGTAATGTTAGTGGAGCACA-ATGGAGTAAAACGGTAGAACAGGTAAAGGCAAAGTTAAGATCTCAAT------------TCAAT-A--------------AAACAATAA---------A---ATTTAACTCA-----------TCCTCAGGAGGGGACCCAGAAATTACAATGCATAGT-TTC +Ref.12_BF.UY.99.URTR23.AF385934 GGGCCATGCAAGAATGTCAGCACAGTACAATGTACACATGGAATTAAACCAGTGGTATCAACTCAATTGTTGTTA---AACGGCAGCCTAGCAGAAGAAG---ATATAGTAATCAGATCTCAAAATTTCTCAGATAATGCAAAAATCATAATAGTACACCTTAATGACTCTGTACAGATTAATTGTACAAGGCCCA------ACAATAATACAAGAAAAGGTATAC----------ATATAGGACCAGGACAAGCATTTTATGCAACA---------------GGGGACATAATAGGAGACATCAGAAAGGCACATTGTAACGTTAGTGGAGCAAA-ATGGAATGAAACATTAGAACGGGTAAAGACAAAATTAAGGTCTTATT------------TCTCTA-AT---A-----CAACATCAATAA---------C---ATTTAACTCA-----------TCCTCAGGAGGGGACCCAGAAATTACAATGCATAGT-TTC +Ref.12_BF.UY.99.URTR35.AF385935 GGGCTATGCCAAAATGTCAGCACAGTACAATGTACACATGGAATTAAACCAGTGGTATCAACTCAATTGTTGTTA---AATGGCAGCCTAGCAGAAGAAG---ATATAGTAATCAGATCTCAAAATATCTCAGATAATGTGAAAACAATAATAGTACACCTTAATGAATCTGTACAGATTAATTGTATAAGACCTA------ACAATAATACAAGAAAAAGTATAC----------AGCTGGGACCAGGACGAGCATTTTATGCAACA---------------GGAGACATAATAGGAGACATCAGACAGGCACATTGTAACGTTGATGGAAAACA-ATGGCATAAAAAGTTAGAAGAGGTGAGGAAAGAGTTAAAGTCTCATT------------TCTCT------------GATGCAGAAATAA---------A---ATTTAACTCA-----------TCCTCAGGAGGGGACCTAGAAATTGCAATGCATAGT-TTC +Ref.13_CPX.CM.96.1849.AF460972 
GGGCCATGCAGGAATGTCAGCTCAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGCCTAGCAGAAGGAG---AAATAAAGATTAGAGCTAAAAACTTCTCAGACAATGTTGGAAACATAATAGTACAGCTTGCTACGCCTGTAAGAATTAATTGTACCAGACCTA------ACAACAATACAAGAAAAAGTATAC----------GGATAGGACCAGGACAAGCATTCTATGCAACA---------------GGGGATATAATAGGAGATATAAGACAGGCACATTGTAACATCAGTAGACAAAA-ATGGAATAACACCTTACAACAGGTAGCTGCACAGTTAAGAAAATATT------------TGAAAG-AT---A-----ATACAACAATAA---------T---CTTTGCTAAT-----------CCCTCAGGAGGGGATTTAGAAGTTACAACACATAGT-TTT +Ref.13_CPX.CM.96.4164.AF460974 GGGCCATGCAAGAATGTCAGCTCAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGCCTAGCAGAAGAAG---AAGTAATGATTAGATCTGAAAACTTCTCAGACAATGTTAAAACCATAATAGTACAGCTTGCTAAGCCTATAAGAATTAATTGTACCAGACCTA------ACAACAATACAAGGAAAAGTATAC----------GGATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGGAATAATAGGGGATATAAGACAAGCATATTGTAACATAAGTAAAAAAGA-ATGGAATAACACGTTACAAGAGGTAGCTAAACAATTAAGAGAACACC------------TTAACA-AT---AAGAC--AGC---------AACAATAGT---CTTTGCTAAT-----------CACTCAGGAGGGGATATAGAAATTACAACACATAGT-TTT +Ref.14_BG.ES.00.X475.AF423758 GGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGCTA---AATGGCAGTCTAGCAGAAGAAA---AAATAGTAATTAGGTCTAAAAACTTCACGGACAATACTAAAACCATAATAGTACAGCTGAAAGACCCTATAGAAATTAATTGTACAAGACCCA------ACAACAATACAAGAARAGGAATAA----------CTATGGGACCAGGGAGAGTATGGTATACAACA---------------GGACAAATAATAGGAGATATAAGAAAAGCACATTGTAACATTAGCAAARCAAA-ATGGAATAACACTTTAGGACAGATAGCTGAAAAATTAAGAGAACAAT------------TCATG------------AATAAAACAATAG---------T---CTTTMAGCGT-----------TCCTCAGGAGGAGACCCAGAAATTGTAATGCACAGT-TTC +Ref.14_BG.ES.00.X477.AF423759 
GGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGCTA---AATGGCAGTCTAGCAGAAGAAG---AAATAGTAATTAGGTCTAAAAACTTCACRGACAATGCTAAAACCATAATAGTACAGCTGAAAGAACCTATAGAAATTAATTGTACAAGACCCA------ACAACAATACAAGAAAAAGTATAC----------CTATGGGACCAGGGAGAGCATTGTATGCAACA---------------GGACAAATAATAGGAGATATAAGACAAGCACATTGTAACATTAGCAAAACAAA-ATGGAATAACACTTTAGGACAGATAGTTAATAAATTAAGAGAACAAT------------TCAGG------------AATAAAACAATAA---------T---CTTTCAGCAT-----------TCCTCAGGAGGAGACCCAGAAATTGAAATGCACAGT-TTC +Ref.14_BG.ES.00.X623.AF450097 GGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGAAG---ACATAGTAATTAGGTCTAAAAACTTCTCGGACAATGCTAAAACCATAATAGTACAGCTGAAGGAGCCTATASAAATTAATTGTACAAGACCCA------ACAACAATACAAGAAGAAGCATAA----------CTATGGGACCAGGGAAAGTATTGTATACAACA---------------GGACAAATAATAGGAAATATAAGAAGTGCACATTGTAACCTTAGCACAGAAAG-ATGGAATACCACTTTAAGACAAATAGTTGGAAAATTAAGAGAACAAT------------TCGGG------------AATAAAACAATAG---------T---CTTTAATCAT-----------TCCTCAGGAGGGGACCCAGAAATTGTAATGCACAGT-TTC +Ref.14_BG.ES.99.X397.AF423756 GGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGCTA---AATGGCAGTCTAGCAGAAGAAG---AAATAGTAATTAGGTCTAAAAACTTGAGTGACAATACTAAAACCATAATAGTACAGCTGAAAGACCCTATAGAAATTAATTGTACAAGACCCA------ACAACAATACAAGAAAAAGAATAA----------CTATGGGACCAGGGAGAGTATTGTATACAACA---------------GGACAAATAATAGGAGATATAAGAAAAGCACATTGTAACATTAGCAAAACAAA-ATGGAATAACACTTTAGGACAGATAGTTAGAAAATTAAGAAAACAAT------------TCATG------------AATAAAACAATAG---------T---CTTTCAGCGT-----------TCCTCAGGAGGAGACCCAGAAATTGTAATGCACAGT-TTC +Ref.N.CM.95.YBF30.AJ006022 
GGAAGCTGTACAAATGTGAGTACTGTACAATGCACACATGGAATAAAGCCAGTGATATCCACTCAGTTAATCCTA---AATGGAAGCTTAAATACAGATG---GAATTGTTATTAGAAATGATA------------GTCACAGTAATCTGTTGGTGCAATGGAATGAGACAGTGCCAATAAATTGTACAAGGCCAG------GAAATAATACAGGAGGACAGGTGC----------AGATAGGACCTGCTATGACATTTTATAACATA---------------GAAAAAATAGTAGGAGACATTAGACAAGCATACTGTAATGTCTCTAAAGAACT-ATGGGAACCAATGTGGAATAGAACAAGAGAGGAAATAAAGAAAATCC------TGG------G----GAAAAAC--------AACATAACCTTCAGGG-------------CTCGAGAGAGGA----ATGAAGGAGACCTAGAAGTGACA-CACTTAATGTTC +Ref.N.CM.97.YBF106.AJ271370 GGAAGCTGTACAAATGTGAGTACTGTACAATGCACACATGGAATAAAGCCAGTGATATCCACTCAGTTAATCCTA---AATGGAAGCTTAGATACAGATG---ATATTGTTATTAGACATCATG------------G---GGGTAATCTGTTGGTGCAATGGAATGAGACAGTGTCAATAAATTGTACAAGGCCAG------GAAATAATACAGGAGGACAGGTGC----------AGATAGGACCTGCTATGACATTTTATAATATA---------------GAAAAAATAGTAGGAGACGTTAGACAAGCATACTGTAATGTCTCTGAAG----AATGGGGATCAATGTGGAATAAAACAAAAAAGAAGATAAAAAGACTCC------T------GGGAA-AC---A-----ACAC------AACTTTCAAAG-------------CTCAGGATAAAA----ATGGAGGAGACCTAGAAGTAACA-CACTTAATGTTC +A1.UA.1997.ukr970063 GGGCCATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAGAAAG---AGGTAATGATTAGATCTGAAAATATCACAGACAATAGCAAAATCATAATAGTACAGCTTACTGAGCCTGTAAACATCACTTGTATCAGACCTG------GCAACAATACAAGAACAAGTATAC----------GTATAGGACCAGGACAAACCTTCTATGCAACA---------------GGTGATGTAATAGGGGACATAAGAAAAGCATATTGTAATGTCAGCAGAGCAGC-ATGGAATAGCACTTTACAAAAGATAAGTACACAATTAAGAAGATACT------------TTAAT------------AACAAAACAATAA---------T---CTTTAAGAAC-----------TCCCCAGGAGGGGATTTAGAAGTTACAACACATAGT-TTC +A1.FI.1991.FIN91121 
GGGCCATGCACGAATGTTAGCACAGTACAGTGCACACATGGAATCAGGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTGGCAAAAGAAG---AGGTAAGAATTAGATCTGAAAATATCACAAATAATGTCAAAACTATAATAGTACAACTTGTCAAGCCTGTAAACATTACCTGTATCAGACCTA------ACAATAATACAAGAAAGAGTATAC----------ATTTAGGACCAGGACGAGCATTCTATGCAACA---------------GGTGACATAATAGGGAATATAAGAAAAGCACATTGTATTGTCAATGAATCAGA-ATGGAATGAAGCTTTACAACAGGTAGCTACACAATTAGGAAAATACT------------TTGAG------------AACAAAACGATAA---------A---TTTTACTAGC-----------CCCTCAGGAGGGGATCTAGAAGTCACAACACATAGT-TTT +A1.FI.1991.FIN9199 GGGCCACGCAAGAATGTCAGCACAGTACAATACACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAAAAGAAA---AGGTAAAAATTAGATGTGAAAATATCACAAACAATGCCAAAACTATAATAGTACAACTGGTCGAGCCTGTGCAAATTAATTGTACCAGACCTA------ACAACAATACAAGAAGAAGTGTAC----------GTATAGGACCAGGACAGGCATTTTATACAACA---------------G---ACATAATAGGGGATATAAGACAAGCCCATTGTAATGTCAGTAGATCAGC-ATAGAATAAAACTTTACAAAAGGTAGCTGAGCAATTAGGAGGATACT------------TTGGG------------AACAAAACAATAA---------T---ATTTGCTAAC-----------GCCTCAGGAGGGGATCTAGAAATCACAACACATAGT-TTT +A1.KE.1995.Q168-a2_PNS32d GGGCCATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAAAAG---AGGTAATGATTAGATCTGAAAATTTCACAAACAACGCCAAAAACATACTAGTACAATTTAAAGAGCCTGTAAAAATTAATTGTACCAGACCTG------ACAACAATACAAGAACAAGTATAC----------GTATAGGACCAGGACAAGCATTCTATGCAACA---------------G---GTATAATAGGGGATATAAGACAAGCATATTGTACTGTCAATGGATCAGA-ATGGAATAAAGCGTTACAAAAGGTAGTTGAACAATTAAGAAGCTCCT------------TCGAG------------AACAAAACAATAA---------T---ATTTGCTAAC-----------TCCTCAGGAGGGGATCTAGAAATCACAACACATAGT-TTT +A1.KE.1994.Q259-w6_PNS59d 
GGGCCATGCAAGAATGTTAGCTCAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGAAA---AAGTAAGAATTAGATCTGAGAATATCACAGACAATGGCAAAAACATAATAGTACAACTTAAAACACCTGTAAACATTAGTTGTACCAGACCTA------ACAACAATACAAGAAAAAGTATAC----------GTATAGGACCAGGACAAGCATTCTATGCAACA---------------GATGACATAATAGGGAATATAAGACAAGCATATTGTACTGTTAATAGAACACA-ATGGGATGACACTTTGCAAGAGGTAGCCAACCAATTAAGAATATACT------------TTAAC-A--------------AAACAATAA---------T---TTTTAATAAC-----------TCAGCAGGAGGGGATCTAGAAATTACAACACATAGT-TTT +A1.KE.1995.Q461 GGGCCATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTTTAGCAGAAAGAA---AGGTAATGATTAGATCTGAAAATATTACAAACAATGCCAAAAACATAATAGTACAATTTACCAAGCCTGTGAATATTACTTGTATCAGACCTG------GCAACAATACAAGAAAAAGTGTAC----------GCATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGACATAACAGGGGACATAAGAAACGCACATTGTGTTGTCAATAGAACAGA-GTGGAATAACACTTTACAAAAGGTAGTTGAACAATTAAGAGAATACT------------TCCCT------------AATAAAACAATAA---------T---CTTTACTAAC-----------TCCTCAGGAGGGGACATAGAAATTACAACACATAGT-TTT +A1.KE.1996.Q769-b9_PNS79d GGGCTATGCAAAAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTGTCAACTCAACTACTGTTA---AATGGCAGTCTAGCAGAAGGAA---AGGTAATGGTCAGATCTGAAAATATCACAAACAATGCCAAGAACATAATAGTACAATTTAACAATTCTGTACAAATTAATTGTATCAGACCTG------GCAACAATACAAGAAAAAGTATAC----------ATTTAGGACCAGGAAAAGTATTCTATGCAACA---------------A---ATATAATAGGGGATATAAGAAAAGCACATTGTAATGTCAATAGACAACA-ATGGAATAAAGCTTTGCAAGATGTAGCCACACAATTAAGAACACACT------------TTAGA------------AACAGAACAATAA---------T---CTTTAATAAT-----------TCCTTAGGAGGAGATCTAGAAATAACAACACATAGT-TTT +A1.KE.1994.Q842-d12_PNS70d 
GGGCCATGCAAGAATGTCAGCACAGTCCAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTACTGTTA---AATGGCAGTCTAGCAGAAAAAG---AGGTAAAAATTAGATGTGAAAATATCACAAACAATGCTAAAACTATAATAGTACAACTTGTCAATCCTGTGAAAATTAATTGTACCAGACCTA------ACAACAATACAAGAAAAAGTATAC----------ATATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGACATAATAGGGGATATAAGACAAGCACATTGTAATGTCAACAGGACAGA-ATGGAACAACACTTTGCACCAGGTAGTCGAACAATTAAGAAAACACT------------TTAAC-A--------------AAACAATAA---------A---CTTTGCTAAC-----------TCCACAGGAGGGGATCTAGAAATAACAACACATAGT-TTT +A.CD.1997.KCC2 GGGCCATGCAAAAATGTCAGCACAGTACAATGTACACATGGAATTAAACCTGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTTTAGCAGAAGGAG---GAATAAAGATTAGATCTGCAAATATCTCATACAATGCCAAAAACATAATAGTACAGCTTGACATACCTGTAAAAATTAATTGTAGTAGGCCCA------ACAACAATACAAGAACAAGTGTGC----------GTATAGGACCAGGGCAAACATTTTATGCAACA---------------GGTGATATAATAGGGAATATAAGACAAGCACACTGCAACCTTAGTAGAACAGC-ATGGAACGATACTTTATACAATGTATCTAAAGCCTTAAGAGAACACT------------TCCCA------------AATAAAACAATAA---------T---CTTTAATAAA-----------TCATCAGGAGGGGACCTAGAAGTGACACAACATATG-TTT +A.CD.1997.KMST91 GGGACATGCAACAATGTCAGCCCAGTGCAATGCACACATGGGATTAAGCCAGTGGTGTCCACTCAGCTGTTGCTA---AATGGCAGTCTACCAGAAGGAG---GGGTAATAATTAGATCTGAAAATATCACAAACAATGCCAAAACAATAATAGTACAGCTTGATGAGCCTGTAAGAATCAATTGTACCAGACCCA------ACAACAACACAAGAAAAGGTATAC----------ATATAGGACCGGGAGGAGCATTCTACGCAACA---------------GGAGAAGTAGTAGGAAACATAAGACATGCATGGTGTGAAGTTAATGGAACAGC-CTGGAAAGAAGCTTTAAAGAAAGTAGTTACAAAATTAAAAGAACACT------------TCAAA------------AATAAAACAATAG---------C---CTTTCAGCCA-----------TCATCAGGAGGGGACCTAGAAATTACAACACACAGT-TTT +A.CM.1997.97CM_MP582 
GGACCATGCACGAATGTCAGCACAGTACAATGCACACATGGAATCAGGCCAGTAATATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGGAG---AGGTACGAATTAGATCTGAAAATATCACAAACAATGCCAAAACCATAATAGTACAACTTGATGAGGGTGTACATATTAATTGTACCAGACCTA------ACAACAATACAAGAAAAAGTGTAC----------ATATAGCACCAGGACAGGCATTCTATGCAACA---------------GGGGCCATAATAGGGGACATAAGAAGAGCATATTGTCAGATCAATGGAACAAG-ATGGAATGAAACTTTGCACAAGATAGTCAAACAATTAAGAAAACACT------------TTAAC------------AACAGAACAATAA---------T---CTTTAATAGT-----------TCCTCAGGAGGGGATTTAGAAATTACAACACATAGT-TTC +A.CM.1997.97CM_MP812 GGGCCATGTAAGAATGTCAGTACAGTACAATGCACACATGGAATCATGCCAGTAGTATCAACTCAACTACTGTTA---AATGGCAGCCTAGCAGAAGGAA---AGACAATGATTAGATCTAAAAATTTCACAAACAATGCCAAAATCATAATAGTACAGTTTAACCAGTCTGTAGAAATTAATTGTACTAGACCTT------ATAAGAATATAAGAAGGAGGATAC----------ATATAGGACCAGGACGAGCATTCTATGCAACA---------------AGTGGCATAGAAGGGGGTATAAGGCAAGCATATTGTAATGTCAATGCAACAGC-ATGGAATAAAACTTTACACATGGTAGCTGAACAATTAAAAACACAGT------------TTAAT-A--------------AAACAATAA---------T---CTTTGATAAC-----------TCTTCAGGAGGGGATATAGAAATTACAACACATAGT-TTT +A.CM.1999.99CM_MP1370 GGGCCATGCAAGAATGTCAGTACAGTACAATGTACACATGGAATCAAACCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGGAG---AGGTARTCATTAGATCTGAAAATATTACAAACAATGYCAAAAACATCATAGTACAGTTTGTCAAGCCTGTAAAAATTAATTGTACAAGACCTA------ATAATAATACAAGAACAGGTATAC----------ATATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGAAATAATAGGGGATATAAGACAAGCACATTGTARTATCAGTAAAAGAGA-ATGGTATGAAGCTTTAAACCAGACACAAACACAATTTAAGAAACACT------------TTAAC-A--------------GAACAATAA---------A---ATTTAATAAC-----------TCCTCAGGAGGGGACTTAGAAATTACAACACATAGT-TTT +A.CM.1999.99CM_MP1433 
GGGCTATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTGGTATCAACTCAATTGCTGTTA---AATGGTAGTCTAGCAGAAGGAG---AAATAAAGGTTAGATCTGAGAATTTCACAAACAATGCTAAAACTATAATAGTACAGCTTGACCAGCCTGTAATAATTAATTGTACCAGACCTA------ACAACAATACAAGAAGAAGTGTAC----------GTATAGGACCAGGACAAGCATACTATGCAACA---------------GGAGAAATAATAGGGGATATAAGAAAAGCATATTGTACTGTCAATAAAACAGC-ATGGAATGAAACAAAACACAAGGTAATGGAAAAATTAAGAGGAATAT------------ACCAC-A--------------GACCAATAA---------A---GTTTAATAGC-----------TCCTCAGGAGGGGATTTAGAAATTACAACACATATG-TTT +A.CM.1998.98CM_MP1014 GGGCCATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTG---AATGGCAGTCTAGCAGAAGGAA---ATGTAAYGATTAGATCTGAGAATCTCACAAAYAATGCCAAAACCATAATAGTACAGCTTGCTGATCCTGTAAACATTACTTGTATCAGACCCA------ACAACAATACAAGAAAAAGTGTAC----------ATATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGACATAATAGGGGATATAAGACAAGCACATTGTAATGTCAGWAGARAAAA-WTGGAATAACACTTTACACAAGGTAGCTGAACAGTTAAGACAGTACT------------TTAWC------------AACAAAACAATAG---------C---ATTCAATAAA-----------TCATCAGGAGGGGATTTAGAAATTACAACACATAGC-TTT +A.CM.1997.97CM_MP640 GGGCAATGCAATGATGTCAGCTCAGTACAATGCACACATGGAATCAGACCAGTAGTATCAACYCAATTGCTGTTA---AATGGCAGTCTAGCAAAAGGAA---AGGTAAAGATTAGAWCTGAAAGTCTCACAAACAATGCCAAAACCATAATAGTACAATTTAATGAGACTGTAAGAATTAATTGTACCAGACCTA------ACAACAATACAAGAACAAGTGTAC----------ATATGGGACCAGGAAAAGCATTCTATGCAACA---------------GGTGACATAATAGGGGACATAAGAAAAGCACACTGTAATGTCAGTAGAGCAAA-ATGGAAGGACACTTTACAAGAGGTAGCCAGACAATTAAGTGAACACT------------TTAAT------------AAAACAGAAATAA---------C---CTTTAAGAAC-----------TCCTCAGGAGGGGATTTAGAAATTACAACACATAGT-TTT +A.KE.2000.00KE_KER2018 
GGGTTATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCGGTAGTGTCAACTCAACTACTGTTA---AATGGCAGTCTAGCAGAAAAAG---AGATAAGAATTAAATCTGAAAACATCTCAGACAATGCTAAAACCATAATTGTACAACTTACCAAGCCTGTACTAATCAATTGTGCCAGACCTA------GCAACAATACAAGAAAAAGTGTAC----------ATATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGCTATAACAGGAGATATAAGACAAGCATATTGTGTAGTCAATAGAACACA-ATGGAATGACACTTTAGGACAGGTAGCGATACAATTAAGAAAACACT------------GGAAC-A--------------CAACAATAA---------T---CTTTAATGAA-----------CCCTCAGGAGGGGATTTGGAAATTACAACACATAGT-TTT +A.KE.-.MS208w6BMCB1_gp160 GGGCCATGCGAGAATGTCAGTACAGTACAATGTACACATGGAATCAAGCCAGTAGTGTCAACTCAATTGCTATTA---AATGGCAGTCTAGCAGAAGAAC---ATGTAATGATTAGATCTGAAAATATCACAAACAATGCCAAAAATATACTAGTACAACTTAACGAGTCTGTAGAAATTTATTGTATTAGACCTA------GTAACAATACAAGAAAAAGTGTAC----------GTATAGGACCAGGACAAACATTCTATGCAACA---------------GGTGAAATAATGGGGGATATTAGACAAGCACATTGTAATGTCAGTGGGTCAAA-ATGGAATAAAACTTTACAACAGGTAGCCAACCAATTAAGAAAACACT------------TTAAC------------ACCACAACAATAA---------T---CTTTGCTAAC-----------CCCTCAGGAGGGGATCTAGAAATTACAACACATAGT-TTT +A.KE.-.MJ613_W1M_ENV_A2 GGGCTATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGGAA---AGGTAATGATCAGATCTGAAAATATCACAAACAATGCCAAAAACATAATAGTACAATTTACCAGTCCTGTAGAAATTAAGTGTATCAGACCTA------ACAACAATACAAGAAAAAGTGTAC----------ATATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGACATAATAGGGAATATAAGACAAGCACATTGTAATGTCAGTAGAACAGA-ATGGAATAAAACTTTGCAACAAGTAGCCACCCAATTAAGAACATACT------------TTGGA------------AACAAGACAATAA---------T---CTTTGCTAAC-----------TCCACAGGAGGGGATCTAGAAATAACAACACATAGT-TTT +A.KE.-.BJ613_W6M_ENV_A1 
GGACTATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAACCAGTAGTATCAACTCAACTACTGTTA---AATGGCAGTCTAGCAGAAGGAA---AGGTAATGATCAGATCTGAAAATCTCACAAACAATGCCAAAAACATAATAGTACAATTTAACGAGTCTGTAAAAATTGATTGTATCAGGCCTA------ACAACAATACAAGAAAAAGTATAC----------ATATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGCCATATTAGGGGATATAAGACAAGCACATTGTAAAGTTAATAAAACAGA-ATGGAATAAAGCTTTGCAAAAGGTAGTCACCCAATTAGGAACATACT------------TTAAG------------GACAAAGCAATAA---------C---CTTTGATAAC-----------TCCACAGGAGGGGATCTAGAAATAACAACACATAGT-TTT +A.KE.-.MG505_W0M_ENV_A2 GGGCCATGCCCAAAGGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGAAG---AGGTAATGATTAGATCTGAAAATATCACAAACAATGCCAAAAACATACTAGTACAATTTAACACGCCTGTGCAAATTAATTGTACCAGACCTA------ACAACAATACAAGGAAAAGTATAC----------GTATAGGACCAGGACAAGCATTCTATGCAACA---------------GGGGACATAATAGGGGATATAAGACAAGCACATTGTACTGTCAGTAAAGCAAC-ATGGAATGAAACTTTGGAAAAGGTAGTCAAACAATTAAGAAAACACT------------TTGGGA-AC---A-----ACAAAACAATAA---------T---ATTTGCTAAC-----------TCCTCAGGAGGGGATCTAGAAGTCACAACACATAGT-TTT +A.KE.-.BG505_W6M_ENV_A5 AGGCCATGCCCAAGTGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGAAG---AGGTAATGATTAGATCTGAAAATATCACAAACAATGCCAAAAACATACTAGTACAATTTAACACGCCTGTGCAAATTAATTGTACCAGACCTA------ACAACAATACAAGGAAAAGTATAC----------GTATAGGACCAGGACAAGCATTCTATGCAACA---------------GGGGACATAATAGGGGATATAAGACAAGCACATTGTACTGTCAGTAAAGCAAC-ATGGAATGAAACTTTGGGAAAGGTGGTCAAACAATTAAGAAAACACT------------TTAGGA-AC---A-----ACACAATAATAA---------G---ATTTGCTAAT-----------TCCTCAGGAGGGGATCTAGAAGTCACAACACATAGT-TTT +A.KE.-.MI206_W0M_ENV_A1 
GGGCCATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGTTGTTA---AATGGCAGTCTAGCAGAAA------AGGTAAAAATTAGATCTGAAAATACCACAAACAATGCCAAAATTATAATAGTACAACTTGACGAGCCTGTGACAATTAATTGTACCAGACCCA------ACAACAATACAAGAACAAGTGTAC----------GTATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGCCATAATAGGGAACATAAGACAAGCACATTGTAATGTCAGTAGATCAGC-ATGGAATCAAACTTTACAAAAGGTAGTTACACAATTAAGAACATACT------------TTGGG------------AACAAAACAATAA---------T---ATTTGCTAAC-----------TCCTCAGGAGGGGATCTAGAAGTCACAACACATAGT-TTT +A.KE.-.BI206_W6P_ENV_A1 GGGCCATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAACGA---AGGTAAAAATTAGATCTGAAAATATCACAAACAATGCCAAAATTATAATAGTACAACTTGACAATCCTGTGACAATTAATTGTACCAGACCCA------ACAACAATACAAGAACAAGTGTAC----------GTATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGCCATAATAGGGAACATAAGACAAGCACATTGTAATGTCAGTAGATCAGC-ATGGAATCAAACTTTACAAAAGGTAGTTACACAATTAAGAACATACT------------TTGGG------------AACAAAACAATAA---------T---ATTTGCTAAC-----------TCCTCAGGAGGGGATCTAGAAATCACAACACATAGT-TTT +A.KE.-.ML274_W0M_ENV_A1 GGACCATGCAAAGATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGAAA---AGGTAATAATTAGATCTGAAAATATCACAAACAATGCCAAAAACATAATGGTGCAATTTAACGAGTCTGTAACAATTAATTGCACCAGACCTC------ACAACAATACAAGAACAAGTGTAC----------GTATAGGACCAGGACAAGCGTTCTATGCAACA---------------GGTGCCATAACAGGGGATATAAGACAAGCACATTGTAATGTCAGTAAAACAGA-ATGGGATAAAACTTTGCAAAAGGTAGCCACACAATTGGAAACATACT------------TTAAG------------AACAAAACAATAA---------C---ATTTGCTAAA-----------CCCACAGGAGGGGATCTAGAAATAACAACACATAGT-TTT +A.KE.-.BL274_W6M_ENV_A3 
GGACCATGCAAAAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTGGCAGAAAAAA---AGGTAATGATTAGATCTGCAAATATCACAAACAATGCCAAAAACATAATAGTACAACTGACCGAGTCTGTAAAAATTAATTGCACCAGACCTC------ATAATAATACAAGAACAAGTGTAC----------GTATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGCTATAACAGGGGATATAAGACAAGCACATTGTAATGTCAGTAGACCAGC-ATGGAATAAAGCTTTGCAAGAGGTAGCCACACAATTGAAAACATACT------------TTAAC-A--------------AAACAATAA---------T---ATTTGATAAA-----------CCCACAGGAGGGGATCTAGAAATAACAACACATAGT-TTT +A.CM.2000.NYU1423 GGGGAATGCAAGAATGTCAGTACAGTACAATGCACACATGGAATCAGGCCAGTAGTATCAACTCAACTGCTATTA---AATGGCAGCCTAGCAAAGGAGC---AGGTAATGATTAGATCTAAAAATTTCACGAACAATGCCAAAACCATAATAGTACAACTTAACCAGTCCATATCAATTAATTGTACCAGACCTA------ACAACAATACAAGAAAAAGTATAC----------ACATAGGACCTGGACAAGCGTTCTATGCAACA---------------GGTGACATAATAGGAAATATAAGACAAGCACATTGTAATGTGTCTAGAGCTGA-ATGGAATGAAGCTTTAAAGCGGGTATCTGCAAAATTAAGGGAACAGT------------TTAAG------------AACAAAACAATAG---------T---TTTTAATTCA-----------TCTACAGGAGGTGATCCAGAAATTACAACACATAGT-TTT +A1.BE.-.VI2809 GGACCATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAAAAG---AGATAAGGATTAGATTTGAAAATATCACAGACAATGCCAAAGCTATAATAGTACAATTTAATCAGTCGGTAGAAATTAATTGTACCAGGCCTA------ACAACAATACAAGAACAAGTGTAC----------ATATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGACATAATAGGGGACATAAGAAATGCACATTGTAATGTCAGTAGAAAGAA-TTGGACTGAGGCTTTATACAGGGTAGCCACACAATTAAGAAAACACT------------TTAAC-A--------------GAACAATAA---------A---CTTTACTAGT-----------GCCTCAGGGGGGGATTTAGAACTTACAACACATACT-TTT +A1.UG.1997.pt185 
GGGCCATGCAAGAATGTTAGCACAGTACAATGCACACATGGAATCAGGCCAGTAGTAACAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGGAA---AGGTAATAATTAGAGCTGAAAATATCACAAACAATGCCAAAAACATAATAGTACAATTAAGCGAGCCTGTARATATTACTTGTACCAGACCTG------GCAACAATACAAGGAGAGGTATAC----------GTATAGGACCAGGACGAAAATTCTATGCAGCA---------------GATAAAATAATAGGGGATATAAGACAAGCATATTGTACTGTCAGTAGAGCAAA-ATGGAACGAAACTTTGCAAAAGGTAGCCAACCAATTAAAAACACACT------------TCAACA-CA---N-----NNNNNACAATAG---------T---CTTTGATAGC-----------CCCATAGGAGGGGATCTAGAAATAACAACACATACT-TTT +A1.UG.1998.120MPC12 GGGCTATGCAAGAATGTTAGCACAGTACAATGCACACATGGAATCAGGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGGAG---AAGTAATGATTAGATCTGAAAATATCATAAACAATGCCAAAAACATAATAATACAATTTAACCAGTCTGTACCAATTAATTGTACCAGACCTG------GCAACAATACAAGAAAAGGTATAC----------ATATAGGACCAGGACAGGCATTCTATGCAACA---------------GGTGCCATAATAGGGGATATAAGACAAGCACATTGCACTGTCAATAGAACAGA-ATGGAATAACACTTTGCAAAAGGTAGTCAAAAAATTAAGAACACACT------------TTGGG------------AACAAAACAATAA---------T---CTTTAATAAC-----------TCCGCAGGAGGGGATCTGGAAATAACAACACATAGT-TTT +A1.UG.1998.120FIc01 GGGCTATGCAAGAATGTTAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTGTCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGGAG---AGGTAATGATTAGATCTGAAAATATCACAGACAATACCAAAAACATAATAGTACAATTTAACCAGTCTGTACAAATTATTTGTACCAGACCTA------ACAACAATACAAGAGAAAGTATAC----------ATATAGGACCAGGACAGGCATTCTATGCAACA---------------GGTGAAATAATAGGGAATATAAGACAAGCACATTGCAATGTCAGTGGAACAAG-ATGGAATAAAACTTTGCAAGAGGTAGCCAACAAATTAAGAATACACT------------TTGGG------------AACACAACAATAA---------A---CTTTACTAGC-----------CCCGCAGGAGGGGATCCAGAAATAACAACACATAGT-TTC +A1.UG.1999.601MPC7 
GGGCTATGCCAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGCTA---AATGGCAGTCTAGCAGAAGGGA---CGGTAAAAATTAGATCTGAAAACATCACAAACAATGTCAAAACCATAATAGTACACCTTAACGAGTCTGTGGAAATTAATTGTACCAGACCTA------ACAACAATACAAGAACAAGTGTAC----------GTATAGGACCAGGACAAGCATTCTATGCAACA---------------G---ACATAATAGGAGATATAAGGAAGGCATATTGTAAAGTCAATCGCTCAGA-ATGGCATAAAGCTTTACAAAAGGTAGTCGAGCAATTAAGAAAACACT------------TTAAG------------AACAAAACAATAA---------T---CTTTGCTAGC-----------TCCTCAGGAGGGGATCTAGAAATCACAACACATAGT-TTT +A1.UG.1999.601FIC3 GGGCTATGCCAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGCTA---AATGGCAGTCTAGCAGAAGGGG---AGGTAAAAATTAGATCTGAAAACATCACAAACAATGTCAAAACCATAATAGTACACCTTAACGAGTCTGTGAAAATTAATTGTACCAGACCTA------ACAACAATACAAGAACAAGTGTAC----------GTATAGGACCAGGACAAGCATTCTATGCAACA---------------G---ACATAATAGGAAATATAAGGAAGGCATTTTGTACAGTCAATAGCTCAGA-ATGGCATAGAGCTTTACAAAAGGTAGTCGAGCAATTAAGAAAACACT------------TTAAG------------AACAAAACAATAA---------T---CTTTGCTAGC-----------TCCTCAGGAGGGGATCTAGAAATCACAACACATAGT-TTT +A1.UG.1997.368MPc02 GGGCTATGCCAGAATGTCAGCACAGTACAATGCACACATGGAATCAGGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGAAA---ACATAACAATTAAATCTGAAAATATCACAAATAATGCTAAAACCATAATAGTGCAACTTACCAAGCCTGTAATAATTAATTGTACCAGACCTA------ACAACAATACAAGAAAAGGTGTAC----------ACATAGGACCAGGACAAGCATTCTTTGCAGCA---------------GGTGACATAATAGGAAACATAAGGCAAGCATATTGTACGGTCAATGCATCAGA-ATGGAATAACACTTTACAACAGGTAGCTGAACAATTAAGGAAACATT------------GGAAC-A--------------CAACAATAA---------T---CTTCACTAAC-----------TCCTCAGGAGGGGATGTAGAAATTACAACACATAGT-TTT +A1.UG.1997.368FIc01 
GGGCTATGCCAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGAAA---ACATAACAATTAAATCTGAAAATATCACAAATAATGCTAAAACCATAATAGTGCAACTTACCAAGCCTGTAATAATTAATTGTACCAGACCTA------ACAACAATACAAGAAAAGGTGTAC----------ACATAGGACCAGGACAAGCATTCTTTGCAACA---------------GGTGACATAATAGGAGACATAAGACAAGCATATTGTACGGTCAATGCATCAGA-ATGGAATAACACTTTACAACAGGTAGCTGAACAATTAAGGAAACATT------------GGAAC-A--------------CAACAATAA---------T---CTTCACTAAC-----------TCCTCAGGAGGGGATGTAGAAATTACAACACATAGT-TTT +A1.KE.1998.QA413_1007M_ENV_A4 GGGCCATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATTAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGGAA---AGGTAATAATTAGATCTGAAAATATCACAGACAATACCAAAAATATAATAGTACAATTTGCTGATTCTGTGAACATTACTTGTATCAGACCCA------ATAACAATACAAGAAAAAGTATAC----------ATATAGGACCAGGACAGGCATTCTATGCAACA---------------GGTGACATAATAGGGGATGTAAGACAAGCACATTGTAATGTCAGTAGATCAAA-ATGGAATAAAACTTTAGAACAGGTAGTTGAGCAATTAAGCCAATACT------------TTAAG------------AACAAAACAATAA---------G---ATTTGCTAAC-----------CCCTCAGGAGGGGATCCAGAAATCACAACACATAGT-TTT +A1.KE.1998.QB726_1165M_ENV_A3 GGGTCATGCAAGAATGTCAGTACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGGAA---AGGTAATGATTAGATCTGAAAATATCACAAACAATGCCAAAACCATAATAGTACAATTTAACAAGTCTGTAGAAATTAACTGTACCAGACCCA------CCAACAATACAAGAAAAAGTGTAC----------ATATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGCCATAATAGGGGATATAAGGAAAGCACATTGTAATGTCAGTAGAACAAA-ATGGAATCAAACTTTGCAAGAGGTAGTCAACCAATTAAGAACACACT------------TTGGG------------ACCGAAACAATAG---------T---CTTTGATAAA-----------CCCATAGGAGGGGATCTAGAAATAACAACACATAGT-TTT +A1.BE.-.PIC771_12 
GGACCATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAACAAG---ACATAAGGATTAGATCTGAAAATATCTCACAAAATCACAAAACCATAATAGTACAATTTAATCAGTCGGTAGCAATTAATTGTACCAGACTTA------GAAACAATACAAGAAGAAGTATAC----------ATATAGGACCAGGACAAGCATTCTATGCAGCA---------------GGAGCAATAATAGGGGACATAAGAAGTGCACATTGTAATATCAGTAGCAAAGC-TTGGAATAAGACTTTACACAGGGTAGCCACACAGTTAAGAAAACACT------------TTAAC-G--------------GAACAATAA---------T---CTTTACTAAT-----------GCCTCAGGGGGGGATTTAGAAATTACAACACATAGT-TTT +A1.BE.-.PIC13072_1 GGACCATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAACAAG---ACATAAGGATTAGATCTGAAAATATCTCACAAAATCACAAAACCATAATAGTACAATTTAATGAGTCGGTAGCAATTAATTGTACCAGAGTTA------GAAACAATACAAGAAGAAGTATTC----------ATATAGGACCAGGACAAGCATTCTATGCAACA---------------GGAGAAATAATAGGGGACATAAGGAGTGCACATTGTAATATCAGTAGCAAAGC-TTGGAATAAGACTTTATACAGGGTAGCCACACAATTAAGAAAACACT------------TTACC------------GGAACAACAATAA---------T---CTTTACTAAT-----------GCCTCAGGGGGGGATTTAGAAATTACAACACATAGT-TTT +A1.BE.-.VI2992_2 GGACCATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCGGTAGTATCAACTCAACTACTGTTA---AATGGCAGTCTAGCAGAACAAG---ACATAAGGATTAGAGCTGAAAATATCTCAGCAAATCACAAAATCATAATAGTACAATTTAATCAGTCGGTAGAAATTAATTGTTCCAGACCTA------ACAACAATACAAGAACAAGTGTAC----------ATATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGACATAATAGGGGACATAAGAGCTGCACATTGTAATGTCAGTAAACATAA-TTGGACTGAGGCTTTATACAGGGTAGCCACACAATTAAGAAAACACT------------TTGAA------------AACAAAACAATAA---------A---CTTTACTAGT-----------GCCTCAGGAGGGGATTTAGAACTTACAACACATACT-TTT +A1.BE.-.VI3050_25 
GGACCATGCAACAATGTTAGCACAGTACAATGCACACATGGAATCAAGCCGGTAGTATCAACTCAACTACTGTTA---AATGGCAGTCTAGCAGAAAAAG---AGATAAGGGTTAGATTTGAAAATATCTCAGACAATGCCAAAGCTATAATAGTACAATTTAATCATTCGGTAGAAATTAATTGTTCCAGACCTA------ACAACAATACAAGAACAAGTGTAC----------ATATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGACATAATAGGGGACATAAGAGCTGCACATTGTAATGTCAGTAAACATAA-TTGGACTGAGACTTTATACAGGGTAGCCACACAATTAAGAAAACACT------------TTGAA------------AACAAAACAATAA---------A---CTTTACTAGT-----------GCCTCAGGGGGAGATTTAGAACTTACAACACATACT-TTT +A1.BE.-.VI3196_10 GGACCATGCAACAATGTTAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTACTGTTA---AATGGCAGTCTAGCAGAAAAAG---AGATAAGGGTTAGATTTGAAAATATCTCAGACAATGCCAAAGCTATAATAGTACAATTTAATTATTCGGTAGAAATTAATTGTTCCAGACCTA------ACAACAATACAAGAACAAGTGTAC----------ATATAGGACCAGGACAAGCATTCTATGCAACA---------------GGGGACATAATAGGGGACATAAGAGCTGCACATTGTAATGTCAGTAAACATAA-TTGGACTGAGACTTTATACAGGGTAGCCACACAATTAAGAAAACACT------------TTGGA------------AACAAAACAATAA---------A---ATTTACTAAT-----------GCCTCAGGGGGAGATTTAGAACTTACAACACATACT-TTT +A1.BE.-.VI1383_20 GGACTATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAGGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAAAAG---AGGTAAGGATTAGATCTGAAAATATCACAAACAATGCCAAAACCATAATAGTACAACTTGATCAGCCTGTAAATATTACTTGTATGAGACCTA------ACAACAATACAAGAAAAAGTGTAC----------ATATAGGACCAGGACAAGCATTTTATGCAACA---------------GGTGACATAATAGGGAACATAAGAGAAGCACATTGTAATGTCAGTAGAGAAGA-TTGGAACAATACTTTACAAAAGGTAGCCACACAATTAAGAAAACACT------------TTGGG------------AACAAAACAATAA---------T---CTTTGCTAAG-----------TCCTCAGGGGGGGATTTAGAAATTACAACACATAGT-TTT +A.RW.-.PVPI 
GGGCCATGCAAAAATGTCAGCTCAGTACAATGCACACATGGAATCAGGCCAGTAATATCAACTCAACTGTTGTTA---AATGGCAGTCTAGCAGAAGAAAAAGATGTACAGATTAGATCTGAAAATATCACAAACAATGCCAAAACCATAATAGTACAATTTACCAAGGCTGTAAAAATTAATTGTACCAGACCTA------ACAAAACTACAAGAAAAGGTGTGC----------GTATAGGACCAGGACAAGCATGGTATGCAAGA---------------GGTAACATGATAGGAGATATAAGAAAAGCATATTGTAATGTCAGTAGAACAGA-ATGGAATCAAACATTACAAAAGGTAGCCACACAATTAGGAAAACACT------------TTAAC-A--------------AAACAATAA---------C---CTTTACTAAA-----------TCCTCAGGAGGGGATTTAGAAATTACAACACATAGT-TTT +A.KE.1990.K89 GGGCCATGCACGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGGAG---AGGTAATGATTAGATCTGAAAATATCACAAACAATGCCAAAAATATAATAGTACAATTTGCCGAGCCTGTAAAAATTAATTGTACCAGACCTA------ACAACAATACAAGAATGAGTATAC----------GTATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGACATAATAGGGAACATAAGACAAGCACATTGTAATGTCAGTAGAGCAGA-ATGGAATACAACTTTGCAAAAGGTAGTCACAAAATTAAGGGAATACT------------TTGGGA-AC---A-----ACAAAACAATAA---------A---ATTTGCTAAC-----------TCCTCAGGAGGGGATCTAGAAATCACAACACATAGT-TTT +A1.UG.1990.UG275A GGGATATGTAATAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAGCTGCTGTTA---AATGGCAGTTTAGCAGAAGGAA---AGGTAAAGATTAGATCTGAAAATATCACAAACAATGCCAAAAATATACTGGTATAACTTACCACGCCTGTAACAATTAATTGTACCAGACCTA------ACAACAATACAAGAACAAGTGTAC----------GTATAGGACCAGGACAATCATTCTATGCAACA---------------GGTGACATAATAGGGGATATAAAACAAGCACATTGTAATATCAGTAGAGCAGA-ATGGAATGAAACTTTACAAAAGGTAGTCAGCCAATTAAGAACACACT------------TTGGC------------AACAAAACAATAA---------T---CTTTGGTAAC-----------TCCTCAGGAGGGGATATAGAAATAACAACACATAGT-TTT +A1.UG.1990.UG273A 
GGGCTATGCAAGAATGTCAGCTCAGTACAATGCACACATGGAATCAGGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAACAG---AGGTAAAAATTAAATCTGAAAATATCTCAGACAATGCTAAAACCCTAATAGTACAACTTACCACGCCTGTAAAAATTAATTGTACTAGACCTG------GCAACAATACAAGAACAAGTGTAC----------GTATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGATATAATAGGAGATATAAGACAAGCACATTGTAATGTCAGTAGATCAGA-ATGGAAGGAAACTTTGCAAAAGGTAGTCAAACAATTAAGAACACACT------------GGAAC-A--------------AAACAATAA---------T---CTTTACAAAC-----------TCCTCAGGAGGGGATTTAGAAATTACAACACATAGT-TTT +A.RW.-.SF1703 GGGCCATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAGGCCAGTAATATCAACCCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGGAA---GGGTAAAGATTAGATCTGAAAATATCACAAACAATGCCAAAACCATAATAGTACAACTTAACAAGACTGTAGAAATTAATTGTACCAGACCTA------ACAACAATACAAGAAAAAGTGTAC----------GTATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGACATAATAGGGGATATAAGACAAGCATATTGTAATGTCAGTAGAGCAGA-CTGGAATAAAACTTTACAAGGGGTAGCCAACCAATTAAAAAGTTACT------------TTAGT------------AACAAAACAATAA---------T---CTTTGCTAGC-----------TCCTCAGGAGGGGATTTAGAAATTACAACACATAGT-TTT +A.GB.-.MC108 GGGCCATGCACGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTA---AATGGCAGTCTAGCAGAAGAAG---AGGTAATGATTAGATCTGAAAATATCACAAACAATGCCAAAAACATAATAGTACAATTTACCACGTCTGTAAAAATTAATTGTACCAGATTTA------ACAACAATACAAGAAGAAGTATAC----------GTATAGGACCAGGACAAGCATTCTATGCAACA---------------AATGACATAATAGGGAATATAAGACAAGCACAATGTGATGTCAATAGAACAGA-ATGGAATAAAGCTTTGCAAGGGGTAGTCAAACAATTACAAAAATACT------------TTGGG------------AACAAAACAATAA---------T---CTTTACTAAC-----------TCCTCAGGAGGGGATCTAGAAATAACAACACATAGT-TTT +A.GB.-.MA246 
GGGCTATGCAAGAATGTCAGCACAGTACAATGCACACATGGAATCAAGCCAGTAGTATCAACCCAACTGCTGTTA---AATGGCAGTCTAGCAGAAAGCA---AGGTAATGATTAGATCTGAAAATATCACAAACAATGCCAAAAACATACTAGTACAACTTACCAGTCCTGTAAACATTAGTTGTATCAGACCTA------ACAACAATACAAGAAAAAGTGTAC----------GTATAGGACCAGGACAAGCATTCTATGCAACA---------------GGTGAAATCATAGGGAATATAAGACAAGCATATTGTAATGTCAATAGATCAGA-ATGGAATGAAGCTCTGCGGGAGGTAGTCAAACAATTAAGAACATACT------------TTAAC-A--------------AAACAATAA---------T---CTTTGATAAC-----------TCCTCAGGAGGGGATCTAGAAATAACAACACATAGT-TTT diff --git a/examples/include-from-file/do_filter.sh b/examples/include-from-file/do_filter.sh new file mode 100755 index 0000000..0ca7f19 --- /dev/null +++ b/examples/include-from-file/do_filter.sh @@ -0,0 +1,3 @@ +#!/bin/bash +seqmagick convert --include-from-file selection.txt \ + ../aligned.fasta filtered.fasta diff --git a/examples/include-from-file/selection.txt b/examples/include-from-file/selection.txt new file mode 100644 index 0000000..7f81f13 --- /dev/null +++ b/examples/include-from-file/selection.txt @@ -0,0 +1,3 @@ +gi|66864576|gb|DQ027786.1| +hxb2 +gi|66864553|gb|DQ027774.1| diff --git a/examples/quality-filter/sample.barcodes.csv b/examples/quality-filter/sample.barcodes.csv new file mode 100644 index 0000000..663c0c3 --- /dev/null +++ b/examples/quality-filter/sample.barcodes.csv @@ -0,0 +1 @@ +Sample1,AAAAAA diff --git a/examples/quality-filter/sample.fna b/examples/quality-filter/sample.fna new file mode 100644 index 0000000..1e4867f --- /dev/null +++ b/examples/quality-filter/sample.fna @@ -0,0 +1,24 @@ +>s1 +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAA +>s2 +CCCCCCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAA +>s3 
+GGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +>s4 +AAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAA +>s5 +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA diff --git a/examples/quality-filter/sample.qual b/examples/quality-filter/sample.qual new file mode 100644 index 0000000..35baac2 --- /dev/null +++ b/examples/quality-filter/sample.qual @@ -0,0 +1,56 @@ +>s1 +40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 +40 40 40 40 40 40 40 40 38 38 17 17 17 18 30 35 37 40 40 40 +40 40 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 +40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 +39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 36 38 38 40 +40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 +40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 +40 40 40 40 40 40 40 40 40 40 40 38 38 38 39 40 39 37 38 38 +35 40 37 40 40 40 40 40 40 40 40 40 40 40 40 40 40 37 34 32 +32 30 30 31 27 25 19 19 14 14 22 22 24 22 22 20 20 21 25 +>s2 +22 27 31 38 38 38 40 40 40 40 40 40 40 40 40 40 40 40 40 40 +32 32 32 33 40 40 40 40 40 40 28 29 29 29 39 40 40 40 40 40 +40 40 40 39 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 +40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 +40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 +40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 +40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 +40 40 40 40 40 40 40 40 40 40 40 39 39 39 38 38 35 35 32 32 +32 36 37 37 37 37 37 37 35 36 36 37 37 38 38 39 37 39 35 32 +32 
32 29 29 29 29 21 21 13 13 22 24 27 24 23 22 22 22 25 +>s3 +40 40 40 40 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 +40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 +40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 36 36 36 39 +40 40 40 40 34 34 34 40 23 23 23 23 24 33 37 37 33 33 33 33 +30 30 30 30 30 33 30 22 22 22 26 26 20 20 29 30 35 33 37 34 +34 34 34 +>s4 +34 36 36 40 37 37 37 36 32 27 31 32 37 37 37 37 32 36 32 35 +36 39 38 33 36 36 36 34 32 32 16 16 16 14 18 24 24 35 35 31 +31 29 28 28 30 33 32 32 32 40 40 32 32 32 36 38 40 36 36 32 +32 38 38 40 38 40 40 40 37 40 38 36 34 36 36 37 38 38 40 40 +37 37 37 37 40 40 38 38 36 35 36 36 40 37 37 37 37 37 37 38 +38 38 38 38 38 38 40 38 38 38 38 38 40 40 39 39 39 39 40 40 +40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 39 +39 39 40 40 40 40 40 40 37 37 34 32 31 35 37 27 25 17 17 16 +20 27 34 37 38 38 34 34 34 37 38 38 40 40 39 39 38 38 37 29 +29 28 29 29 29 27 16 16 13 20 21 27 27 27 24 24 24 25 +>s5 +40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 +40 40 40 40 40 40 40 36 30 30 18 18 18 33 35 33 38 38 38 34 +33 26 29 29 38 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 +40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 +40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 +40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 +40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 +40 40 40 29 28 28 31 31 31 39 40 40 40 40 40 40 40 40 40 40 +40 39 39 39 39 35 35 35 35 40 38 38 40 30 31 30 28 37 33 23 +35 35 37 37 37 32 28 28 28 28 35 25 40 38 38 40 39 39 40 40 +39 35 35 35 35 40 36 36 33 33 38 37 24 38 38 37 37 34 34 32 +35 30 31 16 14 14 20 16 16 19 19 20 18 16 13 13 13 13 17 20 +30 30 20 20 16 16 17 16 14 14 14 16 12 12 12 16 19 23 26 27 +27 24 24 27 29 29 27 27 24 26 30 24 23 23 23 18 17 14 14 14 +13 13 13 17 16 16 16 17 14 16 20 20 diff --git a/examples/range.fasta b/examples/range.fasta new file mode 100644 index 0000000..49115a1 --- /dev/null +++ 
b/examples/range.fasta @@ -0,0 +1,7 @@ +>456442|refseq_protein.39.micro/1-331 +---MINVGVLGATGAVGQRFVELLSDHP-------------------------------- +----------------------------------------------------------- +>368407|refseq_protein.39.micro/1-331 +---MINVGVLGATGAVGQRFVQLLADHP-------------------------------- +----------------------------------------------------------- + diff --git a/examples/test.fasta b/examples/test.fasta new file mode 100644 index 0000000..83a8413 --- /dev/null +++ b/examples/test.fasta @@ -0,0 +1,417 @@ +>gi|66864576|gb|DQ027786.1| HIV-1 isolate QA013_2282M_C6 from Kenya envelope glycoprotein (env) gene, partial cds +ACTCTGTGTCACTTTAAACTGCATTGAATGGAAGAGTAATAGTAGCAATAACGGCACTGA +TCAGGACATGAATGAGAAACTGCTCTTTCAATATAACCACAGAAGTAAGAGATAAGAAGA +AGCAAGTACAGGCACTTTTATAAGCTTGATGTAGTACAGATAGATAATAGTAATACTAGC +TATAGATTAATAAATTGTAATACCTGCCATTACACAGGCATGTCCAAAGGTAACCTTTGA +GCCAATTCCCATACATTATTGTGCCCCAGCTGGTTGCAATTCTAAAATGTAACAATAAGA +AATTCAATGGGACGGGTCCATGCACAAACGTCAGCACAGTAGTGTACACATGGGATTAGG +CCAGTAGTGTCAACTCAGCTGTTGTTGAATGGCAGTCTAGCAGAAGAAGATAATAATTAG +ATCTGAAAATTTAACAAATAATGCAAAAATTATAATAGTACAGCTTAATAAGTCTGTCAA +TTCATTGCACAAGACCCTACAACAATACAAGAAGAGGTGAACATATGGGACCAGGGCGAG +CACTCTACAGAAAAAATAGTTGGAAATATAAGACAAGCATATTGTAACATTAGTGGAGGG +GAATGGAATAAAGTTACAGCAGGTAGCTGGCAAATTAAGAAACCTTCTTAATAAAACAAC +AATAATTTTTAAACCACCCGCGAGGGGACCTAGAAATTACAACACACAGCTTTAATTGTA +GAGGGGAATTTTTCTATTGTAATACATCACCTGTTTAACAGCACTTGGGACAACAATACC +CAGGAATCAAATGACACTATAATGATCCCATGCAGAAAAACAAATTATAAACATGTGGCA +GGGAGTAGGAAAAGCAATGTATGCCCCTCCCATTGAAGGACACATGTTGTTCATCAAATA +TTACAGGACTATTGTTGACAAGAGATGGTGGTGTAGCTAATGGGAGTTCGAATGACCTTC +AGACCTGGGGGAGGAGAC +>gi|66864574|gb|DQ027785.1| HIV-1 isolate QA013_1790M_B6 from Kenya envelope glycoprotein (env) gene, partial cds +ACTCTGTGTCACGTTAAACTGCATTGAATGGATAAATAATAATAGCACTAACGCCACTGG +TCAGGACATGAATGAAAAACTGCTCTTTCAATATAGCCACAGAAGTAAGAGATAAGAAAA +AGCAAGTACAGGCACTTTTATAAACTTGATGTAGTACAGATAGATAATAGTAATACTAGC 
+TATAGATTAATAAATTGTAATACCTGCCATTACACAGGCATGTCCAAAGGTAACCTTTGA +GCCAATTCCCATACATTATTGTGCCCCAGCTGGTTGCAATTCTAAAATGTAATAATAAGA +AATTCAATGGGACGGGTCCATGCAAAAACGTCAGCACAGTAGTGTACACATGGGATTAGG +CCAGTAGTGTCAACTCAGCTGTTGTTGAATGGCAGTCTAGCAGAAGAAGATAATAATTAG +ATCTGAAAATTTAACAAATAATGCAAAAATTATAATAGTACAGCTTAATGAGTCTGTCAA +TTAGTTGCACAAGGCCCTACAACAATACAAGAAAAGGTGAACATATGGGACCAGGGCGAG +CACTCTACAGAAAAAATAGTTGGAGATATAAGACAAGCACATTGTAACATTAGTGGAAGG +GAATGGAATAACGTTACAGCAGGTAGCTGACAAATTAAGAAACCTTCTTAATAAAACAAC +AATAATTTTTAAACCACCTGCGAGGAGACCTAGAAATTACAACACACAGCTTTAATTGTC +TAGGGGAATTTTTCTACTGCAACACATCAACTGTTTAATAATAGTAAATGGGAATCAAAT +AGTAGTACAGGGGAAAATAAAAATGAAGATATAATCACTCCCATGCAGAATAAAACAAAT +TATAAACATGTGGCAGGGAGTAGGAAAAGCAATGTATGCCCCTCCTTGAAGGACACATCA +GTTGTTCGTCAATTATTACAGGACTATTGTTGACAAGAGATGGTGGTGTAACTTCGGAGT +TCGGAGACCTTCAGACCTGGGGGAGGAGAC +>gi|66864568|gb|DQ027782.1| HIV-1 isolate QA013_385M_C3 from Kenya envelope glycoprotein (env) gene, partial cds +ACTCTGTGTCACTTTAAACTGCACTGAATGGAAGAATAATGGTAGCACTAACGTCACTGA +TCAGGACATGAATGAAAAACTGCTCTTTCAATATAACCACAGAAGTAAGAGATAAGAAGA +AGCAAGTACAGGCACTTTTATAAACTTGATGTAGTACAGATAGATAATAGTACCAGCTAT +AGATTAATAAATTGTAATACCTCTGATTACACAGGCATGTCCAAAGGTAACCTTTGAGCC +AATTCCCATACATTATTGTGCCCCAGCTGGATTCAATTCTAAAATGTAATGATAAGAAGT +TCAATGGGACGGGTCCATGCAAAAACGTCAGCACAGTACAGTACACATGGGATTAGGCCA +GTAGTGTCAACTCAGCTGTTGTTGAATGGCAGTCTAGCAGAAGAAGAGAATAATTAGATC +TGAAAATCTAACAAATAATGCAAAAATTATAATAGTACAGCTTAATGAGTCTGTACCTTA +ATTGCACAAGGCCCTACAACAATACAAGAAAAGGTGAACATATGGGACCAGGGCGAGCAC +CCTTTAGAAAGAATAGTTGGAGATATAAGACAAGCATATTGTAGCATTAGTGGAATGGGA +TGGAATAAAACTTCAGCAGGTAGCTGACAAATTAAGAAACCTTCTTAATAAAACAACAAT +AATTTTTAAACCACCCGCGGGGGGACCTAGAAATTACAACACACAGCTTTAATTGTGGAG +GGGAATTTTTCTACTGCAACACATCAAGAGTTTAATAATAGTGAATGGAAATCAAATAGT +AGTACAGGGGGAAATGAAAGTATAATCATACTCCCATAGAATAAAACAAATTATAAACAT +GTGGCAGGGAGTAAGAAAAGCAATGTATGCCCCTCCCATTGAAGGACATCAATTGTTCAT +CAAATATTACAGGACTATTGTTGACAAGAGATGGTGGTGTAACTAATCAGAGTGAATGAG +ACCTTCAGACCTGGGGGAGGAGAC +>gi|66864564|gb|DQ027780.1| 
HIV-1 isolate QA013_105M_C2_1 from Kenya envelope glycoprotein (env) gene, partial cds +ACTCTGTGTCACTTTAAACTGCACTGAATGGAAGAATAATGGTAGCACTAACGTCACTGA +TCAGGACATGAATGAAAAACTGCTCTTTCAATATAACCACAGAAGTAAGAGATAAGAAGA +AGCAAGTACAGGCACTTTTATAAACTTGATGTAGTACAGATAGATAATAGTACCAGCTAT +AGATTAATAAATTGTAATACCTCTGATTACACAGGCATGTCCAAAGGTAACCTTTGAGCC +AATTCCCATACATTATTGTGCCCCAGCTGGATTCAATTCTAAAATGTAATGATAAGAAGT +TCAATGGGACGGGTCCATGCAAAAACGTCAGCACAGTACAGTACACATGGGATTAGGCCA +GTAGTGTCAACTCAGCTGTTGTTGAATGGCAGTCTAGCAGAAGAAGAGAATAATTAGATC +TGAAAATCTAACAAATAATGCAAAAATTATAATAGTACAGCTTAATGAGTCTGTACCTTA +ATTGCACAAGGCCCTACAACAATACAAGAAAAGGTGAACATATGGGACCAGGGCGAGCAC +TCTTTAGAAAGAATAGTTGGAGATATAAGACAAGCATATTGTAGCATTAGTGGAATGGGA +TGGAATAAAACTTCAGCAGGTAGCTGACAAATTAAGAAACCTTCTTAATAAAACAACAAT +AATTTTTAAACCACCCGCGGGGGGACCTAGAAATTACAACACACAGCTTTAATTGTGGAG +GGGAATTTTTCTACTGCAACACATCAAGAGTTTAATAATAGTGAATGGAAATCAAATAGT +AGTACAGGGGGAAATGAAAGTATAATCATACTCCCATAGAATAAAACAAATTATAAACAT +GTGGCAGGGAGTAGGAAAAGCAATGTATGCCCCTCCCATTGAAGGACATCAATTGTTCAT +CAAATATTACAGGACTATTGTTGACAAGAGATGGTGGTGTAACTAATCAGAGTGAAGGAG +ACCTTCAGACCTGGGGGAGGAGAC +>gi|66864562|gb|DQ027779.1| HIV-1 isolate QA013_70M_B1 from Kenya envelope glycoprotein (env) gene, partial cds +ACTCTGTGTCACTTTAAACTGCACTGAATGGAAGAATAATGGTAGCACTAACGTCACTGA +TCAGGACATGAATGAAAAACTGCTCTTTCAATATAACCACAGAAGTAAGAGATAAGAAGA +AGCAAGTACAGGCACTTTTATAAACTTGATGTAGTACAGATAGATAATAGTACCAGCTAT +AGATTAATAAATTGTAATACCTCTGATTACACAGGCATGTCCAAAGGTAACCTTTGAGCC +AATTCCCATACATTATTGTGCCCCAGCTGGATTCAATTCTAAAATGTAATGATAAGAAGT +TCAATGGGACGGGTCCATGCAAAAACGTCAGCACAGTACAGTACACATGGGATTAGGCCA +GTAGTGTCAACTCAGCTGTTGTTGAATGGCAGTCTAGCAGAAGAAGAGAATAATTAGATC +TGAAAATCTAACAAATAATGCAAAAATTATAATAGTACAGCTTAATGAGTCTGTACCTTA +ATTGCACAAGGCCCTACAACAATACAAGAAAAGGTGAACATATGGGACCAGGGCGAGCAC +TCTTTAGAAAGAATAGTTGGAGATATAAGACAAGCATATTGTAGCATTAGTGGAATGGGA +TGGAATAAAACTTCAGCAGGTAGCTGACAGATTAAGAAACCTTCTTAATAAAACAACAAT +AATTTTTAAACCACCCGCGGGGGGACCTAGAAATTACAACACACAGCTTTAATTGTGGAG 
+GGGAATTTTTCTACTGCAACACATCAAGAGTTTAATAATAGTGAATGGAAATCAAATAGT +AGTACAGGGGGAAATGAAAGTATAATCATACTCCCATAGAATAAAACAAATTATAAACAT +GTGGCAGGGAGTAGGAAAAGCAATGTATGCCCCTCCCATTGAAGGACATCAATTGTTCAT +CAAATATTACAGGACTATTGTTGACAAGAGATGGTGGTGTAACTAATCAGAGTGAATGAG +ACCTTCAGACCTGGGGGAGGAGAC +>gi|66864566|gb|DQ027781.1| HIV-1 isolate QA013_264M_A2 from Kenya envelope glycoprotein (env) gene, partial cds +ACTCTGTGTCACTTTAAACTGCACTGAATGGAAGAATAATGGTAGCACTAACGTCACTGA +TCAGGACATGAATGAAAAACTGCTCTTTCAATATAACCACAGAAGTAAGAGATAAGAAGA +AGCAAGTACAGGCACTTTTATAAACTTGATGTAGTACAGATAGATAATAGTACCAGCTAT +AGATTAATAAATTGTAATACCTCTGATTACACAGGCATGTCCAAAGGTAACCTTTGAGCC +AATTCCCATACATTATTGTGCCCCAGCTGGATTCAATTCTAAAATGTAATGATAAGAAGT +TCAATGGGACGGGTCCATGCAAAAACGTCAGCACAGTACAGTACACATGGGATTAGGCCA +GTAGTGTCAACTCAGCTGTTGTTGAATGGCAGTCTAGCAGAAGAAGAGAATAATTAGATC +TGAAAATCTAACAAATAATGCAAAAATTATAATAGTACAGCTTAATGAGTCTGTACCTTA +ATTGCACAAGGCCCTACAACAATACAAGAAAAGGTGAACATATGGGACCAGGGCGAGCAC +TCTTTAGAAAGAATAGTTGGAGATATAAGACAAGCATATTGTAGCATTAGTGGAATGGGA +TGGAATAAAACTTCAGCAGGTAGCTGACAAATTAAGAAACCTTCTTAATAAAACAACAAT +AATTTTTAAACCACCCGCGGGGGGACCTAGAAATTACAACACACAGCTTTAATTGTGGAG +GGGAATTTTTCTACTGCAACACATCAAGAGTTTAATAATAGTGAATGGAAATCAAATAGT +AGTACAGGGGGAAATGAAAGTATAATCATACTCCCATAGAATAAAACAAATTATAAACAT +GTGGCAGGGAGTAGGAAAAGCAATGTATGCCCCTCCCATTGAAGGACATCAATTGTTCAT +CAAATATTACAGGACTATTGTTGACAAGAGATGGTGGTGTAACTAATCAGAGTGAATGAG +ACCTTCAGACCTGGGGGAGGAGAC +>gi|66864572|gb|DQ027784.1| HIV-1 isolate QA013_987M_C4 from Kenya envelope glycoprotein (env) gene, partial cds +ACTCTGTGTCACTTTAAACTGCACTGAATGGACGAATAATAGTAGCACTAACGCCACTGA +TCAGGACATGAATGAGAAACTGCTCTTTCAATATAACCACAGAAGTAAGAGATAAGAAGA +AGCAAGTACAGGCACTTTTATAAACTTGATGTAGTACAGATAGATAATAGTAATACTAGC +TATAGATTAATAAATTGTAATACCTGCCATTACACAGGCATGTCCAAAGGTAACCTTTGA +GCCAATTCCCATACATTATTGTGCCCCAGCTGGTTGCAATTCTAAAATGTAATGATAAGA +AATTCAATGGGACGGGTCCATGCAAAAACGTCAGCACAGTAGTGTACACATGGGATTAGG +CCAGTAGTGTCAACTCAGCTGTTGTTGAATGGCAGTCTAGCAGAAGAAGATAATAATTAG 
+ATCTGAAAATCTAGCAAATAATGCAAAAATTATAATAGTACAGCTTAATGAGTCTGTCAA +TTCATTGCACAAGACCCTACAACAATACAAGAAAAGGTGAACATATGGGACCAGGGCGAG +CACTGTACAGAAAAAATAGTTGGAGATATAAGACAAGCATATTGTAGCATTAGTGGACTG +GGATGGAATAAAACTACAGCAGGTAGCTGACAAATTAAGAAACCTTCTCAATACAACAAC +AATAATTTTTAAACCACCCGCGAGGGGACCTAGAAATTACAACACACAGCTTTAATTGTG +GAGGGGAATTTTTCTACTGCAACACATCAACTGTTTAATAATAGTGAATGGAAATCAAAT +AGTAGTACAGGGGGAAATGAAAGTATAATCAAACTCCTGCAGAATAAAACAAATTATAAA +CATGTGGCAGGGAGTAGGAAAAGCAATGTATGCCCCTCCCATTGAGACACATCAGTTGTT +CATCAAATATTACAGGACTATTGTTGACAAGAGATGGTGGTGTAGCTAATCAGTTCGAAT +GAGACCTTCAGACCTGGGGGAGGAGAC +>gi|66864570|gb|DQ027783.1| HIV-1 isolate QA013_559M_C11 from Kenya envelope glycoprotein (env) gene, partial cds +ACTCTGTGTCACTTTAAACTGCACTGAATGGAAGAATAATAGTAGCACTAACGGCACTGA +TCAGGACATGAATGAGAAACTGCTCTTTCAATATAACCACAGAAGTAAGAGATAAGAAGA +AGCAAGTACAGGCACTTTTATAAACTTGATGTAGTACAGATAGATAATAGTAGTACTAGC +TATAGATTAATAAATTGTAATACCTGCCATTACACAGGCATGTCCAAAGGTAACCTTTGA +GCCAATTCCTATACATTATTGTGCCCCAGCTGGTTGCAATTCTAAAATGTAATGATAAGA +AATTCAATGGGACGGGTCCATGCAAAAACGTCAGCACAGTAGTGTACACATGGGATTAGG +CCAGTAGTGTCAACTCAGCTGTTGTTGAATGGCAGTCTAGCAGAAGAAGATAATAATTAG +ATCTGAAGATCTAACAAATAATGCAAAAATTATAATAGTACAGCTTAATGAGTCTGTCAA +TTAATTGCACAAGGCCCTACAACAATACAAGAAAAGGTGAACATATGGGACCAGGGCGAG +CACTCTACAGAAAAAATAGTTGGAGATATAAGACAAGCATATTGTAGCATTAGTGGAATG +GGATGGAATAAAATTACAGCAGGTAGCTGACAAATTAAGAAACCTTCTTAATAAAACAAC +AATAATTTTTAAACCACCCGCGAGGGGACCTAGAAATTACAACACACAGCTTTAATTGTA +GAGGGGAATTTTTCTACTGCAACACATCAACTGTTTAATAATAGTGAATGGAAATCAAAT +AGTAGTACAGGGGGAAATGAAAGTATAATCATACTCCTGCAGAATAAAACAAATTATAAA +CATGTGGCAGGGAGTAGGAAAAGCAATGTATGCCCCTCCCATTGAGACACATCAGTTGTT +CATCAAATATTACAGGACTATTGTTGACAAGAGATGGTGGTGTAACTAATTCGTGAGACC +TTCAGACCTGGGGGAGGAGAC +>hxb2 +TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACA +CACAAGGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGATCAGATATCCAC +TGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGAGAAGTTAGAAGAAGCCA +ACAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGAATGGATGACCCGG 
+AGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAG +AGCTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCG +CTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGAT +CCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGA +GCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCT +TGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTC +AGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACCTGAAAG +CGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCGGCTTGCTGAAGCGCGCACGG +CAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAATTTTGACTAGCGGAGGCTAGA +AGGAGAGAGATGGGTGCGAGAGCGTCAGTATTAAGCGGGGGAGAATTAGATCGATGGGAA +AAAATTCGGTTAAGGCCAGGGGGAAAGAAAAAATATAAATTAAAACATATAGTATGGGCA +AGCAGGGAGCTAGAACGATTCGCAGTTAATCCTGGCCTGTTAGAAACATCAGAAGGCTGT +AGACAAATACTGGGACAGCTACAACCATCCCTTCAGACAGGATCAGAAGAACTTAGATCA +TTATATAATACAGTAGCAACCCTCTATTGTGTGCATCAAAGGATAGAGATAAAAGACACC +AAGGAAGCTTTAGACAAGATAGAGGAAGAGCAAAACAAAAGTAAGAAAAAAGCACAGCAA +GCAGCAGCTGACACAGGACACAGCAATCAGGTCAGCCAAAATTACCCTATAGTGCAGAAC +ATCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGGGTAAAA +GTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTGATACCCATGTTTTCAGCATTATCAGAA +GGAGCCACCCCACAAGATTTAAACACCATGCTAAACACAGTGGGGGGACATCAAGCAGCC +ATGCAAATGTTAAAAGAGACCATCAATGAGGAAGCTGCAGAATGGGATAGAGTGCATCCA +GTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGACATAGCA +GGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGACAAATAATCCACCTATCCCA +GTAGGAGAAATTTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGTAAGAATGTAT +AGCCCTACCAGCATTCTGGACATAAGACAAGGACCAAAGGAACCCTTTAGAGACTATGTA +GACCGGTTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAGGAGGTAAAAAATTGGATG +ACAGAAACCTTGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTG +GGACCAGCGGCTACACTAGAAGAAATGATGACAGCATGTCAGGGAGTAGGAGGACCCGGC +CATAAGGCAAGAGTTTTGGCTGAAGCAATGAGCCAAGTAACAAATTCAGCTACCATAATG +ATGCAGAGAGGCAATTTTAGGAACCAAAGAAAGATTGTTAAGTGTTTCAATTGTGGCAAA +GAAGGGCACACAGCCAGAAATTGCAGGGCCCCTAGGAAAAAGGGCTGTTGGAAATGTGGA +AAGGAAGGACACCAAATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATC +TGGCCTTCCTACAAGGGAAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCAACAGCC 
+CCACCAGAAGAGAGCTTCAGGTCTGGGGTAGAGACAACAACTCCCCCTCAGAAGCAGGAG +CCGATAGACAAGGAACTGTATCCTTTAACTTCCCTCAGGTCACTCTTTGGCAACGACCCC +TCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATG +ATACAGTATTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAA +TTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATA +AAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGT +TGACTCAGATTGGTTGCACTTTAAATTTTCCCATTAGCCCTATTGAGACTGTACCAGTAA +AATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAA +TAAAAGCATTAGTAGAAATTTGTACAGAGATGGAAAAGGAAGGGAAAATTTCAAAAATTG +GGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAAT +GGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTC +AATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATG +TGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTA +CCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCAC +AGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTT +TTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGAT +CTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGA +GGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGG +GTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACA +GCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAATTGAATTGGGCAAGTCAGATTT +ACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAGAGGAACCAAAGCACTAACAG +AAGTAATACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGAGAGATTCTAA +AAGAACCAGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGA +AGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAA +CAGGAAAATATGCAAGAATGAGGGGTGCCCACACTAATGATGTAAAACAATTAACAGAGG +CAGTGCAAAAAATAACCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAAC +TGCCCATACAAAAGGAAACATGGGAAACATGGTGGACAGAGTATTGGCAAGCCACCTGGA +TTCCTGAGTGGGAGTTTGTTAATACCCCTCCCTTAGTGAAATTATGGTACCAGTTAGAGA +AAGAACCCATAGTAGGAGCAGAAACCTTCTATGTAGATGGGGCAGCTAACAGGGAGACTA +AATTAGGAAAAGCAGGATATGTTACTAATAGAGGAAGACAAAAAGTTGTCACCCTAACTG +ACACAACAAATCAGAAGACTGAGTTACAAGCAATTTATCTAGCTTTGCAGGATTCGGGAT +TAGAAGTAAACATAGTAACAGACTCACAATATGCATTAGGAATCATTCAAGCACAACCAG 
+ATCAAAGTGAATCAGAGTTAGTCAATCAAATAATAGAGCAGTTAATAAAAAAGGAAAAGG +TCTATCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAAT +TAGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGATG +AACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTGCCACCTG +TAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGAGAAGCCATGC +ATGGACAAGTAGACTGTAGTCCAGGAATATGGCAACTAGATTGTACACATTTAGAAGGAA +AAGTTATCCTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTTATTCCAG +CAGAAACAGGGCAGGAAACAGCATATTTTCTTTTAAAATTAGCAGGAAGATGGCCAGTAA +AAACAATACATACTGACAATGGCAGCAATTTCACCGGTGCTACGGTTAGGGCCGCCTGTT +GGTGGGCGGGAATCAAGCAGGAATTTGGAATTCCCTACAATCCCCAAAGTCAAGGAGTAG +TAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAAC +ATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGA +TTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTA +AAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCA +GAAATCCACTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAA +TACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATTAGGGATT +ATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATTAGAACA +TGGAAAAGTTTAGTAAAACACCATATGTATGTTTCAGGGAAAGCTAGGGGATGGTTTTAT +AGACATCACTATGAAAGCCCTCATCCAAGAATAAGTTCAGAAGTACACATCCCACTAGGG +GATGCTAGATTGGTAATAACAACATATTGGGGTCTGCATACAGGAGAAAGAGACTGGCAT +TTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGATATAGCACACAAGTAGACCCT +GAACTAGCAGACCAACTAATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATA +AGAAAGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAAC +AAGGTAGGATCTCTACAATACTTGGCACTAGCAGCATTAATAACACCAAAAAAGATAAAG +CCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGATGGAACAAGCCCCAGAAGACC +AAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAGAGCTTTTAGAGGAGCTTAAGA +ATGAAGCTGTTAGACATTTTCCTAGGATTTGGCTCCATGGCTTAGGGCAACATATCTATG +AAACTTATGGGGATACTTGGGCAGGAGTGGAAGCCATAATAAGAATTCTGCAACAACTGC +TGTTTATCCATTTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACTCGACAGAGG +AGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCA +GCCTAAAACTGCTTGTACCAATTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTG +TTTCATAACAAAAGCCTTAGGCATCTCCTATGGCAGGAAGAAGCGGAGACAGCGACGAAG 
+AGCTCATCAGAACAGTCAGACTCATCAAGCTTCTCTATCAAAGCAGTAAGTAGTACATGT +AACGCAACCTATACCAATAGTAGCAATAGTAGCATTAGTAGTAGCAATAATAATAGCAAT +AGTTGTGTGGTCCATAGTAATCATAGAATATAGGAAAATATTAAGACAAAGAAAAATAGA +CAGGTTAATTGATAGACTAATAGAAAGAGCAGAAGACAGTGGCAATGAGAGTGAAGGAGA +AATATCAGCACTTGTGGAGATGGGGGTGGAGATGGGGCACCATGCTCCTTGGGATGTTGA +TGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGA +AGGAAGCAACCACCACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTAC +ATAATGTTTGGGCCACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAGTAGTAT +TGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGATGCATG +AGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCT +GTGTTAGTTTAAAGTGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTAGCGGGA +GAATGATAATGGAGAAAGGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAA +GAGGTAAGGTGCAGAAAGAATATGCATTTTTTTATAAACTTGATATAATACCAATAGATA +ATGATACTACCAGCTATAAGTTGACAAGTTGTAACACCTCAGTCATTACACAGGCCTGTC +CAAAGGTATCCTTTGAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTC +TAAAATGTAATAATAAGACGTTCAATGGAACAGGACCATGTACAAATGTCAGCACAGTAC +AATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAG +CAGAAGAAGAGGTAGTAATTAGATCTGTCAATTTCACGGACAATGCTAAAACCATAATAG +TACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAA +GAATCCGTATCCAGAGAGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATA +TGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATAACACTTTAAAACAGATAG +CTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAG +GAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTA +ATTCAACACAACTGTTTAATAGTACTTGGTTTAATAGTACTTGGAGTACTGAAGGGTCAA +ATAACACTGAAGGAAGTGACACAATCACCCTCCCATGCAGAATAAAACAAATTATAAACA +TGTGGCAGAAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTT +CATCAAATATTACAGGGCTGCTATTAACAAGAGATGGTGGTAATAGCAACAATGAGTCCG +AGATCTTCAGACCTGGAGGAGGAGATATGAGGGACAATTGGAGAAGTGAATTATATAAAT +ATAAAGTAGTAAAAATTGAACCATTAGGAGTAGCACCCACCAAGGCAAAGAGAAGAGTGG +TGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTTCCTTGGGTTCTTGGGAGCAG +CAGGAAGCACTATGGGCGCAGCCTCAATGACGCTGACGGTACAGGCCAGACAATTATTGT +CTGGTATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTGAGGCGCAACAGCATCTGT 
+TGCAACTCACAGTCTGGGGCATCAAGCAGCTCCAGGCAAGAATCCTGGCTGTGGAAAGAT +ACCTAAAGGATCAACAGCTCCTGGGGATTTGGGGTTGCTCTGGAAAACTCATTTGCACCA +CTGCTGTGCCTTGGAATGCTAGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATCACA +CGACCTGGATGGAGTGGGACAGAGAAATTAACAATTACACAAGCTTAATACACTCCTTAA +TTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAATTATTGGAATTAGATAAAT +GGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATAAAATTATTCA +TAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGA +ATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCCACCTCCCAACCCCGAGGG +GACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACAGATCCA +TTCGATTAGTGAACGGATCCTTGGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCT +TCAGCTACCACCGCTTGAGAGACTTACTCTTGATTGTAACGAGGATTGTGGAACTTCTGG +GACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCTCCTACAGTATTGGAGTCAGG +AACTAAAGAATAGTGCTGTTAGCTTGCTCAATGCCACAGCCATAGCAGTAGCTGAGGGGA +CAGATAGGGTTATAGAAGTAGTACAAGGAGCTTGTAGAGCTATTCGCCACATACCTAGAA +GAATAAGACAGGGCTTGGAAAGGATTTTGCTATAAGATGGGTGGCAAGTGGTCAAAAAGT +AGTGTGATTGGATGGCCTACTGTAAGGGAAAGAATGAGACGAGCTGAGCCAGCAGCAGAT +AGGGTGGGAGCAGCATCTCGAGACCTGGAAAAACATGGAGCAATCACAAGTAGCAATACA +GCAGCTACCAATGCTGCTTGTGCCTGGCTAGAAGCACAAGAGGAGGAGGAGGTGGGTTTT +CCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACAAGGCAGCTGTAGATCTTAGC +CACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCACTCCCAAAGAAGACAAGAT +ATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTAGCAGAACTACACA +CCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTT +GAGCCAGATAAGATAGAAGAGGCCAATAAAGGAGAGAACACCAGCTTGTTACACCCTGTG +AGCCTGCATGGGATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGC +CTAGCATTTCATCACGTGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGCTGACAT +CGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGCGTGGCCTGGGCGGG +ACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGG +GTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACT +GCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTG +TGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCA +>gi|66864561|gb|DQ027778.1| HIV-1 isolate QA013_2282M_A10 from Kenya nonfunctional envelope glycoprotein (env) gene, partial sequence 
+TCTCTGCGTTACCTTAGATTGTCATCATAATGTCACCACTGTCACCACTAACAATACCAA +TGACACTGTATATCACCAATGCCACTATCACCAATGCCAATATCACCAAGGACGATGCCG +ACATGAGAAACTGCTCTTAATGTGACCACAGTAATAAGGGATAAGCAACAGAAAGTATAC +TCACTTTTTTATAGACTTGATCTAGCCAACTGAAAGTAATGATAGTTATAGTTATAGATT +AATAAATTGTAACACCTCCGTCATTAAACAGGCGTCCAAAAGTAACCTTTGAGCCAATTC +CCATACATTATTGTGCCCCAGCTGGTTTTGCGATTCTAAAGTAAAGATAAGAAGTTCAAT +GGAACAGGGCCATGCAGGAATGTCAGCACAGTACAATGCACACATGGGAAAGCCAGTAGT +ATCAACTCAGCTGCTGTTAAATGGCAGCCTAGCAGAAGGAGAGGTAAGAATTAGATCAAA +ATATCACAAACAATGCCAAAAACATATTAGTACAATTGACCACACCTGTAAGAATTAATT +GTACCACCTAACAACAATACAAGAAAAAGTGTACATATAGGACCAGGACAAGCATTCTAT +GCAACAGGAGGAAATAGGAAATATAAGACAAGCATATTGTAATGTTAGTAAAACACAATG +GAATACAGCTTTGCAAAAGGTCTAACAAATTAGAAGCATACTTTAACAAAACAATTAACA +AAACAATAATCTTTACTCACTCATCAGGAGGATACAGAAATTACAACACATAGTTTTAAT +TGTGGAGGAGAGTTTTTCTATTGTAATACATCAGGCCTTTAATAGCACTTAGGACAACGA +TACCAACACACAGGAATCAAATGACACTATAATGATCCCATGCAGTAAAGCAAATTATAA +ACATGTGGCAGAGAACAGGACAAGCAATATATGCCCCTCCCATACAAGGAATAAAGGTGT +GACTCAAATATTACAGGACTAATATTAACAAGAGATGGTAAGAATAATTCTAATAACAGT +GACCTTCAGACCTAGAGGAGGAGAT +>gi|66864559|gb|DQ027777.1| HIV-1 isolate QA013_1790M_C1 from Kenya envelope glycoprotein (env) gene, partial cds +TCTCTGCGTTACCTTAGATTGTCATCATAATGTCACCACTGTCACCACTAACAATACCAA +TGACACTGCATATCACCAATGCCAATATCACCAATGACAAGGCCGACATGAAAAACTGCT +CTTACAATGTGACCACAAATAAGGGATAAGCAACAGAAAGTATACTCACTTTTTTATAGA +CTTGATCTAGTACCAACTGAAAGTAGATAGTTATAGATTAATAAATTGTAACACCTCCGT +CATTAAACAGGCTTGTCCAAAAGTAACCTTTGACAATTCCCATACATTATTGTGCCCCAG +CTAGTTTTGCGATTCTAAAGTGTAAAGATAAGGAGTTCAATAACAGGGCCATGCAAGAAT +GTCAGCACAGTACAATGCACACATGGGATCAAGCCAGTAGTATCAACTCCTGCTGTTAAA +TGGCAGCCTAGCAGAAGGAGAGGTAAGAATTAGATCTGAAAATATCACAAACAATGCAAA +ACATAATAGTACAATTGCCCACACCTATAACAATTACTTGTACCAGACCCAACAACAATA +CAAGAAAGTGTACATATAGGACCAGGACAAGCATTCTATGCAACAGGTGCAATAATAGGG +GATATAAGACGAGCATTGTAATGTTAATAAAACACAATGGAATACAGCTTTGCAAAAGGT +AGCTAACAAATTAAAAACATATTAACAAAACAATAATCTTTACTCACTCATCAGGAGGGG 
+ATATAGAAATTACAACACATAGTTTTAATTAGAGGAGAGTTTTTCTATTGTAATACATCA +GGCCTGTTTAATAGCACTTGGGTAAACAATACCAACACAGGGGACAACTAACACAGAATC +AAATGACAATATAACGATCCCATGCAGAATAAAGCAAATTATAAATGTGGCAGAGAACAG +GACAAGCAATATATGCCCCTCCCATACAAGGAATAATAAGGTGTGAATCAAATTACAGGA +CTAATATTAACAAAAGATAGTAAGAATAATGCTAATAGAACTGAAACCTTCAGACCTAGA +GAGAGAT +>gi|66864557|gb|DQ027776.1| HIV-1 isolate QA013_987M_B12 from Kenya envelope glycoprotein (env) gene, partial cds +TCTCTGCGTTACTTTAGATTGTCATAATGTCACCAATGACACTGCCAGTATCATGAAAAA +CTGCTCTTAATGTGACCACAGTAATAAGGGATAAGCAACAGAAAGTATACTCACTTTTTT +ATAGACTTGATATAGTAAACTGAAAGTAATACTAGTTATAGATTAATAAATTGTAACACC +TCCGTCATTAAACAGGCTTGTCCAAGTAACCTTTGAGCCAATTCCCATACATTATTGTGC +CCCAGCTGGTTTTGCGATTCTAAAGTGTAAAGAAGGAGTTCAATGGAACAGGGCCATGCA +AGAATGTCAGCACAGTACAATGCACACATGGGATCAAGCCAAGTATCAACTCAGCTGCTG +TTAAATGGCAGCCTAGCAGAAGGAGAGGTAAGAATTAGATCTGAAAATAACAAACAATGC +CAAAAACCTAATAGTACAATTGACCACACCTATAAAAATTAATTGTACTAGACCTAAACA +ATACAAGAAAAAGCGTACATATAGGACCAGGACAAGCATTCTATGCAACAGGTGAAATAA +TAGGGTATAAGACAAGCACATTGTAATGTTAATAAAACACAATGGGATAAAACTTTGCAA +GAGGTAGCTAACATTAAAAACATACTTTAACAAAACAATAATCTTTACTCACTCATCAGG +AGGGGATATAGAAATTACAACATAGTTTTAATTGTGGAGGAGAGTTTTTCTATTGTAATA +CATCAAGCCTGTTTAATAGCACTTGGGGACAATACCAACACACAGGAGTCAAATAGCACA +GAATCAAATGAAACTATAACGATCCCATGCAGAATAACAAATTATAAATATGTGGCAGAG +AACAGGACAAGCAATATATGCCCCTCCCATACAAGGAATAATAAGGTGAATCAAATATTA +CAGGACTAATATTAACAAGAGATGGTGGGAATGATTCTAGGGAAAATGAAACCCAGACCT +GGAGGAGGAGAT +>gi|66864555|gb|DQ027775.1| HIV-1 isolate QA013_765M_B1 from Kenya envelope glycoprotein (env) gene, partial cds +TCTCTGCGTTACTTTAGATTGTCATAATGTCACCAATGACAATGCCAATATCATGAAAAA +CTGCTCTTAATGTGACCACAGTAATAAGGGATAAGCAACAGAAAGTATACTCACTTTTTT +ATAGACTTGATATAGTAAACTGAAAGTAATACTAGTTATAGATTAATAAATTGTAACACC +TCCGTCATTAAACAGGCTTGTCCAAGTAACCTTTGAGCCAATTCCCATACATTATTGTGC +CCCAGCTGGTTTTGCGACTCTAAAGTGTAAAGAAGGAGTTCAATGGAACAGGGCCATGCA +AGAATGTCAGCACAGTACAATGCGCACATGGGATCAAGCCAAGTATCAACTCAGCTGCTG +TTAAATGGCAGCCTAGCAGAAGGAGAGGTAAGAATTAGATCTGAAAATAACAAACAATGC 
+CAAAAACCTAATAGTACAATTGACCACACCTATAAAAATTAATTGTACCAGACCTAAACA +ATACAAGAAAAAGTGTACATATAGGACCAGGACAAGCATTCTATGCAACAGGTGAAATAA +TAGGGTATAAGACAAGCACATTGTAATGTTAGTGAAACACAATGGCATAAAACTTTGCAA +GAGGTAGCTAACATTAAAAACATACTTTAACAAAACAATAATCTTTGATCACTCATCAGG +AGGGGATATAGAAATTACAACATAGTTTTAATTGTGGAGGAGAGTTTTTCTATTGTAATA +CATCAGGCCTGTTTAATAGCACTTGGGAACGATGCCAGCACACAGGAGTCAAATAACACA +GAATCAAATACCACTATAACGATCCCATGCAGAATAACAAGTTATAAATATGTGGCAGAG +AACAGGACAAGCAATATATGCCCCTCCCATACAAGGAATAATAAGGTGAATCAAATATTA +CAGGACTAATATTAACAAGAGATGGTGGGGATAATTCTAGGGAAAATGAAACCCAGACCT +GGAGGAGGAGAT +>gi|66864553|gb|DQ027774.1| HIV-1 isolate QA013_559M_A1 from Kenya envelope glycoprotein (env) gene, partial cds +TCTCTGCGTTACTTTAGATTGTCATAATGTCACCACTGACAATGCCAATATCACCACCGA +CATGAAAAAGCTCTTACAATGTGACCACAGTAATAAGGGATAAGCAACAGAAAGTATACT +CACTTTTTTATAGACTTTATAGTACCAACTGAAAGTAATACTAGTTATAGATTAATAAAT +TGTAACACCTCCGTCATTAAACAGGTGTCCAAAGGTAACCTTTGAGCCAATTCCCATACA +TTATTGTGCCCCAGCTGGTTTTGCGATTCTAAAGTAAAGATAAGGAGTTCAATGGAACAG +GGCCATGCAAGAATGTCAGCACAGTACAATGCACACATGGGCAAGCCAGTAGTATCAACT +CAGCTGCTGTTAAATGGCAGCCTAGCAGAAGGAGAGGTAAGAATTAGATGAAAATATCAC +AAACAATGCCAAAAACATAATAGTACAATTGACCACACCTATAAAAATTAATTGTATGAC +CTAACAACAATACAAGAAAAAGTGTACATATAGGACCAGGACAAGCATTCTATGCAACAG +GTGAAAATAGGGGATATAAGACAAGCACATTGTAATGTTAGTAAAACACAATGGCATAAA +ACTTTGCAAGAGGGCTAACAAATTAAAAACATACTTTAACAAAACAATAATCTTTAATCA +CTCATCAGGAGGGGATATAGATTACAACACATAGTTTTAATTGTGGAGGAGAGTTTTTCT +ATTGTAATACATCAGGCCTGTTTAATAGCTTGGGGAAACCATGCCAGCATACAGGAGTCC +AATAACACAGAATCAAATGACAATATAACGATCCCATAGAATAAAGCAAATTATAAATAT +GTGGCAGAGAACAGGACAAGCAATATATGCCCCTCCCATACAAGGTAATAAGGTGTGAAT +CAAATATTACAGGACTAATATTAACAAGAGATGGTGGGGATAATCCTAGGGAATGAAACC +TTCAGACAAAATGAAACCTTCAGACCTGGAGGAGGAGAT +>gi|66864614|gb|DQ027773.1| HIV-1 isolate QA013_385M_B5 from Kenya envelope glycoprotein (env) gene, partial cds +TCTCTGCGTTACTTTAGATTGTCATAATGTCACCAATGACAATGCCAATATCACCACCGA +CATGAAAAAGCTCTTACAATGTGACCACAGTAATAAGGGATAAGCAACAGAAAGTATACT 
+CACTTTTTTATAGACTTTATAGTACCAACTGAAAGTAATACTAGTTATAGATTAATAAAT +TGTAACACCTCCGTCATTAAACAGGTGTCCAAAGGTAACCTTTGAGCCAATTCCCATACA +TTATTGTGCCCCAGCTGGTTTTGCGATTCTAAAGTAAAGATAAGGAGTTCAATGGAACAG +GGCCATGCAAGAATGTCAGCACAGTACAATGCACACATGGGCAAGCCAGTAGTATCAACT +CAGCTGCTGTTAAATGGCAGCCTAGCAGAAGGAGAGGTAAGAATTAGATGAAAATATCAC +AAACAATGCCAAAAACATAATAGTACAATTGACCACACCTATAAAAATTAATTGTATGAC +CTAACAACAATACAAGAAAAAGTGTACATATAGGACCAGGACAAGCATTCTATGCAACAG +GTGAAAATAGGGGATATAAGACAAGCACATTGTAATGTTAGTAAAACACAATGGAATAAA +ACTTTGCAAGAGGGCTAACAAATTAAAAACATACTTTAACAAAACAATAATCTTTAATCA +CTCATCAGGAGGGGATATAGATTACAATACATAGTTTTAATTGTGGAGGAGAGTTTTTCT +ATTGTAATACATCAGGCCTGTTTAATAGCTTGGGGAAACAATGCCAGCACACAGGAGTCA +AATAACACAGAATCAAATGACACTATAACGATCCCATAGAATAAAGCAAATTATAAATAT +GTGGCAGAGAACAGGACAAGCAATATATGCCCCTCCCATACAAGATAATAAGGTGTGAAT +CAAATATTACAGGACTAATATTAACAAGAGATGGTGGGGATAATCCTAGGGAATGAAACC +TTCAGACAAAATGAAACCTTCAGACCTGGAGGAGGAGAT diff --git a/examples/wrapped.fasta b/examples/wrapped.fasta new file mode 100644 index 0000000..0570cc8 --- /dev/null +++ b/examples/wrapped.fasta @@ -0,0 +1,8 @@ +>SEQUENCE_1 +MTEITAAMVKELRESTGAGMMDCKNALSETNGDFDKAVQLLREKGLGKAAKKADRLAAEG +LVSVKVSDDFTIAAMRPSYLSYEDLDMTFVENEYKALVAELEKENEERRRLKDPNKPEHK +IPQFASRKQLSDAILKEAEEKIKEELKAQGKPEKIWDNIIPGKMNSFIADNSQLDSKLTL +MGQFYVMDDKKTVEQVIAEKEKEFGGKIKIVEFICFEVGEGLEKKTEDFAAEVAAQL +>SEQUENCE_2 +SATVSEINSETDFVAKNDQFIALTKDTTAHIQSNSLQSVEELHSSTINGVKFEEYLKSQI +ATIGENLVVRRFATLKAGANGVVNGYIHTNGRVGVVIAAACDSAEVASKSRDLLRQICMH diff --git a/examples/wrapped.phy b/examples/wrapped.phy new file mode 100644 index 0000000..b3d56ea --- /dev/null +++ b/examples/wrapped.phy @@ -0,0 +1,28 @@ + 3 384 +CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- --------SQ +ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG ALGRTRHALR +CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- --------FH + + FLEFQDKFNK KY-SHEEYLE RFEIFKSNLG KIEELNLIAI NHKADTKFGV NKFADLSSDE + FARFAVRYGK SYESAAEVRR RFRIFSESLE EVRSTN---- RKGLPYRLGI 
NRFSDMSWEE + FKSWMSKHRK TY-STEEYHH RLQTFASNWR KINAHN---- NGNHTFKMAL NQFSDMSFAE + + FKNYYLNNKE AIFTDDLPVA DYLDDEFINS IPTAFDWRTR G-AVTPVKNQ GQCGSCWSFS + FQATRL-GAA QTCSATLAGN HLMRDA--AA LPETKDWRED G-IVSPVKNQ AHCGSCWTFS + IKHKYLWSEP QNCSAT--KS NYLRGT--GP YPPSVDWRKK GNFVSPVKNQ GACGSCWTFS + + TTGNVEGQHF ISQNKLVSLS EQNLVDCDHE CMEYEGEEAC DEGCNGGLQP NAYNYIIKNG + TTGALEAAYT QATGKNISLS EQQLVDCAGG FNNF------ --GCNGGLPS QAFEYIKYNG + TTGALESAIA IATGKMLSLA EQQLVDCAQD FNNY------ --GCQGGLPS QAFEYILYNK + + GIQTESSYPY TAETGTQCNF NSANIGAKIS NFTMIP-KNE TVMAGYIVST GPLAIAADAV + GIDTEESYPY KGVNGV-CHY KAENAAVQVL DSVNITLNAE DELKNAVGLV RPVSVAFQVI + GIMGEDTYPY QGKDGY-CKF QPGKAIGFVK DVANITIYDE EAMVEAVALY NPVSFAFEVT + + E-WQFYIGGV F-DIPCN--P NSLDHGILIV GYSAKNTIFR KNMPYWIVKN SWGADWGEQG + DGFRQYKSGV YTSDHCGTTP DDVNHAVLAV GYGVENGV-- ---PYWLIKN SWGADWGDNG + QDFMMYRTGI YSSTSCHKTP DKVNHAVLAV GYGEKNGI-- ---PYWIVKN SWGPQWGMNG + + YIYLRRGKNT CGVSNFVSTS II-- + YFKMEMGKNM CAIATCASYP VVAA + YFLIERGKNM CGLAACASYP IPLV diff --git a/logo/seqmagick_logo.png b/logo/seqmagick_logo.png new file mode 100644 index 0000000..4007b53 Binary files /dev/null and b/logo/seqmagick_logo.png differ diff --git a/logo/seqmagick_logo.psd b/logo/seqmagick_logo.psd new file mode 100644 index 0000000..d29c638 Binary files /dev/null and b/logo/seqmagick_logo.psd differ diff --git a/logo/seqmagick_logo_blue.png b/logo/seqmagick_logo_blue.png new file mode 100644 index 0000000..e96a328 Binary files /dev/null and b/logo/seqmagick_logo_blue.png differ diff --git a/logo/seqmagick_logo_dark_blue.png b/logo/seqmagick_logo_dark_blue.png new file mode 100644 index 0000000..1fa6bde Binary files /dev/null and b/logo/seqmagick_logo_dark_blue.png differ diff --git a/logo/seqmagick_logo_red.png b/logo/seqmagick_logo_red.png new file mode 100644 index 0000000..413e76b Binary files /dev/null and b/logo/seqmagick_logo_red.png differ diff --git a/logo/seqmagick_logo_sans_bg.png b/logo/seqmagick_logo_sans_bg.png new file mode 100644 index 
0000000..ea83cac Binary files /dev/null and b/logo/seqmagick_logo_sans_bg.png differ diff --git a/logo/seqmagick_logo_small.png b/logo/seqmagick_logo_small.png new file mode 100644 index 0000000..184a6a8 Binary files /dev/null and b/logo/seqmagick_logo_small.png differ diff --git a/requirements-rtd.txt b/requirements-rtd.txt new file mode 100644 index 0000000..19d182d --- /dev/null +++ b/requirements-rtd.txt @@ -0,0 +1,3 @@ +biopython>=1.78 +pygtrie>=2.1 +sphinx diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..34a4604 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +biopython>=1.78 +pygtrie>=2.1 + +# for development +wheel +sphinx +nose +twine +ghp-import diff --git a/seqmagick.py b/seqmagick.py new file mode 100755 index 0000000..0e78ec4 --- /dev/null +++ b/seqmagick.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python + +import sys + +if __name__ == '__main__': + from seqmagick.scripts import cli + sys.exit(cli.main(sys.argv[1:])) diff --git a/seqmagick/__init__.py b/seqmagick/__init__.py new file mode 100644 index 0000000..d578bd6 --- /dev/null +++ b/seqmagick/__init__.py @@ -0,0 +1,7 @@ +from os import path + +try: + with open(path.join(path.dirname(__file__), 'data', 'ver')) as f: + __version__ = f.read().strip().replace('-', '+', 1).replace('-', '.') +except Exception as e: + __version__ = '' diff --git a/seqmagick/fileformat.py b/seqmagick/fileformat.py new file mode 100644 index 0000000..c06b2cc --- /dev/null +++ b/seqmagick/fileformat.py @@ -0,0 +1,80 @@ +""" +Mappings from file extensions to biopython types +""" + +# import bz2 +import gzip +import os.path +import sys + +# Define mappings in a dictionary with extension : BioPython_file_type. 
+EXTENSION_TO_TYPE = {'.aln': 'clustal', + '.afa': 'fasta', + '.fa': 'fasta', + '.faa': 'fasta', + '.fas': 'fasta', + '.fasta': 'fasta', + '.fastq': 'fastq', + '.fq': 'fastq', + '.ffn': 'fasta', + '.fna': 'fasta', + '.frn': 'fasta', + '.gb': 'genbank', + '.gbk': 'genbank', + '.needle': 'emboss', + '.nex': 'nexus', + '.phy': 'phylip', + '.phylip': 'phylip', + '.phyx': 'phylip-relaxed', + '.qual': 'qual', + '.sff': 'sff-trim', + '.sth': 'stockholm', + '.sto': 'stockholm',} + +COMPRESS_EXT = { + '.gz': gzip.open, + # '.bz2': bz2.BZ2File, + # '.bz': bz2.BZ2File, +} + + +class UnknownExtensionError(ValueError): + pass + + +def from_extension(extension): + """ + Look up the BioPython file type corresponding with input extension. + + Look up is case insensitive. + """ + if not extension.startswith('.'): + raise ValueError("Extensions must begin with a period.") + try: + return EXTENSION_TO_TYPE[extension.lower()] + except KeyError: + raise UnknownExtensionError( + "seqmagick does not know how to handle " + + "files with extensions like this: " + extension) + + +def from_filename(file_name): + """ + Look up the BioPython file type corresponding to an input file name. + """ + base, extension = os.path.splitext(file_name) + if extension in COMPRESS_EXT: + # Compressed file + extension = os.path.splitext(base)[1] + return from_extension(extension) + + +def from_handle(fh, stream_default='fasta'): + """ + Look up the BioPython file type corresponding to a file-like object. + + For stdin, stdout, and stderr, ``stream_default`` is used. + """ + if fh in (sys.stdin, sys.stdout, sys.stderr): + return stream_default + return from_filename(fh.name) diff --git a/seqmagick/scripts/__init__.py b/seqmagick/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seqmagick/scripts/cli.py b/seqmagick/scripts/cli.py new file mode 100644 index 0000000..01a889c --- /dev/null +++ b/seqmagick/scripts/cli.py @@ -0,0 +1,73 @@ +#! 
/usr/bin/env python + +import argparse +import logging +import sys + +from seqmagick import __version__ as version +from seqmagick import subcommands + + +def main(argv=sys.argv[1:]): + action, arguments = parse_arguments(argv) + + loglevel = { + 0: logging.ERROR, + 1: logging.WARNING, + 2: logging.INFO, + 3: logging.DEBUG, + }.get(arguments.verbosity, logging.DEBUG) + + if arguments.verbosity > 1: + logformat = '%(levelname)s %(module)s %(lineno)s %(message)s' + else: + logformat = '%(message)s' + + # set up logging + logging.basicConfig(stream=sys.stderr, format=logformat, level=loglevel) + + return action(arguments) + + +def parse_arguments(argv): + """ + Extract command-line arguments for different actions. + """ + parser = argparse.ArgumentParser(description='seqmagick - Manipulate ' + \ + ' sequence files.', prog='seqmagick') + + parser.add_argument('-V', '--version', action='version', + version='seqmagick v' + version, + help="Print the version number and exit") + parser.add_argument('-v', '--verbose', dest='verbosity', + action='count', default=1, + help="Be more verbose. 
Specify -vv or -vvv for even more") + parser.add_argument('-q', '--quiet', action='store_const', const=0, + dest='verbosity', help="Suppress output") + + # Subparsers + subparsers = parser.add_subparsers(dest='subparser_name') + + parser_help = subparsers.add_parser('help', + help='Detailed help for actions using help ') + + parser_help.add_argument('action') + + # Add actions + actions = {} + for name, mod in subcommands.itermodules(): + subparser = subparsers.add_parser(name, help=mod.__doc__, + description=mod.__doc__) + mod.build_parser(subparser) + actions[name] = mod.action + + arguments = parser.parse_args(argv) + arguments.argv = argv + action = arguments.subparser_name + + if action is None: + parse_arguments(['-h']) + if action == 'help': + return parse_arguments([str(arguments.action), '-h']) + + return actions[action], arguments diff --git a/seqmagick/subcommands/__init__.py b/seqmagick/subcommands/__init__.py new file mode 100644 index 0000000..35d3ea0 --- /dev/null +++ b/seqmagick/subcommands/__init__.py @@ -0,0 +1,8 @@ +commands = 'convert', 'info', 'mogrify', 'primer_trim', 'quality_filter', \ + 'extract_ids', 'backtrans_align' + + +def itermodules(root=__name__): + for command in commands: + yield (command.replace('_', '-'), + __import__('%s.%s' % (root, command), fromlist=[command])) diff --git a/seqmagick/subcommands/backtrans_align.py b/seqmagick/subcommands/backtrans_align.py new file mode 100644 index 0000000..198df1a --- /dev/null +++ b/seqmagick/subcommands/backtrans_align.py @@ -0,0 +1,167 @@ +""" +Given a protein alignment and unaligned nucleotides, align the nucleotides +using the protein alignment. + +Protein and nucleotide sequence files must contain the same number of +sequences, in the same order, with the same IDs. 
+""" + +# TODO: Add tests + +import itertools +import logging +import sys + +from Bio import SeqIO +from Bio.Data import CodonTable +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + +from seqmagick import fileformat + +from . import common + +TRANSLATION_TABLES = { + 'standard': CodonTable.unambiguous_dna_by_name["Standard"], + 'standard-ambiguous': CodonTable.ambiguous_dna_by_name["Standard"], + 'vertebrate-mito': CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"] +} + + +def build_parser(parser): + parser.add_argument( + 'protein_align', type=common.FileType('r'), help='Protein Alignment') + parser.add_argument( + 'nucl_align', type=common.FileType('r'), help='FASTA Alignment') + parser.add_argument( + '-o', '--out-file', type=common.FileType('w'), + default=sys.stdout, metavar='destination_file', + help='Output destination. Default: STDOUT') + parser.add_argument( + '-t', '--translation-table', choices=TRANSLATION_TABLES, + default='standard-ambiguous', + help='Translation table to use. [Default: %(default)s]') + parser.add_argument( + '-a', '--fail-action', choices=('fail', 'warn', 'none'), default='fail', + help='Action to take on an ambiguous codon [default: %(default)s]') + + return parser + + +def batch(iterable, chunk_size): + """ + Return items from iterable in chunk_size bits. + + If len(iterable) % chunk_size > 0, the last item returned will be shorter. 
+ """ + i = iter(iterable) + while True: + r = list(itertools.islice(i, chunk_size)) + if not r: + break + yield r + + +class AlignmentMapper(object): + def __init__(self, translation_table, unknown_action='fail'): + self.translation_table = translation_table + self.unknown_action = unknown_action + + def _validate_translation(self, aligned_prot, aligned_nucl): + """ + Given a seq for protein and nucleotide, ensure that the translation holds + """ + codons = [''.join(i) for i in batch(str(aligned_nucl), 3)] + for codon, aa in zip(codons, str(aligned_prot)): + # Check gaps + if codon == '---' and aa == '-': + continue + + try: + trans = self.translation_table.forward_table[codon] + if not trans == aa: + raise ValueError("Codon {0} translates to {1}, not {2}".format( + codon, trans, aa)) + except (KeyError, CodonTable.TranslationError): + if aa != 'X': + if self.unknown_action == 'fail': + raise ValueError("Unknown codon: {0} mapped to {1}".format( + codon, aa)) + elif self.unknown_action == 'warn': + logging.warn('Cannot verify that unknown codon %s ' + 'maps to %s', codon, aa) + return True + + def map_alignment(self, prot_seq, nucl_seq): + """ + Use aligned prot_seq to align nucl_seq + """ + if prot_seq.id != nucl_seq.id: + logging.warning( + 'ID mismatch: %s != %s. Are the sequences in the same order?', + prot_seq.id, nucl_seq.id) + + # Ungap nucleotides + codons = batch(str(nucl_seq.seq.ungap('-')), 3) + codons = [''.join(i) for i in codons] + codon_iter = iter(codons) + + ungapped_prot = str(prot_seq.seq).replace('-', '') + + if len(ungapped_prot) != len(codons): + table = self.translation_table.forward_table + prot_str = ' '.join(' ' + p + ' ' for p in ungapped_prot) + codon_str = ' '.join(codons) + trans_str = ' '.join(' ' + table.get(codon, 'X') + ' ' + for codon in codons) + raise ValueError("""Length of codon sequence ({0}) does not match \ +length of protein sequence ({1}) for {2} +Protein: {3} +Codons: {4} +Trans. 
Codons: {5}""".format(len(codons), len(ungapped_prot), nucl_seq.id, prot_str, + codon_str, trans_str)) + + try: + nucl_align = ['---' if p == '-' else next(codon_iter) + for p in str(prot_seq.seq)] + except StopIteration: + assert False # Should be checked above + + result = SeqRecord(Seq(''.join(nucl_align)), id=nucl_seq.id, + description=nucl_seq.description) + + # Validate + self._validate_translation(prot_seq.seq.upper(), result.seq.upper()) + + return result + + def map_all(self, prot_alignment, nucl_sequences): + """ + Convert protein sequences to nucleotide alignment + """ + zipped = itertools.zip_longest(prot_alignment, nucl_sequences) + for p, n in zipped: + if p is None: + raise ValueError("Exhausted protein sequences") + elif n is None: + raise ValueError("Exhausted nucleotide sequences") + yield self.map_alignment(p, n) + +def action(arguments): + """ + Run + """ + # Ignore SIGPIPE, for head support + common.exit_on_sigpipe() + logging.basicConfig() + + prot_sequences = SeqIO.parse(arguments.protein_align, + fileformat.from_handle(arguments.protein_align)) + nucl_sequences = SeqIO.parse(arguments.nucl_align, + fileformat.from_handle(arguments.nucl_align)) + + instance = AlignmentMapper(TRANSLATION_TABLES[arguments.translation_table], + arguments.fail_action) + + SeqIO.write(instance.map_all(prot_sequences, nucl_sequences), + arguments.out_file, fileformat.from_filename(arguments.out_file.name)) diff --git a/seqmagick/subcommands/common.py b/seqmagick/subcommands/common.py new file mode 100644 index 0000000..d51016e --- /dev/null +++ b/seqmagick/subcommands/common.py @@ -0,0 +1,238 @@ +""" +Common functions for subcommands +""" +import argparse +import contextlib +import copy +import functools +import os +import os.path +import signal +import sys +import tempfile + +from seqmagick import fileformat + +def get_umask(): + """ + Gets the current umask + """ + current_umask = os.umask(0o777) + os.umask(current_umask) + return current_umask + +def 
apply_umask(permission=0o666, umask=None): + """ + Masks the provided permission with a umask. + + If umask is not given, the current umask is used. + """ + if umask is None: + umask = get_umask() + return permission & (~umask) + +@contextlib.contextmanager +def atomic_write(path, mode='wt', permissions=None, file_factory=None, **kwargs): + """ + Open a file for atomic writing. + + Generates a temp file, renames to value of ``path``. + + Arguments: + ``permissions``: Permissions to set (default: umask) + ``file_factory``: If given, the handle yielded will be the result of + calling file_factory(path) + + Additional arguments are passed to tempfile.NamedTemporaryFile + """ + if permissions is None: + permissions = apply_umask() + # Handle stdout: + if path == '-': + yield sys.stdout + else: + base_dir = os.path.dirname(path) + kwargs['suffix'] = os.path.basename(path) + tf = tempfile.NamedTemporaryFile( + dir=base_dir, mode=mode, delete=False, **kwargs) + + # If a file_factory is given, close, and re-open a handle using the + # file_factory + if file_factory is not None: + tf.close() + tf = file_factory(tf.name) + try: + with tf: + yield tf + # Move + os.rename(tf.name, path) + os.chmod(path, permissions) + except: + os.remove(tf.name) + raise + +def sequence_slices(string): + """ + Parses a list of slices from a string of format: + + start1:end1[,start2:end2[,start2:end3]] etc + """ + slices = string.split(',') + return [cut_range(i) for i in slices] + +def cut_range(string): + """ + A custom argparse 'type' to deal with sequences ranges such as 5:500. 
def cut_range(string):
    """
    A custom argparse 'type' to deal with sequence ranges such as 5:500.

    Accepted forms are ``N``, ``start:end``, ``start:`` and ``:end``. Input
    is 1-indexed with an inclusive end; a negative start is passed through
    unchanged.

    Returns a 0-based slice corresponding to the selection.

    Raises argparse.ArgumentTypeError if the range has more than two parts,
    starts at 0, or ends before it starts.
    """
    value_range = string.split(':')
    if len(value_range) == 1:
        # A single number selects exactly one position
        start = int(value_range[0])
        stop = start
    elif len(value_range) == 2:
        # An empty side means "open ended" and is represented by None
        start, stop = tuple(int(i) if i else None for i in value_range)
    else:
        msg = "{0} is not a valid, 1-indexed range.".format(string)
        raise argparse.ArgumentTypeError(msg)

    if start == 0 or (stop or sys.maxsize) < (start or 0):
        msg = "{0} is not a valid, 1-indexed range.".format(string)
        raise argparse.ArgumentTypeError(msg)

    # Convert from 1-indexed to 0-indexed
    if start is not None and start > 0:
        start -= 1

    return slice(start, stop)


def typed_range(type_func, minimum, maximum):
    """
    Require variables to be of the specified type, between minimum and maximum
    (both bounds inclusive).

    Returns a callable suitable for argparse's ``type=``; it raises
    argparse.ArgumentTypeError for values outside the range.
    """
    @functools.wraps(type_func)
    def inner(string):
        result = type_func(string)
        # The chained comparison must be negated as a whole. The previous
        # ``not result >= minimum and result <= maximum`` applied ``not`` to
        # the first comparison only, so values above ``maximum`` slipped
        # through.
        if not (minimum <= result <= maximum):
            raise argparse.ArgumentTypeError(
                "Please provide a value between {0} and {1}".format(
                    minimum, maximum))
        return result
    return inner


def partial_append_action(fn, argument_keys=None):
    """
    Creates a new class extending argparse.Action, which appends a
    partially-applied function to dest.

    The optional argument_keys argument should either be None (no additional
    arguments to fn) or an iterable of function keys to partially apply.
    A single string is treated as one key.
    """
    if isinstance(argument_keys, str):
        argument_keys = [argument_keys]
    argument_keys = argument_keys or []

    class PartialAppendAction(argparse.Action):
        def __init__(self,
                     option_strings,
                     dest,
                     const=None,
                     default=None,
                     required=False,
                     help=None,
                     type=None,
                     metavar=None,
                     nargs=None,
                     **kwargs):
            # ``nargs`` is accepted for compatibility with add_argument() but
            # deliberately overridden: one value is consumed per keyword.
            super(PartialAppendAction, self).__init__(
                option_strings=option_strings,
                dest=dest,
                nargs=len(argument_keys),
                const=const,
                default=default,
                required=required,
                metavar=metavar,
                type=type,
                help=help, **kwargs)

        def __call__(self, parser, namespace, values, option_string=None):
            # Copy so that a shared default list is never mutated in place
            items = copy.copy(getattr(namespace, self.dest, None)) or []

            # If no value was set default to empty list
            if values is None:
                values = []
            elif not isinstance(values, list):
                values = [values]

            if len(argument_keys) != len(values):
                raise ValueError("Unexpected number of values")

            # Generate keyword arguments for the input function
            kwargs = dict(zip(argument_keys, values))
            items.append(functools.partial(fn, **kwargs))
            setattr(namespace, self.dest, items)

    return PartialAppendAction


def positive_value(target_type):
    """
    Wraps target_type in a function that requires the parsed argument
    be >= 0
    """
    def inner(string):
        value = target_type(string)
        if not value >= 0:
            raise argparse.ArgumentTypeError("Invalid positive number: " +
                                             string)
        return value

    return inner


def _exit_on_signal(sig, status=None, message=None):
    """
    Install a handler for ``sig`` that optionally prints ``message`` to
    stderr and then raises SystemExit(status).
    """
    # Named ``handler`` rather than ``exit`` to avoid shadowing the builtin
    def handler(signum, frame):
        if message:
            print(message, file=sys.stderr)
        raise SystemExit(status)
    signal.signal(sig, handler)


def exit_on_sigint(status=1, message="Canceled."):
    """
    Set program to exit on SIGINT, with provided status and message.
    """
    _exit_on_signal(signal.SIGINT, status, message)
+ """ + _exit_on_signal(signal.SIGINT, status, message) + + +def exit_on_sigpipe(status=None): + """ + Set program to exit on SIGPIPE + """ + _exit_on_signal(signal.SIGPIPE, status) + + +class FileType(object): + """ + Near clone of argparse.FileType, supporting gzip and bzip + """ + def __init__(self, mode='rt'): + self.mode = mode + self.ext_map = fileformat.COMPRESS_EXT.copy() + + def _get_handle(self, file_path): + ext = os.path.splitext(file_path)[1].lower() + return self.ext_map.get(ext, open)(file_path, self.mode) + + def __call__(self, string): + if string == '-': + if 'r' in self.mode: + return sys.stdin + elif 'w' in self.mode: + return sys.stdout + else: + raise ValueError("Invalid mode: {0}".format(string)) + else: + return self._get_handle(string) diff --git a/seqmagick/subcommands/convert.py b/seqmagick/subcommands/convert.py new file mode 100644 index 0000000..fddcfa8 --- /dev/null +++ b/seqmagick/subcommands/convert.py @@ -0,0 +1,362 @@ +""" +Convert between sequence formats +""" +import argparse +import functools +import logging +import random + +from Bio import SeqIO +from Bio.SeqIO import FastaIO +from seqmagick import transform +from seqmagick.fileformat import from_handle + +from . import common + +ALPHABETS = { + "dna": "DNA", + "dna-ambiguous": "DNA", + "rna": "RNA", + "rna-ambiguous": "RNA", + "protein": "protein", +} + +def add_options(parser): + """ + Add optional arguments to the parser + """ + partial_action = common.partial_append_action + file_mods = parser.add_argument_group("Sequence File Modification") + file_mods.add_argument('--line-wrap', dest='line_wrap', metavar='N', + type=int, help='Adjust line wrap for sequence strings. ' + 'When N is 0, all line breaks are removed. Only fasta files ' + 'are supported for the output format.') + file_mods.add_argument('--sort', dest='sort', + choices=['length-asc', 'length-desc', 'name-asc', 'name-desc'], + help='Perform sorting by length or name, ascending or descending. 
def add_options(parser):
    """
    Add optional arguments to the parser.

    Most sequence/record/ID options share ``dest='transforms'``: each one
    appends a partially-applied function from ``transform`` to that list via
    ``common.partial_append_action``, so the functions are later applied in
    command-line order.

    Returns the parser that was passed in.
    """
    partial_action = common.partial_append_action
    file_mods = parser.add_argument_group("Sequence File Modification")
    file_mods.add_argument('--line-wrap', dest='line_wrap', metavar='N',
        type=int, help='Adjust line wrap for sequence strings.  '
        'When N is 0, all line breaks are removed.  Only fasta files '
        'are supported for the output format.')
    file_mods.add_argument('--sort', dest='sort',
        choices=['length-asc', 'length-desc', 'name-asc', 'name-desc'],
        help='Perform sorting by length or name, ascending or descending. '
        'ASCII sorting is performed for names')

    parser.epilog = """Filters using regular expressions are case-sensitive
    by default. Append "(?i)" to a pattern to make it case-insensitive."""

    seq_mods = parser.add_argument_group("Sequence Modification")
    seq_mods.add_argument('--apply-function', type=module_function,
        metavar='/path/to/module.py:function_name[:parameter]',
        help="""Specify a custom function to apply to the input sequences,
        specified as /path/to/file.py:function_name. Function should accept
        an iterable of Bio.SeqRecord objects, and yield SeqRecords. If the
        parameter is specified, it will be passed as a string as the second
        argument to the function. Specify more than one to chain.""",
        default=[], action='append')
    seq_mods.add_argument('--cut', dest='transforms',
        metavar="start:end[,start2:end2]",
        type=common.sequence_slices,
        action=partial_action(transform.multi_cut_sequences, 'slices'),
        help="""Keep only the residues within the 1-indexed start and end
        positions specified, : separated. Includes last item. Start or end
        can be left unspecified to indicate start/end of sequence. A
        negative start may be provided to indicate an offset from the end
        of the sequence. Note that to prevent negative numbers being
        interpreted as flags, this should be written with an equals
        sign between `--cut` and the argument, e.g.: `--cut=-10:`""")
    seq_mods.add_argument('--relative-to', dest='cut_relative', metavar='ID',
        help="""Apply --cut relative to the indexes of non-gap residues in
        sequence identified by ID""")
    seq_mods.add_argument('--drop', dest='transforms',
        metavar='start:end[,start2:end2]',
        type=common.sequence_slices,
        action=partial_action(transform.drop_columns, 'slices'),
        help="""Remove the residues at the specified indices. Same format as `--cut`.""")
    seq_mods.add_argument('--dash-gap',
        action=partial_action(transform.dashes_cleanup), dest='transforms',
        help="""Replace any of the characters "?.:~" with a "-" for all
        sequences""")
    seq_mods.add_argument('--lower',
        action=partial_action(transform.lower_sequences),
        dest='transforms', help='Translate the sequences to lower case')
    seq_mods.add_argument('--mask', metavar="start1:end1[,start2:end2]",
        action=partial_action(transform.multi_mask_sequences, 'slices'),
        type=common.sequence_slices, dest='transforms', help="""Replace
        residues in 1-indexed slice with gap-characters. If --relative-to
        is also specified, coordinates are relative to the sequence ID
        provided.""")
    seq_mods.add_argument('--reverse',
        action=partial_action(transform.reverse_sequences),
        dest='transforms', help='Reverse the order of sites in sequences')
    seq_mods.add_argument('--reverse-complement', dest='transforms',
        action=partial_action(transform.reverse_complement_sequences),
        help='Convert sequences into reverse complements')
    seq_mods.add_argument('--squeeze', action=partial_action(transform.squeeze),
        dest='transforms',
        help='''Remove any gaps that are present in the same
        position across all sequences in an alignment (equivalent to
        --squeeze-threshold=1.0)''')
    seq_mods.add_argument('--squeeze-threshold', dest='transforms',
        action=partial_action(transform.squeeze, 'gap_threshold'),
        type=common.typed_range(float, 0.0, 1.0),
        metavar='PROP', help="""Trim columns from an alignment which
        have gaps in at least the specified proportion of sequences.""")
    seq_mods.add_argument('--transcribe', dest='transforms',
        action=partial_action(transform.transcribe, 'transcribe'),
        choices=('dna2rna', 'rna2dna'), help="""Transcription and back
        transcription for generic DNA and RNA. Source sequences must be the
        correct alphabet or this action will likely produce incorrect
        results.""")
    seq_mods.add_argument('--translate', dest='transforms',
        action=partial_action(transform.translate, 'translate'),
        choices=['dna2protein', 'rna2protein', 'dna2proteinstop',
            'rna2proteinstop'], help="""Translate from generic DNA/RNA to
        proteins. Options with "stop" suffix will NOT translate through
        stop codons. Source sequences must be the correct alphabet or
        this action will likely produce incorrect results.""")
    seq_mods.add_argument('--ungap',
        action=partial_action(transform.ungap_sequences),
        dest='transforms', help='Remove gaps in the sequence alignment')
    seq_mods.add_argument('--upper',
        action=partial_action(transform.upper_sequences),
        dest='transforms', help='Translate the sequences to upper case')

    seq_select = parser.add_argument_group("Record Selection")

    # ``deduplicate_sequences`` is tri-state: False (off, the default),
    # None (deduplicate, no report file) or an open handle (deduplicate and
    # write the report there). transform_file() relies on this encoding.
    seq_select.add_argument('--deduplicate-sequences',
        action='store_const', const=None, default=False,
        dest='deduplicate_sequences', help='Remove any duplicate sequences '
        'by sequence content, keep the first instance seen')
    seq_select.add_argument('--deduplicated-sequences-file', action='store',
        metavar='FILE', dest='deduplicate_sequences', default=False,
        type=common.FileType('wt'),
        help='Write all of the deduplicated sequences to a file')
    seq_select.add_argument('--deduplicate-taxa',
        action=partial_action(transform.deduplicate_taxa),
        dest='transforms', help="""Remove any duplicate sequences by ID,
        keep the first instance seen""")
    seq_select.add_argument('--exclude-from-file', metavar='FILE',
        type=common.FileType('rt'), help="""Filter sequences, removing
        those sequence IDs in the specified file""", dest='transforms',
        action=partial_action(transform.exclude_from_file, 'handle'))
    seq_select.add_argument('--include-from-file', metavar='FILE',
        type=common.FileType('rt'), help="""Filter sequences, keeping only
        those sequence IDs in the specified file""", dest='transforms',
        action=partial_action(transform.include_from_file, 'handle'))
    seq_select.add_argument('--head', metavar='N', dest='transforms',
        action=partial_action(transform.head, 'head'), help="""Trim
        down to top N sequences. With the leading `-', print all but the last N sequences.""")
    seq_select.add_argument('--max-length', dest='transforms', metavar='N',
        action=partial_action(transform.max_length_discard, 'max_length'),
        type=int, help="""Discard any sequences beyond the specified
        maximum length.  This operation occurs *before* all length-changing
        options such as cut and squeeze.""")
    seq_select.add_argument('--min-length', dest='transforms', metavar='N',
        action=partial_action(transform.min_length_discard, 'min_length'),
        type=int, help="""Discard any sequences less than the specified
        minimum length.  This operation occurs *before* cut and squeeze.""")
    seq_select.add_argument('--min-ungapped-length', metavar='N',
        action=partial_action(transform.min_ungap_length_discard,
            'min_length'), type=int, help="""Discard any sequences less
        than the specified minimum length, excluding gaps. This
        operation occurs *before* cut and squeeze.""",
        dest='transforms')
    seq_select.add_argument('--pattern-include', metavar='REGEX',
        action=partial_action(transform.name_include, 'filter_regex'),
        dest='transforms', help="""Filter the sequences by regular
        expression in ID or description""")
    seq_select.add_argument('--pattern-exclude', metavar='REGEX',
        action=partial_action(transform.name_exclude, 'filter_regex'),
        dest='transforms', help="""Filter the sequences by regular
        expression in ID or description""")
    seq_select.add_argument('--prune-empty',
        action=partial_action(transform.prune_empty), dest='transforms',
        help="Prune sequences containing only gaps ('-')")
    seq_select.add_argument('--sample', metavar='N', dest='transforms', type=int,
        action=partial_action(transform.sample, 'k'),
        help = """ Select a random sampling of sequences """)
    seq_select.add_argument('--sample-seed', metavar='N', type=int,
        help = """Set random seed for sampling of sequences""")
    seq_select.add_argument('--seq-pattern-include', metavar='REGEX',
        action=partial_action(transform.seq_include, 'filter_regex'),
        dest='transforms', help="""Filter the sequences by regular
        expression in sequence""")
    seq_select.add_argument('--seq-pattern-exclude', metavar='REGEX',
        action=partial_action(transform.seq_exclude, 'filter_regex'),
        dest='transforms', help="""Filter the sequences by regular
        expression in sequence""")
    seq_select.add_argument('--tail', metavar='N', dest='transforms',
        action=partial_action(transform.tail, 'tail'),
        help="""Trim down to bottom N sequences. Use +N to output sequences starting with the Nth.""")

    id_mods = parser.add_argument_group("Sequence ID Modification")
    id_mods.add_argument('--first-name',
        action=partial_action(transform.first_name_capture),
        dest='transforms', help='''Take only the first whitespace-delimited
        word as the name of the sequence''')
    id_mods.add_argument('--name-suffix', metavar='SUFFIX',
        action=partial_action(transform.name_append_suffix, 'suffix'),
        dest='transforms', help='Append a suffix to all IDs.')
    id_mods.add_argument('--name-prefix', metavar='PREFIX',
        action=partial_action(transform.name_insert_prefix, 'prefix'),
        dest='transforms', help="""Insert a prefix for all
        IDs.""")
    id_mods.add_argument('--pattern-replace', nargs=2,
        metavar=('search_pattern', 'replace_pattern'),
        action=partial_action(transform.name_replace, ('search_regex',
            'replace_pattern')),
        dest='transforms', help="""Replace regex pattern "search_pattern"
        with "replace_pattern" in sequence ID and description""")
    id_mods.add_argument('--strip-range', dest='transforms',
        action=partial_action(transform.strip_range), help="""Strip ranges
        from sequences IDs, matching """)

    format_group = parser.add_argument_group('Format Options')
    format_group.add_argument('--input-format', metavar='FORMAT',
        help="Input file format (default: determine from extension)")
    format_group.add_argument('--output-format', metavar='FORMAT',
        help="Output file format (default: determine from extension)")

    parser.add_argument('--alphabet', choices=ALPHABETS,
        help="""Input alphabet. Required for writing NEXUS.""")

    return parser


def build_parser(parser):
    """
    Add shared arguments to the convert or mogrify parser, plus the
    positional source and destination files. Returns the parser.
    """
    add_options(parser)
    parser.add_argument('source_file', type=common.FileType('rt'),
                        help="Input sequence file")
    parser.add_argument('dest_file', help="Output file")

    return parser


def append_annotation_iterator(records_iterator, alphabet):
    """
    Yield each record after setting its ``molecule_type`` annotation to the
    ALPHABETS entry for ``alphabet``.
    """
    for record in records_iterator:
        record.annotations["molecule_type"] = ALPHABETS[alphabet]
        yield record


def transform_file(source_file, destination_file, arguments):
    """
    Read records from ``source_file``, apply the transformations selected in
    ``arguments`` lazily, and write the result to ``destination_file``.

    Both files are open handles; their formats come from
    ``arguments.input_format`` / ``arguments.output_format`` or, failing
    that, from ``from_handle``.
    """
    # Determine the input and output formats, explicit options win.
    source_file_type = (arguments.input_format or from_handle(source_file))

    destination_file_type = (arguments.output_format or
                             from_handle(destination_file))

    # Get an iterator.
    sorters = {'length': transform.sort_length,
               'name': transform.sort_name}
    directions = {'asc': 1, 'desc': 0}
    if arguments.sort:
        # Sorted iterator
        key, direction = arguments.sort.split('-')
        records = sorters[key](source_file=source_file,
                               source_file_type=source_file_type,
                               direction=directions[direction])
    else:
        # Unsorted iterator.
        records = SeqIO.parse(source_file, source_file_type)

    #########################################
    # Apply generator functions to iterator.#
    #########################################

    # Apply all the transform functions in transforms
    if arguments.transforms:

        # TODO: might be nice to somehow pass this directly into sample action
        if arguments.sample_seed is not None:
            random.seed(arguments.sample_seed)

        # Special case handling for --cut and --relative-to: the first
        # --cut / --mask transform is swapped for its "relative" variant,
        # keeping its position and keyword arguments.
        if arguments.cut_relative:
            replacements = (
                (transform.multi_cut_sequences,
                 transform.cut_sequences_relative),
                (transform.multi_mask_sequences,
                 transform.mask_sequences_relative))
            for original, relative in replacements:
                try:
                    index, found = next(
                        (i, f) for i, f in enumerate(arguments.transforms)
                        if f.func == original)
                except StopIteration:
                    continue
                arguments.transforms[index] = functools.partial(
                    relative, record_id=arguments.cut_relative,
                    **found.keywords)

        for function in arguments.transforms:
            records = function(records)

    # False means "off"; None means "deduplicate without a report file";
    # anything else is the handle the duplicates are reported to.
    if (arguments.deduplicate_sequences or
            arguments.deduplicate_sequences is None):
        records = transform.deduplicate_sequences(
            records, arguments.deduplicate_sequences)

    # Apply all the partial functions
    if arguments.apply_function:
        for apply_function in arguments.apply_function:
            records = apply_function(records)

    # Only the fasta format is supported, as SeqIO.write does not have a 'wrap'
    # parameter.
    if (arguments.line_wrap is not None and destination_file_type == 'fasta'):
        logging.info("Attempting to write fasta with %d line breaks.",
                     arguments.line_wrap)

        with destination_file:
            writer = FastaIO.FastaWriter(
                destination_file, wrap=arguments.line_wrap)
            writer.write_file(records)
    else:
        # Mogrify requires writing all changes to a temporary file by default,
        # but convert uses a destination file instead if one was specified. Get
        # sequences from an iterator that has generator functions wrapping it.
        # After creation, it is then copied back over the original file if all
        # tasks finish up without an exception being thrown.  This avoids
        # loading the entire sequence file up into memory.
        logging.info("Applying transformations, writing to %s",
                     destination_file)
        # Append datatype annotation, mandatory for Nexus files conversion.
        if arguments.alphabet is not None:
            records = append_annotation_iterator(records, arguments.alphabet)
        SeqIO.write(records, destination_file, destination_file_type)
Get + # sequences from an iterator that has generator functions wrapping it. + # After creation, it is then copied back over the original file if all + # tasks finish up without an exception being thrown. This avoids + # loading the entire sequence file up into memory. + logging.info("Applying transformations, writing to %s", + destination_file) + # Append datatype annotation, mandatory for Nexus files conversion. + if arguments.alphabet != None: + records = append_annotation_iterator(records, arguments.alphabet) + SeqIO.write(records, destination_file, destination_file_type) + + +def module_function(string): + """ + Load a function from a python module using a file name, function name + specification of format: + /path/to/x.py:function_name[:parameter] + """ + parts = string.split(':', 2) + if len(parts) < 2: + raise ValueError( + "Illegal specification. Should be module:function[:parameter]") + module_path, function_name = parts[:2] + + # Import the module + module_vars = {} + exec(compile(open(module_path).read(), module_path, 'exec'), module_vars) + + try: + function = module_vars[function_name] + except KeyError: + raise argparse.ArgumentTypeError("{0} has no attribute '{1}'".format( + module_path, function_name)) + + if len(parts) == 3: + old_function = function + function = lambda r: old_function(r, parts[2]) + + return function + + +def action(arguments): + with arguments.source_file as src, \ + common.atomic_write( + arguments.dest_file, file_factory=common.FileType('wt')) as dest: + transform_file(src, dest, arguments) diff --git a/seqmagick/subcommands/extract_ids.py b/seqmagick/subcommands/extract_ids.py new file mode 100644 index 0000000..2080b87 --- /dev/null +++ b/seqmagick/subcommands/extract_ids.py @@ -0,0 +1,41 @@ +""" +Extract the sequence IDs from a file +""" +import sys + +from Bio import SeqIO + +from seqmagick import fileformat + +from . 
import common + + +def build_parser(parser): + parser.add_argument( + 'sequence_file', type=common.FileType('rt'), help="Sequence file") + parser.add_argument( + '-o', '--output-file', type=common.FileType('wt'), default=sys.stdout, + help="Destination file") + parser.add_argument( + '--input-format', help="Input format for sequence file") + parser.add_argument( + '-d', '--include-description', action='store_true', default=False, + help='Include the sequence description in output [default: %(default)s]') + + +def action(arguments): + common.exit_on_sigpipe() + + # Determine file format for input and output + source_format = (arguments.input_format or + fileformat.from_handle(arguments.sequence_file)) + + with arguments.sequence_file: + sequences = SeqIO.parse(arguments.sequence_file, source_format) + if arguments.include_description: + ids = (sequence.description for sequence in sequences) + else: + ids = (sequence.id for sequence in sequences) + with arguments.output_file: + for i in ids: + print(i, file=arguments.output_file) diff --git a/seqmagick/subcommands/info.py b/seqmagick/subcommands/info.py new file mode 100644 index 0000000..94485e8 --- /dev/null +++ b/seqmagick/subcommands/info.py @@ -0,0 +1,179 @@ +""" +Info action +""" + +import collections +import csv +import multiprocessing +import sys + +from functools import partial + +from Bio import SeqIO + +from seqmagick import fileformat + +from . import common + +def build_parser(parser): + parser.add_argument('source_files', metavar='sequence_files', nargs='+') + parser.add_argument('--input-format', help="""Input format. Overrides + extension for all input files""") + parser.add_argument('--out-file', dest='destination_file', + type=common.FileType('wt'), default=sys.stdout, + metavar='destination_file', + help='Output destination. 
_HEADERS = ('name', 'alignment', 'min_len', 'max_len', 'avg_len',
            'num_seqs')
_SeqFileInfo = collections.namedtuple('SeqFileInfo', _HEADERS)


def build_parser(parser):
    """
    Add the arguments of the ``info`` subcommand to ``parser``.
    """
    parser.add_argument('source_files', metavar='sequence_files', nargs='+')
    parser.add_argument('--input-format', help="""Input format. Overrides
            extension for all input files""")
    parser.add_argument('--out-file', dest='destination_file',
            type=common.FileType('wt'), default=sys.stdout,
            metavar='destination_file',
            help='Output destination. Default: STDOUT')
    parser.add_argument('--format', dest='output_format',
        choices=('tab', 'csv', 'align'), help="""Specify output format as
        tab-delimited, CSV or aligned in a borderless table. Default is
        tab-delimited if the output is directed to a file, aligned if output to
        the console.""")
    parser.add_argument('--threads', default=1,
            type=int,
            help="""Number of threads (CPUs). [%(default)s] """)


class SeqInfoWriter(object):
    """
    Base writer for sequence files.

    ``rows`` is any iterable of 6-tuples in ``_HEADERS`` order; it is
    consumed lazily by :meth:`write`.
    """

    def __init__(self, sequence_files, rows, output):
        self.sequence_files = sequence_files
        self.rows = rows
        self.output = output

    def write_row(self, row):
        raise NotImplementedError("Override in subclass")

    def write_header(self, header):
        self.write_row(header)

    def write(self):
        """Write the header followed by one line per summarized file."""
        # Single source of truth for the column names
        self.write_header(_HEADERS)

        for row in self.rows:
            self.write_row(_SeqFileInfo(*row))


class CsvSeqInfoWriter(SeqInfoWriter):
    """Writes the summary as comma-separated values."""
    delimiter = ','

    def __init__(self, sequence_files, rows, output):
        super(CsvSeqInfoWriter, self).__init__(sequence_files, rows, output)
        self.writer = csv.writer(self.output, delimiter=self.delimiter,
                                 lineterminator='\n')

    def write_row(self, row):
        # The header arrives as a plain tuple; only data rows (namedtuples)
        # carry a numeric avg_len that needs formatting.
        if hasattr(row, '_replace'):
            row = row._replace(avg_len='{0:.2f}'.format(row.avg_len))
        self.writer.writerow(row)


class TsvSeqInfoWriter(CsvSeqInfoWriter):
    """Writes the summary as tab-separated values."""
    delimiter = '\t'


class AlignedSeqInfoWriter(SeqInfoWriter):
    """Writes the summary as a borderless, column-aligned table."""

    def __init__(self, sequence_files, rows, output):
        super(AlignedSeqInfoWriter, self).__init__(sequence_files, rows, output)
        # The name column is sized to the longest file name
        self.max_name_length = max(len(f) for f in self.sequence_files)

    def write_header(self, header):
        fmt = ('{0:' + str(self.max_name_length + 1) + 's}{1:10s}'
               '{2:>10s}{3:>10s}{4:>10s}{5:>10s}')
        print(fmt.format(*header), file=self.output)

    def write_row(self, row):
        fmt = ('{name:' + str(self.max_name_length + 1) + 's}{alignment:10s}'
               '{min_len:10d}{max_len:10d}{avg_len:10.2f}{num_seqs:10d}')
        print(fmt.format(**row._asdict()), file=self.output)


_WRITERS = {'csv': CsvSeqInfoWriter, 'tab': TsvSeqInfoWriter, 'align':
            AlignedSeqInfoWriter}


def summarize_sequence_file(source_file, file_type=None):
    """
    Summarizes a sequence file, returning a tuple containing the name,
    whether the file is an alignment, minimum sequence length, maximum
    sequence length, average length, number of sequences.

    A file counts as an alignment when it holds more than one sequence and
    all sequences have the same length. The flag is returned as the string
    'TRUE' or 'FALSE'. An empty file yields zeros for every number.
    """
    avg_length = None
    min_length = sys.maxsize
    max_length = 0
    sequence_count = 0

    # Get an iterator and analyze the data.
    with common.FileType('rt')(source_file) as fp:
        if not file_type:
            file_type = fileformat.from_handle(fp)
        for record in SeqIO.parse(fp, file_type):
            sequence_count += 1
            sequence_length = len(record)

            # Lengths
            if sequence_length > max_length:
                max_length = sequence_length
            if sequence_length < min_length:
                min_length = sequence_length

            # Average length, updated incrementally so that the records
            # never have to be held in memory
            if sequence_count == 1:
                avg_length = float(sequence_length)
            else:
                avg_length += (sequence_length - avg_length) / sequence_count

    # Handle an empty file:
    if avg_length is None:
        min_length = max_length = avg_length = 0

    # Derived from the extremes instead of a running comparison against
    # ``max_length``: that comparison was skipped while max_length was still
    # 0, so a leading zero-length record hid a length mismatch.
    is_alignment = sequence_count > 1 and min_length == max_length

    return (source_file, str(is_alignment).upper(), min_length,
            max_length, avg_length, sequence_count)


def action(arguments):
    """
    Given one or more sequence files, determine if the file is an alignment,
    the maximum sequence length and the total number of sequences.  Provides
    different output formats including tab (tab-delimited), csv and align
    (aligned as if part of a borderless table).
    """
    # Exit on SIGPIPE (for head support) and on Ctrl+C
    common.exit_on_sigpipe()
    common.exit_on_sigint()

    handle = arguments.destination_file
    output_format = arguments.output_format
    if not output_format:
        try:
            output_format = 'align' if handle.isatty() else 'tab'
        except AttributeError:
            output_format = 'tab'

    writer_cls = _WRITERS[output_format]

    ssf = partial(summarize_sequence_file, file_type=arguments.input_format)

    # if only one thread, do not use the multithreading so parent process
    # can be terminated using ctrl+c
    pool = None
    if arguments.threads > 1:
        pool = multiprocessing.Pool(processes=arguments.threads)
        summary = pool.imap(ssf, arguments.source_files)
    else:
        summary = (ssf(f) for f in arguments.source_files)

    try:
        with handle:
            writer = writer_cls(arguments.source_files, summary, handle)
            writer.write()
    finally:
        # imap() is lazy, so the pool has to outlive write(); release the
        # worker processes afterwards, also when writing failed.
        if pool is not None:
            pool.terminate()
            pool.join()
Provides + different output formats including tab (tab-delimited), csv and align + (aligned as if part of a borderless table). + """ + # Ignore SIGPIPE, for head support + common.exit_on_sigpipe() + common.exit_on_sigint() + + handle = arguments.destination_file + output_format = arguments.output_format + if not output_format: + try: + output_format = 'align' if handle.isatty() else 'tab' + except AttributeError: + output_format = 'tab' + + writer_cls = _WRITERS[output_format] + + ssf = partial(summarize_sequence_file, file_type = arguments.input_format) + + # if only one thread, do not use the multithreading so parent process + # can be terminated using ctrl+c + if arguments.threads > 1: + pool = multiprocessing.Pool(processes=arguments.threads) + summary = pool.imap(ssf, arguments.source_files) + else: + summary = (ssf(f) for f in arguments.source_files) + + with handle: + writer = writer_cls(arguments.source_files, summary, handle) + writer.write() + diff --git a/seqmagick/subcommands/mogrify.py b/seqmagick/subcommands/mogrify.py new file mode 100644 index 0000000..4b156bc --- /dev/null +++ b/seqmagick/subcommands/mogrify.py @@ -0,0 +1,34 @@ +""" +Modify sequence file(s) in place. +""" + +import logging + +from . import convert, common + + +def build_parser(parser): + """ + """ + convert.add_options(parser) + + parser.add_argument( + 'input_files', metavar="sequence_file", nargs='+', + type=common.FileType('rt'), help="Sequence file(s) to mogrify") + + return parser + + +def action(arguments): + """ + Run mogrify. Most of the action is in convert, this just creates a temp + file for the output. 
+ """ + for input_file in arguments.input_files: + logging.info(input_file) + # Generate a temporary file + with common.atomic_write( + input_file.name, file_factory=common.FileType('wt')) as tf: + convert.transform_file(input_file, tf, arguments) + if hasattr(input_file, 'close'): + input_file.close() diff --git a/seqmagick/subcommands/primer_trim.py b/seqmagick/subcommands/primer_trim.py new file mode 100644 index 0000000..f5d5687 --- /dev/null +++ b/seqmagick/subcommands/primer_trim.py @@ -0,0 +1,328 @@ +""" +Find a primer sequence in a gapped alignment, trim to amplicon +""" +import argparse +import itertools +import logging +import operator +import sys + +from Bio import SeqIO, pairwise2 +from Bio.Seq import Seq + +from seqmagick import transform, fileformat + +from . import common + + +def build_parser(parser): + parser.add_argument( + 'source_file', type=argparse.FileType('r'), help="Source alignment file") + parser.add_argument( + 'output_file', type=argparse.FileType('w'), help="Destination trimmed file") + parser.add_argument( + 'forward_primer', type=iupac_ambiguous_sequence, + help="The forward primer used") + parser.add_argument( + 'reverse_primer', type=iupac_ambiguous_sequence, + help="""The reverse primer used. By default the reverse primer + is assumed to be a subsequence of the top strand (that is, + the reverse complement of an actual downstream PCR + primer). 
def build_parser(parser):
    """
    Add the arguments of the ``primer-trim`` subcommand to ``parser``.
    """
    parser.add_argument(
        'source_file', type=argparse.FileType('r'), help="Source alignment file")
    parser.add_argument(
        'output_file', type=argparse.FileType('w'), help="Destination trimmed file")
    parser.add_argument(
        'forward_primer', type=iupac_ambiguous_sequence,
        help="The forward primer used")
    parser.add_argument(
        'reverse_primer', type=iupac_ambiguous_sequence,
        help="""The reverse primer used. By default the reverse primer
        is assumed to be a subsequence of the top strand (that is,
        the reverse complement of an actual downstream PCR
        primer). Use --reverse-is-revcomp if this is not the
        case.""")
    parser.add_argument(
        '--reverse-is-revcomp', dest="reverse_complement", default=False,
        action='store_true', help="""Reverse primer is written as the
        reverse complement of the top strand (default:
        %(default)s)""")
    parser.add_argument(
        '--source-format', default=None,
        help='Alignment format (default: detect from extension)')
    parser.add_argument(
        '--output-format', default=None,
        help='Alignment format (default: detect from extension)')
    parser.add_argument(
        '--include-primers', action="store_true", default=False,
        help="""Include the primers in the output (default: %(default)s)""")
    parser.add_argument(
        '--max-hamming-distance', type=common.positive_value(int), default=1,
        help="""Maximum Hamming
        distance between primer and alignment site (default: %(default)s).
        IUPAC ambiguous bases in the primer matching unambiguous bases in
        the alignment are not penalized""")
    parser.add_argument(
        '--prune-action', default='trim', choices=list(_ACTIONS),
        help="""Action to take. Options are trim (trim to the region
        defined by the two primers, decreasing the width of the alignment),
        or isolate (convert all characters outside the primer-defined area
        to gaps). default: %(default)s""")


# Sequence-related functions
def ungap_index_map(sequence, gap_chars='-'):
    """
    Returns a dict mapping from an index in the ungapped sequence to an index
    in the gapped sequence.

    >>> ungap_index_map('AC-TG-')
    {0: 0, 1: 1, 2: 3, 3: 4}
    """
    # The counter only advances for non-gap characters, so it yields the
    # position each residue has once the gaps are removed.
    counter = itertools.count()
    return {next(counter): gapped
            for gapped, c in enumerate(sequence)
            if c not in gap_chars}


def gap_index_map(sequence, gap_chars='-'):
    """
    Opposite of ungap_index_map: returns mapping from gapped index to ungapped
    index.

    >>> gap_index_map('AC-TG-')
    {0: 0, 1: 1, 3: 2, 4: 3}
    """
    return {gapped: ungapped
            for ungapped, gapped
            in ungap_index_map(sequence, gap_chars).items()}


# Bases each (possibly ambiguous) IUPAC code stands for. Built once at import
# time instead of on every base comparison.
_IUPAC_TRANSLATION = {
    'A': 'A',
    'C': 'C',
    'G': 'G',
    'T': 'T',
    'U': 'U',
    'R': 'AG',
    'Y': 'CT',
    'S': 'GC',
    'W': 'AT',
    'K': 'GT',
    'M': 'AC',
    'B': 'CGT',
    'D': 'AGT',
    'H': 'ACT',
    'V': 'ACG',
    'N': 'ACGT',
    '-': '-'
}


def _iupac_ambiguous_equal(ambig_base, unambig_base):
    """
    Tests two bases for equality, accounting for IUPAC ambiguous DNA

    ambiguous base may be IUPAC ambiguous, unambiguous must be one of ACGT

    Raises ValueError unless both arguments are single characters, and
    KeyError when ``ambig_base`` is not a known IUPAC code.
    """
    for i in (ambig_base, unambig_base):
        if not len(i) == 1:
            raise ValueError("only one base may be passed.")

    return unambig_base.upper() in _IUPAC_TRANSLATION[ambig_base.upper()]


def hamming_distance(s1, s2, equality_function=operator.eq):
    """
    Returns the hamming distance between two strings.

    ``equality_function`` decides whether two characters match. Raises
    ValueError when the strings differ in length.
    """
    if not len(s1) == len(s2):
        raise ValueError("String lengths are not equal")

    # Number of non-matching characters:
    return sum(not equality_function(c1, c2) for c1, c2 in zip(s1, s2))


class PrimerNotFound(Exception):
    """Raised when a primer could not be located in any sequence."""


class PrimerOrderError(Exception):
    """Raised when the reverse primer is located before the forward primer."""

    def __init__(self, forward_indexes, reverse_indexes):
        super(PrimerOrderError, self).__init__(
            "Reverse primer before forward primer: {0} > {1}".format(
                forward_indexes, reverse_indexes))


class PrimerAligner(object):
    """
    Get positions of pairwise alignments of a primer to a sequence.
    """

    def __init__(self, primer, match=5, difference=-4, gap_open=-10,
                 gap_extend=-0.5, penalize_end_gaps=False):
        self.primer = primer
        self.match = match
        self.difference = difference
        self.gap_open = gap_open
        self.gap_extend = gap_extend
        self.penalize_end_gaps = penalize_end_gaps

    def align(self, sequence):
        """
        Aligns the primer to the given query sequence, returning a tuple of:

            hamming_distance, start, end

        Where hamming distance is the distance between the primer and aligned
        sequence, and start and end give the start and end index of the primer
        relative to the input sequence.
        """
        seq_aln, primer_aln, score, start, end = pairwise2.align.globalms(
            str(sequence).upper(), str(self.primer).upper(),
            self.match, self.difference, self.gap_open,
            self.gap_extend, one_alignment_only=True,
            penalize_end_gaps=self.penalize_end_gaps)[0]

        # Get an ungapped mapping on the sequence
        index_map = gap_index_map(seq_aln)
        ungap_map = ungap_index_map(primer_aln)

        # Trim to primer
        start = ungap_map[0]
        end = ungap_map[len(self.primer) - 1]

        trimmed = seq_aln[start:end + 1]

        ham_dist = hamming_distance(primer_aln[start:end + 1], trimmed,
                                    _iupac_ambiguous_equal)

        # TODO: handle start or end being gap better. For now, just give up
        # and return maxint for the hamming distance
        if trimmed.endswith('-'):
            tail = len(trimmed) - len(trimmed.rstrip('-'))
            end = index_map[end - tail] + 1
            ham_dist = sys.maxsize
        else:
            end = index_map[end]
        if trimmed.startswith('-'):
            start = 0
            ham_dist = sys.maxsize
        else:
            start = index_map[start]

        return ham_dist, start, end

    @property
    def max_score(self):
        """
        Maximum possible alignment score
        """
        return len(self.primer) * self.match


# Types for argparse
def iupac_ambiguous_sequence(string):
    """
    argparse type turning a primer given on the command line into a Seq.
    """
    # ``IUPAC`` is not imported by this module, so the former
    # ``Seq(string, IUPAC.ambiguous_dna)`` raised NameError on every call.
    return Seq(string)


def locate_primers(sequences, forward_primer, reverse_primer,
                   reverse_complement, max_hamming_distance):
    """
    Find forward and reverse primers in a set of sequences, return two tuples:
    (forward_start, forward_end), (reverse_start, reverse_end)

    Indexes refer to alignment columns. Each primer is taken from the first
    sequence in which it aligns within ``max_hamming_distance``.

    Raises ValueError when the sequences differ in length, PrimerOrderError
    when the reverse primer starts before the forward primer, and
    PrimerNotFound when either primer is missing from every sequence.
    """
    forward_loc = None
    reverse_loc = None
    seq_length = None

    # Reverse complement the reverse primer, if appropriate
    if reverse_complement:
        reverse_primer = reverse_primer.reverse_complement()

    forward_aligner = PrimerAligner(forward_primer)
    reverse_aligner = PrimerAligner(reverse_primer)

    for i, sequence in enumerate(sequences):
        if seq_length is None:
            seq_length = len(sequence)
        elif len(sequence) != seq_length:
            raise ValueError(("Sequence Length Heterogeneity: {0} != {1}. "
                              "Is this an alignment?").format(
                                  len(sequence), seq_length))
        index_map = ungap_index_map(sequence.seq)
        # Ungap once per record as a plain string. The aligner converts its
        # input with str() anyway, and this does not depend on Seq.ungap(),
        # which later Biopython releases deprecate.
        ungapped = str(sequence.seq).replace('-', '')
        if forward_loc is None:
            ham_dist, start, end = forward_aligner.align(ungapped)
            if ham_dist <= max_hamming_distance:
                forward_loc = index_map[start], index_map[end]
                logging.info("Forward in sequence %d: indexes %d to %d", i + 1,
                             *forward_loc)
        if reverse_loc is None:
            ham_dist, start, end = reverse_aligner.align(ungapped)
            if ham_dist <= max_hamming_distance:
                reverse_loc = index_map[start], index_map[end]
                logging.info("Reverse in sequence %d: indexes %d to %d", i + 1,
                             *reverse_loc)
        if forward_loc and reverse_loc:
            # Both found
            # Check order
            if forward_loc[0] > reverse_loc[0]:
                raise PrimerOrderError(forward_loc[0], reverse_loc[0])
            return forward_loc, reverse_loc
        else:
            logging.debug(
                "Sequence %d: %d/2 primers found", i + 1,
                sum(j is not None for j in (forward_loc, reverse_loc)))

    # Did not find either the forward or reverse primer:
    if not forward_loc:
        raise PrimerNotFound(forward_primer)
    else:
        raise PrimerNotFound(reverse_primer)
" + "Is this an alignment?").format( + len(sequence), seq_length)) + index_map = ungap_index_map(sequence.seq) + if forward_loc is None: + ham_dist, start, end = forward_aligner.align(sequence.seq.ungap()) + if ham_dist <= max_hamming_distance: + forward_loc = index_map[start], index_map[end] + logging.info("Forward in sequence %d: indexes %d to %d", i + 1, + *forward_loc) + if reverse_loc is None: + ham_dist, start, end = reverse_aligner.align(sequence.seq.ungap()) + if ham_dist <= max_hamming_distance: + reverse_loc = index_map[start], index_map[end] + logging.info("Reverse in sequence %d: indexes %d to %d", i + 1, + *reverse_loc) + if forward_loc and reverse_loc: + # Both found + # Check order + if forward_loc[0] > reverse_loc[0]: + raise PrimerOrderError(forward_loc[0], reverse_loc[0]) + return forward_loc, reverse_loc + else: + logging.debug( + "Sequence %d: %d/2 primers found", i + 1, + sum(j is not None for j in (forward_loc, reverse_loc))) + + # Did not find either the forward or reverse primer: + if not forward_loc: + raise PrimerNotFound(forward_primer) + else: + raise PrimerNotFound(reverse_primer) + + +def trim(sequences, start, end): + """ + Slice the input sequences from start to end + """ + logging.info("Trimming from %d to %d", start, end) + return (sequence[start:end] for sequence in sequences) + + +# Prune actions +_ACTIONS = {'trim': trim, 'isolate': transform.isolate_region} + + +def action(arguments): + """ + Trim the alignment as specified + """ + # Determine file format for input and output + source_format = (arguments.source_format or + fileformat.from_handle(arguments.source_file)) + output_format = (arguments.output_format or + fileformat.from_handle(arguments.output_file)) + + # Load the alignment + with arguments.source_file: + sequences = SeqIO.parse( + arguments.source_file, + source_format) + + # Locate primers + (forward_start, forward_end), (reverse_start, reverse_end) = locate_primers( + sequences, arguments.forward_primer, + 
arguments.reverse_primer, arguments.reverse_complement, + arguments.max_hamming_distance) + + # Generate slice indexes + if arguments.include_primers: + start = forward_start + end = reverse_end + 1 + else: + start = forward_end + 1 + end = reverse_start + + # Rewind the input file + arguments.source_file.seek(0) + sequences = SeqIO.parse( + arguments.source_file, + source_format) + + # Apply the transformation + prune_action = _ACTIONS[arguments.prune_action] + transformed_sequences = prune_action(sequences, start, end) + + with arguments.output_file: + SeqIO.write(transformed_sequences, arguments.output_file, + output_format) diff --git a/seqmagick/subcommands/quality_filter.py b/seqmagick/subcommands/quality_filter.py new file mode 100644 index 0000000..dce0c6b --- /dev/null +++ b/seqmagick/subcommands/quality_filter.py @@ -0,0 +1,776 @@ +""" +Filter reads based on quality scores +""" + +import collections +import csv +import itertools +import logging +import os +import sys +import time + +from Bio import SeqIO +import pygtrie as trie +from Bio.SeqIO import QualityIO + +from seqmagick import fileformat, __version__ +from .common import typed_range, FileType + + +def trie_match(string, trie): + def has_prefix(teststring, trie): + for key in trie.keys(): + if key.startswith(teststring): + return True + return False + longest = None + for i in range(len(string)): + substr = string[:i + 1] + if not has_prefix(substr, trie): + break + if trie.has_key(substr): + longest = substr + return longest + + +# Default minimummean quality score +DEFAULT_MEAN_SCORE = 25.0 + +# Tools for working with ambiguous bases +# Map from Ambiguous Base to regex +_AMBIGUOUS_MAP = { + 'R': 'GA', + 'Y': 'TC', + 'K': 'GT', + 'M': 'AC', + 'S': 'GC', + 'W': 'AT', + 'B': 'GTC', + 'D': 'GAT', + 'H': 'ACT', + 'V': 'GCA', + 'N': 'AGCT', +} + + +def all_unambiguous(sequence_str): + """ + All unambiguous versions of sequence_str + """ + result = [[]] + for c in sequence_str: + result = [i + [a] for i 
in result for a in _AMBIGUOUS_MAP.get(c, c)] + return [''.join(i) for i in result] + + +def build_parser(parser): + """ + Generate a subparser + """ + parser.add_argument( + 'sequence_file', + type=FileType('r'), + help="""Input fastq file. A fasta-format file may also be provided + if --input-qual is also specified.""") + parser.add_argument( + '--input-qual', + type=FileType('r'), + help="""The quality scores associated with the input file. Only + used if input file is fasta.""") + parser.add_argument( + 'output_file', + type=FileType('w'), + help="""Output file. Format determined from extension.""") + + output_group = parser.add_argument_group("Output") + output_group.add_argument( + '--report-out', + type=FileType('w'), + default=sys.stdout, + help="""Output file for report [default: + stdout]""") + output_group.add_argument( + '--details-out', + type=FileType('w'), + help="""Output file to report fate of each sequence""") + output_group.add_argument( + '--no-details-comment', + action='store_false', + default=True, + dest='details_comment', + help="""Do not write comment + lines with version and call to start --details-out""") + + parser.add_argument( + '--min-mean-quality', + metavar='QUALITY', + type=float, + default=DEFAULT_MEAN_SCORE, + help="""Minimum mean quality score for + each read [default: %(default)s]""") + parser.add_argument( + '--min-length', + metavar='LENGTH', + type=int, + default=200, + help="""Minimum length to keep sequence [default: + %(default)s]""") + parser.add_argument( + '--max-length', + metavar='LENGTH', + type=int, + default=1000, + help="""Maximum length to keep before truncating + [default: %(default)s]. This operation occurs before + --max-ambiguous""") + + window_group = parser.add_argument_group('Quality window options') + window_group.add_argument( + '--quality-window-mean-qual', + type=float, + help="""Minimum quality score within the window defined by + --quality-window. 
[default: same as --min-mean-quality]""") + window_group.add_argument( + '--quality-window-prop', + help="""Proportion of + reads within quality window to that must pass filter. Floats are [default: + %(default).1f]""", + default=1.0, + type=typed_range(float, 0.0, 1.0)) + window_group.add_argument( + '--quality-window', + type=int, + metavar='WINDOW_SIZE', + default=0, + help="""Window size for truncating sequences. When set + to a non-zero value, sequences are truncated where the mean mean + quality within the window drops below --min-mean-quality. + [default: %(default)s]""") + + parser.add_argument( + '--ambiguous-action', + choices=('truncate', 'drop'), + help="""Action to take on ambiguous base in sequence (N's). + [default: no action]""") + parser.add_argument( + '--max-ambiguous', + default=None, + help="""Maximum number + of ambiguous bases in a sequence. Sequences exceeding this count + will be removed.""", + type=int) + parser.add_argument( + '--pct-ambiguous', + help="""Maximun percent of + ambiguous bases in a sequence. Sequences exceeding this percent + will be removed.""", + type=float) + + barcode_group = parser.add_argument_group('Barcode/Primer') + primer_group = barcode_group.add_mutually_exclusive_group() + primer_group.add_argument( + '--primer', help="""IUPAC ambiguous primer to + require""") + primer_group.add_argument( + '--no-primer', + help="""Do not use a primer.""", + action='store_const', + const='', + dest='primer') + barcode_group.add_argument( + '--barcode-file', + help="""CSV file containing + sample_id,barcode[,primer] in the rows. 
A single primer for all + sequences may be specified with `--primer`, or `--no-primer` may be + used to indicate barcodes should be used without a primer + check.""", + type=FileType('r')) + barcode_group.add_argument( + '--barcode-header', + action='store_true', + default=False, + help="""Barcodes have a header row [default: + %(default)s]""") + barcode_group.add_argument( + '--map-out', + help="""Path to write + sequence_id,sample_id pairs""", + type=FileType('w'), + metavar='SAMPLE_MAP') + barcode_group.add_argument( + '--quoting', + help="""A string naming an + attribute of the csv module defining the quoting behavior for + `SAMPLE_MAP`. [default: %(default)s]""", + default='QUOTE_MINIMAL', + choices=[s for s in dir(csv) if s.startswith('QUOTE_')]) + + +def mean(sequence): + """ + Calculates the arithmetic mean of a list / tuple + """ + return sum(sequence) / float(len(sequence)) + + +def moving_average(iterable, n): + """ + From Python collections module documentation + + moving_average([40, 30, 50, 46, 39, 44]) --> 40.0 42.0 45.0 43.0 + """ + it = iter(iterable) + d = collections.deque(itertools.islice(it, n - 1)) + d.appendleft(0) + s = sum(d) + for elem in it: + s += elem - d.popleft() + d.append(elem) + yield s / float(n) + + +class FailedFilter(Exception): + """ + A read failed filtering + """ + + def __init__(self, value=None): + self.value = value + + +class RecordEventListener(object): + """ + Contains and dispatches to handlers on events around sequence records + + Event handlers take a single positional argument, the record, and optional + additional keyword arguments. 
+ """ + + def __init__(self): + self.listeners = collections.defaultdict(set) + + def __call__(self, event, record, **kwargs): + """ + Trigger an event + + :param event: Event name + :param record: Record affected + :param **kwargs: Optional additional arguments to pass to handlers + """ + if event in self.listeners: + for listener in self.listeners[event]: + listener(record, **kwargs) + + def register_handler(self, event, handler): + """ + Register ``handler`` for ``event`` + """ + self.listeners[event].add(handler) + + def iterable_hook(self, name, iterable): + """ + Fire an event named ``name`` with each item in iterable + """ + for record in iterable: + self(name, record) + yield record + + +class RecordReportHandler(object): + """ + Generates a report to a CSV file detailing every record processed. + + Listens for events: [read, write, failed_filter, found_barcode] + """ + HEADERS = ('sequence_name', 'in_length', 'in_mean_qual', 'sample', + 'out_length', 'out_mean_qual', 'fail_filter', 'fail_value') + + def __init__(self, fp, args, write_comments=True): + if write_comments: + fp.write('# Generated by `seqmagick quality-filter` version {0}\n'. 
+ format(__version__)) + fp.write('# Arguments: {0}\n'.format(' '.join(args))) + fp.write('# Working directory: {0}\n'.format(os.getcwd())) + + self.writer = csv.DictWriter( + fp, + self.HEADERS, + lineterminator='\n', + quoting=csv.QUOTE_NONNUMERIC) + self.writer.writeheader() + self.current_record = None + + self.read = 0 + self.failed = 0 + self.start = time.time() + self.last_report = 0.0 + + def register_with(self, listener): + listener.register_handler('failed_filter', self._record_failed) + listener.register_handler('read', self._read_record) + listener.register_handler('write', self._wrote_record) + listener.register_handler('found_barcode', self._found_barcode) + + def _write(self): + assert self.current_record + self.writer.writerow(self.current_record) + self.current_record = None + + def _record_failed(self, record, filter_name, value=None): + self.current_record.update({ + 'fail_filter': filter_name, + 'fail_value': value + }) + + self._write() + self.failed += 1 + self._report() + + def _read_record(self, record): + self.current_record = { + 'sequence_name': record.id, + 'in_length': len(record) + } + if 'phred_quality' in record.letter_annotations: + self.current_record['in_mean_qual'] = mean( + record.letter_annotations['phred_quality']) + self.read += 1 + + def _found_barcode(self, record, sample, barcode=None): + """Hook called when barcode is found""" + assert record.id == self.current_record['sequence_name'] + self.current_record['sample'] = sample + + def _wrote_record(self, record): + self.current_record['out_length'] = len(record) + if 'phred_quality' in record.letter_annotations: + self.current_record['out_mean_qual'] = mean( + record.letter_annotations['phred_quality']) + self._write() + self._report() + + def _report(self): + if not sys.stdout.isatty(): + return + t = time.time() + if t - self.last_report < 0.4 or not self.read: + return + + self.last_report = t + sys.stderr.write( + '{0:10.1f}s Processed {1:10d} records; {2:10d} passed 
({3:6.2f}%)\r'. + format(t - self.start, self.read, self.read - self.failed, + float(self.read - self.failed) / self.read * 100.0)) + + +class BaseFilter(object): + """ + Base class for filters + """ + report_fields = ('name', 'passed_unchanged', 'passed_changed', 'failed', + 'total_filtered', 'proportion_passed') + + def __init__(self, listener=None): + self.passed_unchanged = 0 + self.passed_changed = 0 + self.failed = 0 + self.listener = listener + + def filter_record(self, record): + """ + Filter a record. If the filter succeeds, returns a SeqRecord. If it + fails, raises an instance of FailedFilter with an optional value. + """ + raise NotImplementedError("Override in subclass") + + def filter_records(self, records): + """ + Apply the filter to records + """ + for record in records: + try: + filtered = self.filter_record(record) + assert (filtered) + # Quick tracking whether the sequence was modified + if filtered.seq == record.seq: + self.passed_unchanged += 1 + else: + self.passed_changed += 1 + yield filtered + except FailedFilter as e: + self.failed += 1 + v = e.value + if self.listener: + self.listener( + 'failed_filter', + record, + filter_name=self.name, + value=v) + + @property + def passed(self): + return self.passed_changed + self.passed_unchanged + + @property + def total_filtered(self): + return self.passed + self.failed + + @property + def proportion_passed(self): + if not self.total_filtered: + return 0 + return float(self.passed) / self.total_filtered + + def report_dict(self): + return dict((f, getattr(self, f)) for f in self.report_fields) + + +class QualityScoreFilter(BaseFilter): + """ + Quality score filter - requires that the average base quality over the + length of the read is greater than some threshold. 
+ """ + + def __init__(self, min_mean_score=DEFAULT_MEAN_SCORE): + super(QualityScoreFilter, self).__init__() + self.min_mean_score = min_mean_score + self.name = "Quality Score [min_mean: {0}]".format(min_mean_score) + + def filter_record(self, record): + """ + Filter a single record + """ + quality_scores = record.letter_annotations['phred_quality'] + + mean_score = mean(quality_scores) + if mean_score >= self.min_mean_score: + return record + else: + raise FailedFilter(mean_score) + + +class WindowQualityScoreFilter(BaseFilter): + """ + Filter records, truncating records when the mean score drops below a + certain value. + """ + + def __init__(self, window_size, min_mean_score=DEFAULT_MEAN_SCORE): + super(WindowQualityScoreFilter, self).__init__() + self.min_mean_score = min_mean_score + assert window_size and window_size > 0 + self.window_size = window_size + self.name = ("Windowed Quality Score " + + "[min_mean-quality: {0}; window_size: {1}]").format( + min_mean_score, window_size) + + def filter_record(self, record): + """ + Filter a single record + """ + quality_scores = record.letter_annotations['phred_quality'] + + # Simple case - window covers whole sequence + if len(record) <= self.window_size: + mean_score = mean(quality_scores) + if mean_score >= self.min_mean_score: + return record + else: + raise FailedFilter(mean_score) + + # Find the right clipping point. Start clipping at the beginning of the + # sequence, then extend the window to include regions with acceptable + # mean quality scores. + clip_right = 0 + for i, a in enumerate( + moving_average(quality_scores, self.window_size)): + + if a >= self.min_mean_score: + clip_right = i + self.window_size + else: + break + + if clip_right: + return record[:clip_right] + else: + # First window failed - record fails + raise FailedFilter() + + +class AmbiguousBaseFilter(BaseFilter): + """ + Filter records, taking some action if 'N' is encountered in the sequence. 
+ + action - either 'truncate' (drop N and any sequence following) or 'drop' + (remove sequences with 'N's) + """ + name = 'Ambiguous Base' + + def __init__(self, action): + super(AmbiguousBaseFilter, self).__init__() + if action not in ('truncate', 'drop'): + raise ValueError("Unknown action: {0}".format(action)) + self.action = action + self.name = AmbiguousBaseFilter.name + " [{0}]".format(action) + + def filter_record(self, record): + """ + Filter a record, truncating or dropping at an 'N' + """ + nloc = record.seq.find('N') + if nloc == -1: + return record + elif self.action == 'truncate': + return record[:nloc] + elif self.action == 'drop': + raise FailedFilter() + else: + assert False + + +class MaxAmbiguousFilter(BaseFilter): + """ + Filters records exceeding some minimum number of ambiguous bases + """ + name = "Maximum Ambiguous Bases" + + def __init__(self, max_ambiguous): + super(MaxAmbiguousFilter, self).__init__() + assert max_ambiguous is not None + self.max_ambiguous = max_ambiguous + self.name = self.name + ' [{0}]'.format(max_ambiguous) + + def filter_record(self, record): + n_count = record.seq.upper().count('N') + if n_count > self.max_ambiguous: + raise FailedFilter(n_count) + else: + assert n_count <= self.max_ambiguous + return record + + +class PctAmbiguousFilter(BaseFilter): + """ + Filters records exceeding some minimum percent of ambiguous bases + """ + name = "Percent Ambiguous Bases" + + def __init__(self, pct_ambiguous): + super(PctAmbiguousFilter, self).__init__() + assert pct_ambiguous is not None + self.pct_ambiguous = pct_ambiguous + self.name = self.name + ' [{0}]'.format(pct_ambiguous) + + def filter_record(self, record): + n_count = record.seq.upper().count('N') + if n_count == 0: + return record + pct_ambig = n_count / float(len(record.seq)) + if pct_ambig > self.pct_ambiguous: + raise FailedFilter(pct_ambig) + else: + assert pct_ambig <= self.pct_ambiguous + return record + + +class MinLengthFilter(BaseFilter): + """ + Remove 
records which don't meet minimum length + """ + + def __init__(self, min_length): + super(MinLengthFilter, self).__init__() + assert min_length > 0 + self.min_length = min_length + self.name = "Minimum Length [{0}]".format(min_length) + + def filter_record(self, record): + """ + Filter record, dropping any that don't meet minimum length + """ + + if len(record) >= self.min_length: + return record + else: + raise FailedFilter(len(record)) + + +class MaxLengthFilter(BaseFilter): + """ + Truncate long sequences + """ + name = "Maximum Length" + + def __init__(self, max_length): + super(MaxLengthFilter, self).__init__() + self.max_length = max_length + self.name = self.name + " [{0}]".format(max_length) + + def filter_record(self, record): + """ + Filter record, truncating any over some maximum length + """ + if len(record) >= self.max_length: + return record[:self.max_length] + else: + return record + + +class PrimerBarcodeFilter(BaseFilter): + """ + Filter that checks that the sequence starts with a known barcode/primer + combination. + + Sequences that pass the filter have the barcode and primer removed. + + If an output_file is provided, (sequence_id, sample_id) tuples are written + to it. + """ + name = "Primer/Barcode" + + def __init__(self, + trie, + output_file=None, + trim=True, + quoting=csv.QUOTE_MINIMAL): + super(PrimerBarcodeFilter, self).__init__() + self.trim = True + self.trie = trie + + def filter_record(self, record): + m = trie_match(str(record.seq), self.trie) + if m: + if self.listener: + self.listener( + 'found_barcode', record, barcode=m, sample=self.trie[m]) + if self.trim: + record = record[len(m):] + return record + else: + raise FailedFilter() + + +def parse_barcode_file(fp, primer=None, header=False): + """ + Load label, barcode, primer records from a CSV file. 
+ + Returns a map from barcode -> label + + Any additional columns are ignored + """ + tr = trie.StringTrie() + reader = csv.reader(fp) + + if header: + # Skip header + next(reader) + + # Skip blank rows + records = (record for record in reader if record) + + for record in records: + specimen, barcode = record[:2] + if primer is not None: + pr = primer + else: + pr = record[2] + for sequence in all_unambiguous(barcode + pr): + if sequence in tr: + raise ValueError("Duplicate sample: {0}, {1} both have {2}", + specimen, tr[sequence], sequence) + logging.info('%s->%s', sequence, specimen) + tr[sequence] = specimen + + return tr + + +def action(arguments): + """ + Given parsed arguments, filter input files. + """ + if arguments.quality_window_mean_qual and not arguments.quality_window: + raise ValueError("--quality-window-mean-qual specified without " + "--quality-window") + + filters = [] + input_type = fileformat.from_handle(arguments.sequence_file) + output_type = fileformat.from_handle(arguments.output_file) + with arguments.sequence_file as fp: + if arguments.input_qual: + sequences = QualityIO.PairedFastaQualIterator( + fp, arguments.input_qual) + else: + sequences = SeqIO.parse(fp, input_type) + + listener = RecordEventListener() + if arguments.details_out: + rh = RecordReportHandler(arguments.details_out, arguments.argv, + arguments.details_comment) + rh.register_with(listener) + + # Track read sequences + sequences = listener.iterable_hook('read', sequences) + + # Add filters + if arguments.min_mean_quality and input_type == 'fastq': + qfilter = QualityScoreFilter(arguments.min_mean_quality) + filters.append(qfilter) + if arguments.max_length: + max_length_filter = MaxLengthFilter(arguments.max_length) + filters.append(max_length_filter) + if arguments.min_length: + min_length_filter = MinLengthFilter(arguments.min_length) + filters.append(min_length_filter) + if arguments.max_ambiguous is not None: + max_ambig_filter = 
MaxAmbiguousFilter(arguments.max_ambiguous) + filters.append(max_ambig_filter) + if arguments.pct_ambiguous is not None: + pct_ambig_filter = PctAmbiguousFilter(arguments.pct_ambiguous) + filters.append(pct_ambig_filter) + if arguments.ambiguous_action: + ambiguous_filter = AmbiguousBaseFilter(arguments.ambiguous_action) + filters.append(ambiguous_filter) + if arguments.quality_window: + min_qual = (arguments.quality_window_mean_qual or + arguments.min_mean_quality) + window_filter = WindowQualityScoreFilter(arguments.quality_window, + min_qual) + filters.insert(0, window_filter) + + if arguments.barcode_file: + with arguments.barcode_file: + tr = parse_barcode_file(arguments.barcode_file, + arguments.primer, + arguments.barcode_header) + f = PrimerBarcodeFilter(tr) + filters.append(f) + + if arguments.map_out: + barcode_writer = csv.writer( + arguments.map_out, + quoting=getattr(csv, arguments.quoting), + lineterminator='\n') + + def barcode_handler(record, sample, barcode=None): + barcode_writer.writerow((record.id, sample)) + + listener.register_handler('found_barcode', barcode_handler) + for f in filters: + f.listener = listener + sequences = f.filter_records(sequences) + + # Track sequences which passed all filters + sequences = listener.iterable_hook('write', sequences) + + with arguments.output_file: + SeqIO.write(sequences, arguments.output_file, output_type) + + rpt_rows = (f.report_dict() for f in filters) + + # Write report + with arguments.report_out as fp: + writer = csv.DictWriter( + fp, BaseFilter.report_fields, lineterminator='\n', delimiter='\t') + writer.writeheader() + writer.writerows(rpt_rows) diff --git a/seqmagick/test/__init__.py b/seqmagick/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seqmagick/test/integration/__init__.py b/seqmagick/test/integration/__init__.py new file mode 100644 index 0000000..3e95cae --- /dev/null +++ b/seqmagick/test/integration/__init__.py @@ -0,0 +1,13 @@ +""" +Integration tests, mostly 
to ensure that basic commands continue working after +commits. + +Tests invoke seqmagick.scripts.cli.main, and compare the produced output to the +expected. +""" +import os.path + +data_dir = os.path.join(os.path.dirname(__file__), "data") + +def data_path(*args): + return os.path.join(data_dir, *args) diff --git a/seqmagick/test/integration/data/__init__.py b/seqmagick/test/integration/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seqmagick/test/integration/data/input1.fasta b/seqmagick/test/integration/data/input1.fasta new file mode 100644 index 0000000..6bea337 --- /dev/null +++ b/seqmagick/test/integration/data/input1.fasta @@ -0,0 +1,7 @@ +>test1 test sequence 1 +ACGT +>test2 test sequence 2 +AAAA +>test3 sequence 3 +ACGA + diff --git a/seqmagick/test/integration/data/input2.fasta b/seqmagick/test/integration/data/input2.fasta new file mode 100644 index 0000000..32669db --- /dev/null +++ b/seqmagick/test/integration/data/input2.fasta @@ -0,0 +1,6 @@ +>test1 test sequence 1 +AC-GT +>test2 test sequence 2 +A-AAA +>test3 sequence 3 +A---A diff --git a/seqmagick/test/integration/data/input2.fasta.bz2 b/seqmagick/test/integration/data/input2.fasta.bz2 new file mode 100644 index 0000000..ad124ce Binary files /dev/null and b/seqmagick/test/integration/data/input2.fasta.bz2 differ diff --git a/seqmagick/test/integration/data/input2.fasta.gz b/seqmagick/test/integration/data/input2.fasta.gz new file mode 100644 index 0000000..29d95c9 Binary files /dev/null and b/seqmagick/test/integration/data/input2.fasta.gz differ diff --git a/seqmagick/test/integration/data/input3.fasta b/seqmagick/test/integration/data/input3.fasta new file mode 100644 index 0000000..caab0c4 --- /dev/null +++ b/seqmagick/test/integration/data/input3.fasta @@ -0,0 +1,6 @@ +>HXB2 +AA--CGT +>Sequence 1 +-AACTCT +>Sequence 3 +TAG-ATG diff --git a/seqmagick/test/integration/data/input4_ambig.fasta b/seqmagick/test/integration/data/input4_ambig.fasta new file mode 100644 
index 0000000..ec9c75d --- /dev/null +++ b/seqmagick/test/integration/data/input4_ambig.fasta @@ -0,0 +1,4 @@ +>sequence1 +GCYCCCAAG +>sequence2 +CCC-CC--- diff --git a/seqmagick/test/integration/data/input5.fasta b/seqmagick/test/integration/data/input5.fasta new file mode 100644 index 0000000..a9e91a5 --- /dev/null +++ b/seqmagick/test/integration/data/input5.fasta @@ -0,0 +1,14 @@ +>test1 +ACGT +>test2 +AAAA +>test3 +ACGA +>test4 +ACGT +>test5 +AAAA +>test6 +ACGA +>test7 +ACGA diff --git a/seqmagick/test/integration/data/input6.fasta b/seqmagick/test/integration/data/input6.fasta new file mode 100644 index 0000000..0c03005 --- /dev/null +++ b/seqmagick/test/integration/data/input6.fasta @@ -0,0 +1,6 @@ +>test1 test sequence 1 RNA +AC-GU +>test2 test sequence 2 +A-AAA +>test3 sequence 3 +A---A diff --git a/seqmagick/test/integration/data/output2.fasta b/seqmagick/test/integration/data/output2.fasta new file mode 100644 index 0000000..32669db --- /dev/null +++ b/seqmagick/test/integration/data/output2.fasta @@ -0,0 +1,6 @@ +>test1 test sequence 1 +AC-GT +>test2 test sequence 2 +A-AAA +>test3 sequence 3 +A---A diff --git a/seqmagick/test/integration/data/output2.nex b/seqmagick/test/integration/data/output2.nex new file mode 100644 index 0000000..068e63c --- /dev/null +++ b/seqmagick/test/integration/data/output2.nex @@ -0,0 +1,10 @@ +#NEXUS +begin data; +dimensions ntax=3 nchar=5; +format datatype=dna missing=? 
gap=-; +matrix +test1 AC-GT +test2 A-AAA +test3 A---A +; +end; diff --git a/seqmagick/test/integration/data/output2.phy b/seqmagick/test/integration/data/output2.phy new file mode 100644 index 0000000..b526b02 --- /dev/null +++ b/seqmagick/test/integration/data/output2.phy @@ -0,0 +1,4 @@ + 3 5 +test1 AC-GT +test2 A-AAA +test3 A---A diff --git a/seqmagick/test/integration/data/output2_ungap_cut.fasta b/seqmagick/test/integration/data/output2_ungap_cut.fasta new file mode 100644 index 0000000..9dfb6cb --- /dev/null +++ b/seqmagick/test/integration/data/output2_ungap_cut.fasta @@ -0,0 +1,4 @@ +>test2 test sequence 2 +AAA +>test3 sequence 3 +AA diff --git a/seqmagick/test/integration/data/output3.fasta b/seqmagick/test/integration/data/output3.fasta new file mode 100644 index 0000000..74dabde --- /dev/null +++ b/seqmagick/test/integration/data/output3.fasta @@ -0,0 +1,6 @@ +>HXB2 +A--C +>Sequence 1 +AACT +>Sequence 3 +AG-A diff --git a/seqmagick/test/integration/data/output3.nex b/seqmagick/test/integration/data/output3.nex new file mode 100644 index 0000000..dd72f02 --- /dev/null +++ b/seqmagick/test/integration/data/output3.nex @@ -0,0 +1,10 @@ +#NEXUS +begin data; +dimensions ntax=3 nchar=5; +format datatype=rna missing=? gap=-; +matrix +test1 AC-GU +test2 A-AAA +test3 A---A +; +end; diff --git a/seqmagick/test/integration/data/output4.fasta b/seqmagick/test/integration/data/output4.fasta new file mode 100644 index 0000000..7f3439e --- /dev/null +++ b/seqmagick/test/integration/data/output4.fasta @@ -0,0 +1,4 @@ +>sequence1 +APK +>sequence2 +PX- diff --git a/seqmagick/test/integration/data/output4.nex b/seqmagick/test/integration/data/output4.nex new file mode 100644 index 0000000..94aed55 --- /dev/null +++ b/seqmagick/test/integration/data/output4.nex @@ -0,0 +1,10 @@ +#NEXUS +begin data; +dimensions ntax=3 nchar=5; +format datatype=protein missing=? 
gap=-; +matrix +test1 AC-GT +test2 A-AAA +test3 A---A +; +end; diff --git a/seqmagick/test/integration/data/output5.fasta b/seqmagick/test/integration/data/output5.fasta new file mode 100644 index 0000000..d871505 --- /dev/null +++ b/seqmagick/test/integration/data/output5.fasta @@ -0,0 +1,4 @@ +>test5 +AAAA +>test3 +ACGA diff --git a/seqmagick/test/integration/test_convert.py b/seqmagick/test/integration/test_convert.py new file mode 100644 index 0000000..bc2f7a6 --- /dev/null +++ b/seqmagick/test/integration/test_convert.py @@ -0,0 +1,222 @@ +from io import StringIO +import os +import os.path +import logging +import random +import shlex +import shutil +import sys +import unittest +import tempfile + +from seqmagick.subcommands.common import FileType +from seqmagick.scripts import cli + + +d = os.path.dirname(__file__) +data_dir = os.path.join(d, "data") + + +def p(*args): + return os.path.join(data_dir, *args) + + +class CommandLineTestMixIn(object): + in_suffix = '' + out_suffix = '' + + def setUp(self): + self.input_file = tempfile.NamedTemporaryFile(suffix=self.in_suffix) + shutil.copy(self.input_path, self.input_file.name) + with tempfile.NamedTemporaryFile(suffix=self.out_suffix) as tf: + self.output_file = tf.name + + def test_run(self): + command = self.command.format( + input=self.input_file.name, output=self.output_file) + cli.main(shlex.split(command)) + + with FileType('rt')(self.output_file) as fp: + actual = fp.read() + with FileType('rt')(self.expected_path) as fp: + expected = fp.read() + self.assertEqual(expected, actual) + + def tearDown(self): + self.input_file.close() + if os.path.isfile(self.output_file): + os.remove(self.output_file) + + +class BasicConvertTestCase(CommandLineTestMixIn, unittest.TestCase): + in_suffix = '.fasta' + out_suffix = '.phy' + input_path = p('input2.fasta') + expected_path = p('output2.phy') + command = 'convert {input} {output}' + + +@unittest.skipIf(sys.version_info.major == 3, 'bzip2 not supported') +class 
BzipInputConvertTestCase(CommandLineTestMixIn, unittest.TestCase): + in_suffix = '.fasta.bz2' + out_suffix = '.phy' + input_path = p('input2.fasta.bz2') + expected_path = p('output2.phy') + command = 'convert {input} {output}' + + +@unittest.skipIf(sys.version_info.major == 3, 'bzip2 not supported') +class BzipOutputConvertTestCase(CommandLineTestMixIn, unittest.TestCase): + in_suffix = '.fasta' + out_suffix = '.phy.bz2' + input_path = p('input2.fasta') + expected_path = p('output2.phy') + command = 'convert {input} {output}' + + +class GzipInputConvertTestCase(CommandLineTestMixIn, unittest.TestCase): + in_suffix = '.fasta.gz' + out_suffix = '.phy' + input_path = p('input2.fasta.gz') + expected_path = p('output2.phy') + command = 'convert {input} {output}' + + +class GzipOutputConvertTestCase(CommandLineTestMixIn, unittest.TestCase): + in_suffix = '.fasta' + out_suffix = '.phy.gz' + input_path = p('input2.fasta') + expected_path = p('output2.phy') + command = 'convert {input} {output}' + + +class ConvertToNexusTestCase(CommandLineTestMixIn, unittest.TestCase): + in_suffix = '.fasta' + input_path = p('input2.fasta') + expected_path = p('output2.nex') + command = 'convert {input} {output} --output-format nexus --alphabet dna-ambiguous' + + +class ConvertToNexusRNATestCase(CommandLineTestMixIn, unittest.TestCase): + in_suffix = '.fasta' + input_path = p('input6.fasta') + expected_path = p('output3.nex') + command = 'convert {input} {output} --output-format nexus --alphabet rna' + + +class ConvertToNexusProteinTestCase(CommandLineTestMixIn, unittest.TestCase): + in_suffix = '.fasta' + input_path = p('input2.fasta') + expected_path = p('output4.nex') + command = 'convert {input} {output} --output-format nexus --alphabet protein' + + +class ConvertUngapCutTestCase(CommandLineTestMixIn, unittest.TestCase): + in_suffix = '.fasta' + out_suffix = '.fasta' + input_path = p('input2.fasta') + expected_path = p('output2_ungap_cut.fasta') + command = 'convert --ungap --cut 1:3 
--tail 2 {input} {output}' + + +class ConvertToStdOutTestCase(unittest.TestCase): + + def setUp(self): + self.out = StringIO() + self.err = StringIO() + self.actual_stdout = sys.stdout + self.actual_stderr = sys.stderr + sys.stdout = self.out + sys.stderr = self.err + + def tearDown(self): + sys.stdout = self.actual_stdout + sys.stderr = self.actual_stderr + + def test_convert(self): + in_path = p('input2.fasta') + cli.main(['convert', in_path, '-', '--output-format', 'fasta']) + actual = self.out.getvalue() + with open(in_path) as fp: + expected = fp.read() + self.assertEqual(expected, actual) + + def test_convert_noformat(self): + in_path = p('input2.fasta') + cli.main(['convert', in_path, '-']) + actual = self.out.getvalue() + with open(in_path) as fp: + expected = fp.read() + self.assertEqual(expected, actual) + +class TestCutRelative(CommandLineTestMixIn, unittest.TestCase): + in_suffix = '.fasta' + out_suffix = '.fasta' + input_path = p('input3.fasta') + expected_path = p('output3.fasta') + command = 'convert --cut 2:3 --relative-to HXB2 {input} {output}' + + def test_unknown_seq(self): + args = ['convert', '--cut', '2:3', '--relative-to', 'OTHER', + self.input_path, '-', '--output-format', 'fasta'] + self.assertRaises(ValueError, cli.main, args) + +class TestTranslateAmbiguous(CommandLineTestMixIn, unittest.TestCase): + in_suffix = '.fasta' + out_suffix = '.fasta' + input_path = p('input4_ambig.fasta') + expected_path = p('output4.fasta') + command = 'convert --translate dna2protein {input} {output}' + + def setUp(self): + super(TestTranslateAmbiguous, self).setUp() + self.orig_level = logging.getLogger(None).level + logging.getLogger(None).setLevel(logging.FATAL) + + def tearDown(self): + super(TestTranslateAmbiguous, self).tearDown() + logging.getLogger(None).setLevel(self.orig_level) + +class TestSample(CommandLineTestMixIn, unittest.TestCase): + in_suffix = '.fasta' + out_suffix = '.fasta' + input_path = p('input5.fasta') + expected_path = 
p('output5.fasta') + command = 'convert --sample 2 --sample-seed 0 {input} {output}' + + def setUp(self): + super(TestSample, self).setUp() + self.orig_level = logging.getLogger(None).level + logging.getLogger(None).setLevel(logging.FATAL) + random.seed(1) + + def tearDown(self): + super(TestSample, self).tearDown() + logging.getLogger(None).setLevel(self.orig_level) + +class TestStdin(TestTranslateAmbiguous, unittest.TestCase): + command = 'convert --translate dna2protein - {output}' + + def setUp(self): + super(TestStdin, self).setUp() + self.orig_stdin = sys.stdin + sys.stdin = open(p('input4_ambig.fasta'), 'r') + + def tearDown(self): + super(TestStdin, self).tearDown() + sys.stdin.close() + sys.stdin = self.orig_stdin + + +class TestConvertFromStdin(TestTranslateAmbiguous, unittest.TestCase): + command = 'convert --translate dna2protein - {output}' + + def setUp(self): + super(TestConvertFromStdin, self).setUp() + self.orig_stdin = sys.stdin + sys.stdin = open(p('input4_ambig.fasta'), 'r') + + def tearDown(self): + super(TestConvertFromStdin, self).tearDown() + sys.stdin.close() + sys.stdin = self.orig_stdin diff --git a/seqmagick/test/integration/test_extract_ids.py b/seqmagick/test/integration/test_extract_ids.py new file mode 100644 index 0000000..85e6905 --- /dev/null +++ b/seqmagick/test/integration/test_extract_ids.py @@ -0,0 +1,47 @@ +import sys +import unittest +import tempfile + +from seqmagick.scripts import cli + +from seqmagick.test.integration import data_path + + +class ExtractIdsMixin(object): + expected = """test1 +test2 +test3 +""" + expected_desc = """test1 test sequence 1 +test2 test sequence 2 +test3 sequence 3 +""" + + def setUp(self): + self.tempfile = tempfile.NamedTemporaryFile('r+t') + + def tearDown(self): + self.tempfile.close() + + def test_ids(self): + args = ['extract-ids', self.seq_file, '-o', self.tempfile.name] + cli.main(args) + self.assertEqual(self.expected, self.tempfile.read()) + + def test_descriptions(self): + args = 
['extract-ids', self.seq_file, '-o', self.tempfile.name, '-d'] + cli.main(args) + self.assertEqual(self.expected_desc, self.tempfile.read()) + + +class SimpleExtractIdsTestCase(ExtractIdsMixin, unittest.TestCase): + seq_file = data_path('input2.fasta') + + +@unittest.skipIf(sys.version_info.major == 3, 'bzip2 not supported') +class Bz2ExtractIdsTestCase(ExtractIdsMixin, unittest.TestCase): + seq_file = data_path('input2.fasta.bz2') + + +class GzipExtractIdsTestCase(ExtractIdsMixin, unittest.TestCase): + seq_file = data_path('input2.fasta.gz') diff --git a/seqmagick/test/integration/test_info.py b/seqmagick/test/integration/test_info.py new file mode 100644 index 0000000..a0f486f --- /dev/null +++ b/seqmagick/test/integration/test_info.py @@ -0,0 +1,48 @@ +import sys +import unittest +import tempfile + +from seqmagick.scripts import cli + +from seqmagick.test.integration import data_path + + +class InfoMixin(object): + expected = """name\talignment\tmin_len\tmax_len\tavg_len\tnum_seqs +{0}\tTRUE\t5\t5\t5.00\t3 +""" + threads = 1 + + def setUp(self): + self.infile = tempfile.NamedTemporaryFile() + self.tempfile = tempfile.NamedTemporaryFile('w+t') + + def tearDown(self): + self.infile.close() + self.tempfile.close() + + def test_info(self): + args = ['info', self.seq_file, + '--out-file', self.tempfile.name, + '--threads', str(self.threads)] + + cli.main(args) + self.assertEqual(self.expected.format(self.seq_file), self.tempfile.read()) + + +class SimpleInfoTestCase(InfoMixin, unittest.TestCase): + seq_file = data_path('input2.fasta') + + +class MultithreadedInfoTestCase(InfoMixin, unittest.TestCase): + seq_file = data_path('input2.fasta') + threads = 2 + + +class SimpleGzipInfoTestCase(InfoMixin, unittest.TestCase): + seq_file = data_path('input2.fasta.gz') + + +@unittest.skipIf(sys.version_info.major == 3, 'bzip2 not supported') +class SimpleBzip2InfoTestCase(InfoMixin, unittest.TestCase): + seq_file = data_path('input2.fasta.bz2') diff --git 
a/seqmagick/test/integration/test_mogrify.py b/seqmagick/test/integration/test_mogrify.py new file mode 100644 index 0000000..6026f32 --- /dev/null +++ b/seqmagick/test/integration/test_mogrify.py @@ -0,0 +1,57 @@ + +import os +import os.path +import shlex +import shutil +import tempfile + +from seqmagick.scripts import cli +from seqmagick.subcommands.common import FileType +from seqmagick.test.integration import data_path + +from . import test_convert + + +class CommandLineTestMixIn(object): + def setUp(self): + with tempfile.NamedTemporaryFile( + 'wt', suffix=os.path.basename(self.input_path), + delete=False) as tf: + + self.input_file = tf.name + shutil.copyfile(self.input_path, self.input_file) + + def test_run(self): + command = self.command.format(input=self.input_file) + try: + cli.main(shlex.split(command)) + except SystemExit as e: + self.fail(e) + + with FileType('rt')(self.input_file) as fp: + actual = fp.read() + with FileType('rt')(self.expected_path) as fp: + expected = fp.read() + self.assertEqual(expected, actual) + + def tearDown(self): + os.remove(self.input_file) + + +class MogrifyUngapCutTestCase( + CommandLineTestMixIn, test_convert.ConvertUngapCutTestCase): + command = 'mogrify --ungap --cut 1:3 --tail 2 {input}' + + +class MogrifyBzipInputTestCase( + CommandLineTestMixIn, test_convert.BzipInputConvertTestCase): + command = 'mogrify {input}' + expected_path = data_path('output2.fasta') + out_suffix = 'fasta.bz2' + + +class MogrifyGzipInputTestCase( + CommandLineTestMixIn, test_convert.GzipInputConvertTestCase): + command = 'mogrify {input}' + expected_path = data_path('output2.fasta') + out_suffix = 'fasta.gz' diff --git a/seqmagick/test/test_primer_trim.py b/seqmagick/test/test_primer_trim.py new file mode 100644 index 0000000..9434949 --- /dev/null +++ b/seqmagick/test/test_primer_trim.py @@ -0,0 +1,116 @@ +""" +Tests for primer trim +""" +import unittest + +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + +from 
seqmagick.subcommands import primer_trim + + +class PrimerAlignerTestCase(unittest.TestCase): + def setUp(self): + self.primer = 'AACTGCATTTGAATGG' + self.instance = primer_trim.PrimerAligner( + self.primer, match=5.0, gap_open=-10.0) + + def test_max_score(self): + self.assertEqual(len(self.primer) * 5.0, self.instance.max_score) + + def test_align_exact(self): + sequence = ('ACTCTGTGTCACTTTAAACTGCATTTGAATGGAAGAGTAATAGTAGCAATAACGGCA' + 'CTGATCAG') + hamming_distance, start, end = self.instance.align(sequence) + self.assertEqual(0, hamming_distance) + self.assertEqual(16, start) + self.assertEqual(31, end) + + def test_align_gap(self): + sequence = ('ACTCTGTGTCACTTTAAACTGCATTGAATGGAAGAGTAATAGTAGCAATAACGGCA' + 'CTGATCAG') + hamming_distance, start, end = self.instance.align(sequence) + expected_distance = 1 + self.assertEqual(expected_distance, hamming_distance) + self.assertEqual(16, start) + self.assertEqual(30, end) + + +class HammingDistanceTestCase(unittest.TestCase): + def test_unequal_length(self): + s1 = 'test' + s2 = 'te' + self.assertRaises(ValueError, primer_trim.hamming_distance, s1, s2) + + def test_no_difference(self): + s1 = s2 = 'test' + self.assertEqual(0, primer_trim.hamming_distance(s1, s2)) + + def test_all_different(self): + s1 = 'test' + s2 = 'ACGT' + self.assertEqual(4, primer_trim.hamming_distance(s1, s2)) + + def test_basic(self): + s1 = 'ACGT' + s2 = 'AGGT' + self.assertEqual(1, primer_trim.hamming_distance(s1, s2)) + + def test_ambiguous(self): + s1 = 'ACYT' + s2 = 'ACCT' + self.assertEqual(0, + primer_trim.hamming_distance( + s1, s2, primer_trim._iupac_ambiguous_equal)) + s2 = 'ACTT' + self.assertEqual(0, + primer_trim.hamming_distance( + s1, s2, primer_trim._iupac_ambiguous_equal)) + + +def _alignment_record(sequence): + return SeqRecord(Seq(sequence)) + + +class LocatePrimersTestCase(unittest.TestCase): + """ + Test for locate primers + """ + + def setUp(self): + self.sequences = [_alignment_record('--A--ACTGGACGTATTC-CCCC')] + + def 
test_basic(self): + forward = 'TGG' + reverse = 'TTC' + + forward_idx, reverse_idx = primer_trim.locate_primers( + self.sequences, forward, reverse, False, 1) + + self.assertEqual((7, 9), forward_idx) + self.assertEqual((15, 17), reverse_idx) + + def test_no_forward(self): + forward = 'GGGGGG' + reverse = 'TTC' + self.assertRaises(primer_trim.PrimerNotFound, + primer_trim.locate_primers, self.sequences, forward, + reverse, False, 1) + + def test_no_reverse(self): + forward = 'TGG' + reverse = 'GGGG' + self.assertRaises(primer_trim.PrimerNotFound, + primer_trim.locate_primers, self.sequences, forward, + reverse, False, 1) + + def test_bad_order(self): + """ + Should fail if reverse primer occurs before forward primer + """ + reverse = 'TGG' + forward = 'TTC' + + self.assertRaises(primer_trim.PrimerOrderError, + primer_trim.locate_primers, self.sequences, forward, + reverse, False, 1) diff --git a/seqmagick/test/test_subcommands_backtrans_align.py b/seqmagick/test/test_subcommands_backtrans_align.py new file mode 100644 index 0000000..aa2823a --- /dev/null +++ b/seqmagick/test/test_subcommands_backtrans_align.py @@ -0,0 +1,76 @@ +import unittest + +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Bio.Data import CodonTable + +from seqmagick.subcommands import backtrans_align + +class BatchTestCase(unittest.TestCase): + def test_no_input(self): + i = [] + b = backtrans_align.batch(i, 1) + self.assertRaises(StopIteration, next, b) + + def test_singletons(self): + i = list(range(3)) + b = backtrans_align.batch(i, 1) + self.assertEqual([[0], [1], [2]], list(b)) + + def test_doubles(self): + i = list(range(6)) + b = backtrans_align.batch(i, 2) + self.assertEqual([[0, 1], [2, 3], [4, 5]], list(b)) + + def test_partial(self): + i = list(range(5)) + b = backtrans_align.batch(i, 2) + self.assertEqual([[0, 1], [2, 3], [4]], list(b)) + + +class AlignmentMapperTestCase(unittest.TestCase): + def setUp(self): + self.instance = 
backtrans_align.AlignmentMapper(CodonTable.unambiguous_dna_by_name['Standard']) + + def test_validate_valid(self): + nucl = 'TTTAAG' + prot = 'FK' + + self.assertTrue(self.instance._validate_translation(prot, nucl)) + + def test_validate_invalid(self): + nucl = 'AAGTTT' + prot = 'KK' + self.assertRaisesRegex(ValueError, r'Codon TTT translates to F, not K', + self.instance._validate_translation, prot, nucl) + + def test_map_alignment(self): + nucl = [SeqRecord(Seq('AAGTTT'), id='1'), # KF + SeqRecord(Seq('AAGGTCTTC'), id='2'), # KVF + SeqRecord(Seq('GGGGTTTTT'), id='3')] # GVF + prot = [SeqRecord(Seq('-K-F'), id='1'), + SeqRecord(Seq('-KVF'), id='2'), + SeqRecord(Seq('G-VF'), id='3')] + + result = self.instance.map_all(prot, nucl) + result = [(s.id, str(s.seq)) for s in result] + self.assertEqual([('1', '---AAG---TTT'), + ('2', '---AAGGTCTTC'), + ('3', 'GGG---GTTTTT')], result) + + def test_map_alignment_insufficient_codons(self): + nucl = [SeqRecord(Seq('AAGTTT'), id='1'), # KF + SeqRecord(Seq('AAGGTC'), id='2')] # KV + prot = [SeqRecord(Seq('K-F'), id='1'), + SeqRecord(Seq('KVF'), id='2')] + + mapped = self.instance.map_all(prot, nucl) + self.assertRaises(ValueError, list, mapped) + + def test_map_alignment_excess_codons(self): + nucl = [SeqRecord(Seq('AAGTTT'), id='1'), # KF + SeqRecord(Seq('AAGGTCTTC'), id='2')] # KVF + prot = [SeqRecord(Seq('K-F'), id='1'), + SeqRecord(Seq('KV-'), id='2')] + mapped = self.instance.map_all(prot, nucl) + self.assertRaises(ValueError, list, mapped) diff --git a/seqmagick/test/test_subcommands_common.py b/seqmagick/test/test_subcommands_common.py new file mode 100644 index 0000000..ff4a635 --- /dev/null +++ b/seqmagick/test/test_subcommands_common.py @@ -0,0 +1,201 @@ +import argparse +import os +import os.path +import sys +import unittest +import tempfile + +from seqmagick.subcommands import common + +d = os.path.dirname(__file__) +data_dir = os.path.join(d, "integration", "data") + + +def p(*args): + return os.path.join(data_dir, 
*args) + + +class PartialAppendTestCase(unittest.TestCase): + + def setUp(self): + self.namespace = argparse.Namespace() + + def test_single_arg(self): + def head(records, n): + return records[:n] + + a_cls = common.partial_append_action(head, 'n') + a = a_cls([], 'functions') + + a(None, self.namespace, 2) + + f = self.namespace.functions[0] + self.assertEqual([0, 1], f(list(range(10)))) + + def test_no_arg(self): + def head(records): + return records[:2] + + a_cls = common.partial_append_action(head) + a = a_cls([], 'functions') + + a(None, self.namespace, None) + f = self.namespace.functions[0] + self.assertEqual([0, 1], f(list(range(10)))) + + def test_multi_arg(self): + def fake_slice(records, i, j): + return records[i:j] + + a_cls = common.partial_append_action(fake_slice, ['i', 'j']) + a = a_cls([], 'functions') + + a(None, self.namespace, [0, 2]) + f = self.namespace.functions[0] + self.assertEqual([0, 1], f(list(range(10)))) + +class PositiveValueTestCase(unittest.TestCase): + + def test_negative(self): + self.assertRaises(argparse.ArgumentTypeError, + common.positive_value(int), '-1') + + def test_positive(self): + self.assertEqual(1, common.positive_value(int)('1')) + + def test_zero(self): + self.assertEqual(0, common.positive_value(int)('0')) + +class CutRangeTestCase(unittest.TestCase): + def test_out_of_order(self): + self.assertRaises(argparse.ArgumentTypeError, + common.cut_range, '10:5') + + def test_start(self): + actual = common.cut_range('5:10') + self.assertEqual(4, actual.start) + self.assertEqual(10, actual.stop) + + def test_negative(self): + actual = common.cut_range('-500:') + self.assertEqual(-500, actual.start) + self.assertIsNone(actual.stop) + actual = common.cut_range('-500:-203') + self.assertEqual(-500, actual.start) + self.assertEqual(-203, actual.stop) + + def test_no_start(self): + actual = common.cut_range(':10') + self.assertEqual(None, actual.start) + self.assertEqual(10, actual.stop) + + def test_no_end(self): + actual = 
common.cut_range('5:') + self.assertEqual(4, actual.start) + +class SequenceSlicesTestCase(unittest.TestCase): + def test_single(self): + actual = common.sequence_slices(':10') + self.assertEqual([slice(None, 10)], actual) + + def test_multiple(self): + actual = common.sequence_slices('1:10,3:20') + self.assertEqual([slice(0, 10), slice(2, 20)], actual) + +class AtomicWriteTestCase(unittest.TestCase): + + initial_content = "Initial Content" + new_content = "New Content" + + def setUp(self): + with tempfile.NamedTemporaryFile('wt', delete=False) as tf: + tf.write(self.initial_content) + self.input_file = tf.name + + def test_exception_leaves_unchanged(self): + try: + with common.atomic_write(self.input_file) as tf: + raise IOError() + except IOError: + with open(self.input_file) as fp: + self.assertEqual(self.initial_content, fp.read()) + + # Ensure deleted + self.assertFalse(os.path.exists(tf.name)) + + def test_write(self): + with common.atomic_write(self.input_file) as fp: + self.assertNotEqual(self.input_file, fp.name) + fp.write(self.new_content) + + self.assertFalse(os.path.exists(fp.name)) + + with open(self.input_file) as fp: + self.assertEqual(self.new_content, fp.read()) + + def tearDown(self): + os.remove(self.input_file) + + +class ApplyUmaskTestCase(unittest.TestCase): + + def setUp(self): + # Set umask + self.orig_umask = common.get_umask() + + def tearDown(self): + os.umask(self.orig_umask) + + def test_provided_umask(self): + self.assertEqual('0o770', oct(common.apply_umask(0o777, 0o07))) + self.assertEqual('0o660', oct(common.apply_umask(0o666, 0o07))) + self.assertEqual('0o644', oct(common.apply_umask(0o666, 0o22))) + + def test_user_umask(self): + os.umask(0o07) + self.assertEqual('0o770', oct(common.apply_umask(0o777))) + self.assertEqual('0o660', oct(common.apply_umask(0o666))) + + +class FileTypeTestCase(unittest.TestCase): + + def setUp(self): + # used in methods test_read_*() + self.testfile = 'input2.fasta' + with open(p(self.testfile)) as 
f: + self.expected = f.read() + + def test_stdin(self): + self.assertIs(sys.stdin, common.FileType('r')('-')) + + def test_stdout(self): + self.assertIs(sys.stdout, common.FileType('w')('-')) + + def test_read(self): + with tempfile.NamedTemporaryFile('w+t') as tf: + tf.write('TEST') + tf.flush() + with common.FileType('r')(tf.name) as fp: + self.assertEqual(tf.name, fp.name) + self.assertEqual('TEST', fp.read()) + + def test_write(self): + with tempfile.NamedTemporaryFile('w+t') as tf: + with common.FileType('w')(tf.name) as fp: + fp.write('TEST') + fp.flush() + self.assertEqual(tf.name, fp.name) + self.assertEqual('TEST', tf.read()) + + def test_read_text(self): + with common.FileType('rt')(p(self.testfile)) as fp: + self.assertEqual(fp.read(), self.expected) + + def test_read_gz(self): + with common.FileType('rt')(p(self.testfile + '.gz')) as fp: + self.assertEqual(fp.read(), self.expected) + + @unittest.skipIf(sys.version_info.major == 3, 'bzip2 not supported') + def test_read_bz2(self): + with common.FileType('rt')(p(self.testfile + '.bz2')) as fp: + self.assertEqual(fp.read(), self.expected) diff --git a/seqmagick/test/test_subcommands_convert.py b/seqmagick/test/test_subcommands_convert.py new file mode 100644 index 0000000..2d53371 --- /dev/null +++ b/seqmagick/test/test_subcommands_convert.py @@ -0,0 +1,139 @@ +""" +Tests for seqmagick.subcommands.convert - mostly integration with +seqmagick.transform +""" +import argparse +import os +import tempfile +import unittest + +from seqmagick.subcommands import convert +from seqmagick import transform + +# Test populating the transform +class PopulateTransformsMixIn(object): + """ + Tests that transforms list is populated + """ + def setUp(self): + self.parser = convert.build_parser(argparse.ArgumentParser()) + with tempfile.NamedTemporaryFile(delete=False) as tf: + self.infile = tf.name + with tempfile.NamedTemporaryFile(delete=False) as tf: + self.outfile = tf.name + + def tearDown(self): + 
os.remove(self.infile) + os.remove(self.outfile) + + def test_parse(self): + arguments = [self.infile, self.outfile] + arguments.extend(self.arguments) + try: + parsed_arguments = self.parser.parse_args(arguments) + except SystemExit: + self.fail("Couldn't parse arguments") + functions = [f.func for f in parsed_arguments.transforms] + self.assertEqual(self.functions, functions) + self.close_all_files(parsed_arguments) + + def close_all_files(self, parsed_arguments): + for attr in dir(parsed_arguments): + arg = getattr(parsed_arguments, attr) + if hasattr(arg, 'close'): + arg.close() + + +class OrderRespectedTestCase(PopulateTransformsMixIn, unittest.TestCase): + """ + Ensure that order of arguments translates to order of functions to apply. + """ + arguments = ['--upper', '--translate', 'dna2protein', '--lower', + '--squeeze'] + functions = [transform.upper_sequences, transform.translate, + transform.lower_sequences, transform.squeeze] + +class SequenceModTransformsTestCase(PopulateTransformsMixIn, unittest.TestCase): + arguments = ['--dash-gap', + '--lower', + '--reverse', + '--reverse-complement', + '--transcribe', 'dna2rna', + '--translate', 'dna2protein', + '--ungap', + '--upper',] + functions = [transform.dashes_cleanup, + transform.lower_sequences, + transform.reverse_sequences, + transform.reverse_complement_sequences, + transform.transcribe, + transform.translate, + transform.ungap_sequences, + transform.upper_sequences] + +class SeqSelectTransformsTestCase(PopulateTransformsMixIn, unittest.TestCase): + + def setUp(self): + with tempfile.NamedTemporaryFile(delete=False) as tf: + self.exclude_from = tf.name + self.arguments = ['--deduplicate-taxa', + '--exclude-from-file', self.exclude_from, + '--include-from-file', self.exclude_from, + '--head', '10', + '--max-length', '50', + '--min-length', '50', + '--min-ungapped-length', '50', + '--pattern-include', 'pattern', + '--pattern-exclude', 'pattern', + '--prune-empty', + '--seq-pattern-include', 'pattern', + 
'--seq-pattern-exclude', 'pattern', + ] + self.functions = [transform.deduplicate_taxa, + transform.exclude_from_file, + transform.include_from_file, + transform.head, + transform.max_length_discard, + transform.min_length_discard, + transform.min_ungap_length_discard, + transform.name_include, + transform.name_exclude, + transform.prune_empty, + transform.seq_include, + transform.seq_exclude, + ] + super(SeqSelectTransformsTestCase, self).setUp() + + def tearDown(self): + super(SeqSelectTransformsTestCase, self).tearDown() + os.remove(self.exclude_from) + +class IdModificationTransformsTestCase(PopulateTransformsMixIn, unittest.TestCase): + arguments = ['--first-name', + '--name-suffix', 'suffix', + '--name-prefix', 'prefix', + '--pattern-replace', '.', 'N', + '--strip-range'] + functions = [ + transform.first_name_capture, + transform.name_append_suffix, + transform.name_insert_prefix, + transform.name_replace, + transform.strip_range] + + +class ArgumentTypeTestCase(PopulateTransformsMixIn, unittest.TestCase): + arguments = ['--cut', '1:5'] + functions = [transform.multi_cut_sequences] + + def test_argument_type(self): + arguments = [self.infile, self.outfile] + arguments.extend(self.arguments) + try: + parsed_arguments = self.parser.parse_args(arguments) + except SystemExit: + self.fail("Couldn't parse arguments") + keywords = [f.keywords for f in parsed_arguments.transforms] + self.assertEqual([{'slices': [slice(0, 5)]}], keywords) + + self.close_all_files(parsed_arguments) diff --git a/seqmagick/test/test_subcommands_quality_filter.py b/seqmagick/test/test_subcommands_quality_filter.py new file mode 100644 index 0000000..c0becb0 --- /dev/null +++ b/seqmagick/test/test_subcommands_quality_filter.py @@ -0,0 +1,279 @@ +from io import StringIO +import sys +import unittest + +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + +from seqmagick.subcommands import quality_filter + +IS_PYPY = hasattr(sys, 'pypy_version_info') + + +class 
QualityFilterTestCase(unittest.TestCase): + def setUp(self): + self.instance = quality_filter.QualityScoreFilter(25.0) + self.sequence = SeqRecord(Seq('ACGT')) + + def test_nowindow_fail(self): + self.sequence.letter_annotations['phred_quality'] = [25, 25, 24, 25] + instance = quality_filter.QualityScoreFilter() + self.assertRaises(quality_filter.FailedFilter, instance.filter_record, + self.sequence) + + def test_nowindow_pass(self): + self.sequence.letter_annotations['phred_quality'] = [25, 25, 25, 25] + instance = quality_filter.QualityScoreFilter() + result = instance.filter_record(self.sequence) + self.assertEqual(self.sequence.seq, result.seq) + + +class WindowQualityFilterTestCase(unittest.TestCase): + def setUp(self): + self.instance = quality_filter.WindowQualityScoreFilter(2, 25) + self.sequence = SeqRecord(Seq('ACGT')) + + def test_window_pass(self): + self.sequence.letter_annotations['phred_quality'] = [25, 25, 25, 25] + result = self.instance.filter_record(self.sequence) + self.assertEqual(str(self.sequence), str(result)) + + def test_window_truncate_noseq(self): + self.sequence.letter_annotations['phred_quality'] = [25, 24, 25, 25] + self.assertRaises(quality_filter.FailedFilter, + self.instance.filter_record, self.sequence) + + def test_window_truncate_mid(self): + self.sequence.letter_annotations['phred_quality'] = [25, 25, 23, 25] + result = self.instance.filter_record(self.sequence) + self.assertEqual(2, len(result)) + self.assertEqual('AC', str(result.seq)) + + +class AmbiguousBaseFilterTestCase(unittest.TestCase): + """ + Tests for ambiguous_base_filter + """ + + def setUp(self): + self.records = [ + SeqRecord(Seq('ACGT')), + SeqRecord(Seq('NNNN')), + SeqRecord(Seq('NACT')), + SeqRecord(Seq('ACGTN')), + SeqRecord(Seq('GGNTTACT')), + ] + + def test_drop(self): + """ + Test that the first record (with no Ns) does not get filtered + """ + instance = quality_filter.AmbiguousBaseFilter('drop') + actual = list(instance.filter_records(self.records)) + 
self.assertEqual(1, len(actual)) + self.assertEqual(1, instance.passed) + self.assertEqual(4, instance.failed) + self.assertEqual(self.records[0].seq, actual[0].seq) + + def test_truncate(self): + instance = quality_filter.AmbiguousBaseFilter('truncate') + actual = list(instance.filter_records(self.records)) + self.assertEqual(5, len(actual)) + self.assertEqual(0, instance.failed) + self.assertEqual(5, instance.passed) + self.assertEqual(['ACGT', '', '', 'ACGT', 'GG'], + [str(s.seq) for s in actual]) + + def test_invalid_action(self): + self.assertRaises(ValueError, quality_filter.AmbiguousBaseFilter, + 'other') + + +class MaxAmbiguousFilterTestCase(unittest.TestCase): + def setUp(self): + self.records = [ + SeqRecord(Seq('ACGT')), + SeqRecord(Seq('NNNN')), + SeqRecord(Seq('NACT')), + SeqRecord(Seq('ACNTN')), + SeqRecord(Seq('GGNTTNACT')), + ] + + def test_none(self): + instance = quality_filter.MaxAmbiguousFilter(0) + filtered = list(instance.filter_records(self.records)) + self.assertEqual(len(filtered), 1) + self.assertEqual(str(self.records[0].seq), str(filtered[0].seq)) + + def test_10(self): + instance = quality_filter.MaxAmbiguousFilter(10) + filtered = list(instance.filter_records(self.records)) + self.assertEqual(filtered, self.records) + + def test_1(self): + instance = quality_filter.MaxAmbiguousFilter(1) + filtered = list(instance.filter_records(self.records)) + self.assertEqual([self.records[i] for i in (0, 2)], filtered) + + +class PctAmbiguousFilterTestCase(unittest.TestCase): + def setUp(self): + self.records = [ + SeqRecord(Seq('ACGT')), + SeqRecord(Seq('NNNN')), + SeqRecord(Seq('NACT')), + SeqRecord(Seq('ACNTN')), + SeqRecord(Seq('GGNTTNACT')), + ] + + def test_none(self): + instance = quality_filter.PctAmbiguousFilter(0) + filtered = list(instance.filter_records(self.records)) + self.assertEqual(len(filtered), 1) + self.assertEqual(str(self.records[0].seq), str(filtered[0].seq)) + + def test_10(self): + instance = 
quality_filter.PctAmbiguousFilter(100) + filtered = list(instance.filter_records(self.records)) + self.assertEqual(filtered, self.records) + + def test_1(self): + instance = quality_filter.PctAmbiguousFilter(0.23) + filtered = list(instance.filter_records(self.records)) + print(filtered) + self.assertEqual([self.records[i] for i in (0, 4)], filtered) + + +class MinLengthFilterTestCase(unittest.TestCase): + def setUp(self): + self.sequences = [ + SeqRecord(Seq('ACGT')), + SeqRecord(Seq('ACTTT')), + ] + + def test_none_pass(self): + instance = quality_filter.MinLengthFilter(6) + actual = list(instance.filter_records(self.sequences)) + self.assertEqual([], actual) + + def test_all_pass(self): + instance = quality_filter.MinLengthFilter(4) + actual = list(instance.filter_records(self.sequences)) + self.assertEqual(self.sequences, actual) + + def test_some_pass(self): + instance = quality_filter.MinLengthFilter(5) + actual = list(instance.filter_records(self.sequences)) + self.assertEqual(self.sequences[1:], actual) + + +class MaxLengthFilterTestCase(unittest.TestCase): + def setUp(self): + self.sequences = [ + SeqRecord(Seq('ACGT')), + SeqRecord(Seq('ACTTT')), + ] + + def test_none_truncated(self): + instance = quality_filter.MaxLengthFilter(6) + actual = list(instance.filter_records(self.sequences)) + self.assertEqual(self.sequences, actual) + + def test_some_truncated(self): + instance = quality_filter.MaxLengthFilter(4) + actual = list(instance.filter_records(self.sequences)) + self.assertEqual(['ACGT', 'ACTT'], [str(s.seq) for s in actual]) + + def test_all_truncated(self): + instance = quality_filter.MaxLengthFilter(3) + actual = list(instance.filter_records(self.sequences)) + self.assertEqual(['ACG', 'ACT'], [str(s.seq) for s in actual]) + self.assertEqual([i.id for i in self.sequences], + [i.id for i in actual]) + + +class PrimerBarcodeFilterTestCase(unittest.TestCase): + def setUp(self): + self.sequences = [ + SeqRecord(Seq('ACCGTTACGAT'), 'seq1'), + 
SeqRecord(Seq('ACTGTTACGCT'), 'seq2'), + SeqRecord(Seq('AACTGTTA'), 'seq3'), # Homopolymer in bc + SeqRecord(Seq('ACCGTA'), 'seq4'), # Error in primer + ] + + barcode_str = """Sample1,ACC\nSample2,ACT\n""" + self.primer = 'GTTA' + self.trie = quality_filter.parse_barcode_file( + StringIO(barcode_str), primer=self.primer) + self.outfile = StringIO() + self.instance = quality_filter.PrimerBarcodeFilter( + self.trie, self.outfile) + + def test_filter_trim(self): + actual = list(self.instance.filter_records(self.sequences)) + self.assertEqual(2, len(actual)) + self.assertEqual(['CGAT', 'CGCT'], [str(s.seq) for s in actual]) + + +class RecordEventListenerTestCase(unittest.TestCase): + def test_send(self): + events = [] + record = object() + + def e_handler(record, n=1): + events.append(n) + + rle = quality_filter.RecordEventListener() + rle.register_handler('e', e_handler) + + rle('e', record) + self.assertEqual(events, [1]) + + rle('e', record, n=5) + self.assertEqual(events, [1, 5]) + + # Test another event + rle('other', record, n=5) + self.assertEqual(events, [1, 5]) + + +class BarcodePrimerTrieTestCase(unittest.TestCase): + def setUp(self): + self.barcode_str = """p1d1bc205,TACTAGCG,CATTGCCTATG +p1d1bc206,TACTCGTC,CATTGCCTATG +p1d1bc207,TACTGTGC,CATTGCCTATG +p1d1bc208,TACTGCAG,CATTGCCTATG +p1d1bc209,TACACAGC,CATTGCCTATG +p1d1bc210,TACAGTCG,CAYGGCTA +p1d1bc211,TACGTACG,CAYGGCTA +p1d1bc212,TACGTCTC,CAYGGCTA +p1d1bc213,TACGAGAC,CAYGGCTA""" + self.fp = StringIO(self.barcode_str) + + def test_primer_provided(self): + res = quality_filter.parse_barcode_file(self.fp, primer='CATTGCCTATG') + self.assertEqual(9, len(list(res.keys()))) + self.assertEqual('p1d1bc210', res['TACAGTCGCATTGCCTATG']) + self.assertEqual(None, quality_filter.trie_match('TACAGTCGCATTGCCTAT', res)) + self.assertEqual('TACAGTCGCATTGCCTATG', + quality_filter.trie_match('TACAGTCGCATTGCCTATGCTACCTA', res)) + + def test_primer_in_file(self): + res = quality_filter.parse_barcode_file(self.fp, primer=None) + 
self.assertEqual(13, len(list(res.keys()))) + + # Test ambiguities + self.assertEqual('p1d1bc212', res['TACGTCTCCATGGCTA']) + self.assertEqual('p1d1bc212', res['TACGTCTCCACGGCTA']) + self.assertIsNone(res.get('TACGTCTCCAAGGCTA')) + self.assertIsNone(res.get('TACGTCTCCAGGGCTA')) + + +class AllUnambiguousTestCase(unittest.TestCase): + def test_one_nt(self): + self.assertEqual(set('ACGT'), set(quality_filter.all_unambiguous('N'))) + + def test_four_nt(self): + self.assertEqual( + set(['ACCG', 'ACCA']), set(quality_filter.all_unambiguous('ACCR'))) + self.assertEqual(4**4, len(quality_filter.all_unambiguous('NNNN'))) diff --git a/seqmagick/test/test_transform.py b/seqmagick/test/test_transform.py new file mode 100644 index 0000000..3e8b0e1 --- /dev/null +++ b/seqmagick/test/test_transform.py @@ -0,0 +1,673 @@ +""" +Tests for seqmagick.transform +""" + +from io import StringIO +import functools +import logging +import unittest + +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.Seq import Seq + +from seqmagick import transform + +logging.basicConfig(level=logging.FATAL) + +def _alignment_record(sequence): + return SeqRecord(Seq(sequence)) + +def seqrecord(sequence_id, sequence_text, description=None): + """ + Quick shortcut to make a SeqRecord + """ + record = SeqRecord(Seq(sequence_text), id=sequence_id) + if description: + record.description = description + return record + +class PatternReplaceTestCase(unittest.TestCase): + + def create_sequences(self): + return [seqrecord('test_sequence_1', 'ACTGT'), + seqrecord('test_REPLACE_2', 'ACTGT'), + seqrecord('other_sequence', 'ATGAG'), ] + + def setUp(self): + super(PatternReplaceTestCase, self).setUp() + self.sequences = self.create_sequences() + + def tearDown(self): + super(PatternReplaceTestCase, self).tearDown() + + # from http://stackoverflow.com/questions/13923072/shortening-fasta-header-perl + def test_pattern_replace_anchored_transform_id(self): + sequences = 
[seqrecord('gi|351517969|ref|NW_003613580.1|', 'CAGTC', + description='gi|351517969|ref|NW_003613580.1| Cricetulus griseus unplaced genomic scaffold'), + seqrecord('gi|351517969|ref|NW_003613580.1|', 'CAGTC', + description='gi|351517969|ref|NW_003613580.1|'), + seqrecord('gi|351517969|ref|NW_003613580.1|', 'CAGTC')] + + # capture the identifier after three groups of pipe-separated characters + transformed = list(transform.name_replace(sequences, r'^(?:[^|]+\|){3}([^|]+)\|', r'\1')) + + self.assertEqual(str(sequences[0].seq), str(transformed[0].seq)) + self.assertEqual('NW_003613580.1', transformed[0].id) + self.assertEqual('NW_003613580.1 Cricetulus griseus unplaced genomic scaffold', transformed[0].description) + + self.assertEqual(str(sequences[1].seq), str(transformed[1].seq)) + self.assertEqual('NW_003613580.1', transformed[1].id) + self.assertEqual('NW_003613580.1', transformed[1].description) + + self.assertEqual(str(sequences[2].seq), str(transformed[2].seq)) + self.assertEqual('NW_003613580.1', transformed[2].id) + self.assertEqual('', transformed[2].description) + + # from http://stackoverflow.com/questions/15155728/modifying-fasta-headers-with-unix-command-line-tools + def test_pattern_replace_anchored_id_from_description(self): + sequences = [seqrecord('hg19_ct_UserTrack_3545_691', 'GATGG', + description='hg19_ct_UserTrack_3545_691 range=chr1:8121498-8121502 5\'pad=0 3\'pad=0 strand=+ repeatMasking=none')] + + transformed = next(transform.name_replace(sequences, r'^\S+ range=(\S+)', r'\1')) + + self.assertEqual(str(sequences[0].seq), str(transformed.seq)) + self.assertEqual('chr1:8121498-8121502', transformed.id) + self.assertEqual('chr1:8121498-8121502 5\'pad=0 3\'pad=0 strand=+ repeatMasking=none', transformed.description) + + # from http://stackoverflow.com/questions/23280240/how-to-rename-fasta-file-headers-using-sed + def test_pattern_replace_anchored_add_to_description(self): + sequences = [seqrecord('Bra000001', 'CTTAT', description='Bra000001')] 
+ + transformed = next(transform.name_replace(sequences, r'^(Bra\d+)$', r'\1 Brassica rapa')) + + self.assertEqual(str(sequences[0].seq), str(transformed.seq)) + self.assertEqual('Bra000001', transformed.id) + self.assertEqual('Bra000001 Brassica rapa', transformed.description) + + def test_pattern_replace_anchored_remove_from_description(self): + sequences = [seqrecord('Bra000001', 'CTTAT', description='Bra000001 Brassica rapa')] + + transformed = next(transform.name_replace(sequences, r' .*$', '')) + + self.assertEqual(str(sequences[0].seq), str(transformed.seq)) + self.assertEqual('Bra000001', transformed.id) + self.assertEqual('Bra000001', transformed.description) + + def test_pattern_replace_anchored_nomatch(self): + sequences = [seqrecord('hello', 'A', description='hello friend')] + transformed = next(transform.name_replace(sequences, r'^hello$', 'bye')) + + self.assertEqual(str(sequences[0].seq), str(transformed.seq)) + self.assertEqual('hello', transformed.id) + self.assertEqual('hello friend', transformed.description) + + def test_pattern_replace_anchored_match(self): + sequences = [seqrecord('hello', 'A', description='hello friend'), + seqrecord('hello', 'A')] + + transformed = list(transform.name_replace(sequences, r'^hello\b', 'bye')) + + self.assertEqual(str(sequences[0].seq), str(transformed[0].seq)) + self.assertEqual('bye', transformed[0].id) + self.assertEqual('bye friend', transformed[0].description) + + self.assertEqual(str(sequences[1].seq), str(transformed[1].seq)) + self.assertEqual('bye', transformed[1].id) + self.assertEqual('', transformed[1].description) + + def test_pattern_replace_none(self): + result = transform.name_replace(self.sequences, 'ZZZ', 'MATCH') + result = list(result) + self.assertEqual(self.sequences, result) + + def test_pattern_replace_static(self): + result = transform.name_replace(self.sequences, '_REPLACE_', + '_DONE_') + result = list(result) + expected = self.create_sequences() + expected[1].id = 'test_DONE_2' + 
self.assertEqual(self.sequences, result) + + def test_pattern_replace_case_insensitive(self): + """ + Substitutions are case insensitive + """ + result = transform.name_replace(self.sequences, '_replace_', + '_DONE_') + result = list(result) + expected = self.create_sequences() + expected[1].id = 'test_DONE_2' + self.assertEqual(self.sequences, result) + + def test_pattern_replace_group(self): + """ + Make sure capturing groups work + """ + result = transform.name_replace(self.sequences, '_(repl)ace_', + '_DONE-\\1_') + result = list(result) + expected = self.create_sequences() + expected[1].id = 'test_DONE-repl_2' + self.assertEqual(self.sequences, result) + +class SqueezeTestCase(unittest.TestCase): + + def setUp(self): + super(SqueezeTestCase, self).setUp() + + self.sequences = [ + seqrecord('sequence_1', 'AC-G--'), + seqrecord('sequence_2', '-C-GT-'), + seqrecord('sequence_3', '-T-AG-'), + ] + + def test_gap_proportion(self): + actual = transform.gap_proportion(self.sequences) + self.assertEqual([2./3, 0.0, 1.0, 0.0, 1./3, 1.0], actual) + + def test_basic_squeeze(self): + result = list(transform.squeeze(self.sequences, 1.0)) + + self.assertEqual([4, 4, 4], [len(i) for i in result]) + self.assertEqual([i.id for i in self.sequences], [i.id for i in result]) + expected = [ + seqrecord('sequence_1', 'ACG-'), + seqrecord('sequence_2', '-CGT'), + seqrecord('sequence_3', '-TAG'), + ] + + self.assertEqual([str(i.seq) for i in expected], + [str(i.seq) for i in result]) + + def test_squeeze_none(self): + """ + Threshold of 0.001 - nothing should be squeezed. 
class SeqPatternTestCase(unittest.TestCase):
    """
    Tests for transform.seq_include and transform.seq_exclude
    """

    def setUp(self):
        super(SeqPatternTestCase, self).setUp()

        self.sequences = [
            seqrecord('s1', 'AC-G--'),
            seqrecord('s2', '-C-GT-'),
            seqrecord('s3', '-T-AG-'),
        ]

        # (pattern, ids of the sequences the pattern is expected to match)
        self.tests = [('^$', set()),
                      ('.*', {'s1', 's2', 's3'}),
                      ('^AC', {'s1'}),
                      ('^ac', set()),
                      ('(?i)^ac', {'s1'})]

    def test_include(self):
        """
        seq_include keeps exactly the sequences matching the pattern
        """
        for regex, expected in self.tests:
            # subTest names the failing pattern instead of stopping at the
            # first mismatch without context.
            with self.subTest(regex=regex):
                result = {seq.id for seq in
                          transform.seq_include(self.sequences, regex)}
                self.assertEqual(expected, result)

    def test_exclude(self):
        """
        seq_exclude keeps exactly the sequences seq_include would drop
        """
        all_ids = {seq.id for seq in self.sequences}
        for regex, expected_include in self.tests:
            with self.subTest(regex=regex):
                result = {seq.id for seq in
                          transform.seq_exclude(self.sequences, regex)}
                self.assertEqual(all_ids - expected_include, result)
+ """ + for h in range(len(self.sequences) + 1): + result = list(transform.head(self.sequences, str(h))) + self.assertEqual(h, len(result)) + self.assertEqual([s.id for s in self.sequences[:h]], + [r.id for r in result]) + self.assertEqual([str(s.seq) for s in self.sequences[:h]], + [str(r.seq) for r in result]) + + def test_minus_zero(self): + """ + Test that -0 returns all sequences + """ + result = list(transform.head(self.sequences, '-0')) + self.assertEqual([s.id for s in self.sequences], + [r.id for r in result]) + self.assertEqual([str(s.seq) for s in self.sequences], + [str(r.seq) for r in result]) + + def test_minus_values(self): + """ + Try specifying some minus values. + """ + for h in range(1, len(self.sequences) + 1): + result = list(transform.head(self.sequences, str(-h))) + self.assertEqual(h, len(self.sequences) - len(result)) + self.assertEqual([s.id for s in self.sequences[:-h]], + [r.id for r in result]) + self.assertEqual([str(s.seq) for s in self.sequences[:-h]], + [str(r.seq) for r in result]) + +class TailTestCase(unittest.TestCase): + def setUp(self): + self.records = [ + seqrecord('sequence_1', 'AC-G--'), + seqrecord('sequence_2', '-C-GT-'), + seqrecord('sequence_3', '-T-AG-'), + ] + + def _do_test(self, size): + actual = list(transform.tail(self.records, str(size))) + expected = self.records[-size:] + self.assertEqual([e.id for e in expected], [a.id for a in actual]) + self.assertEqual([str(e.seq) for e in expected], [str(a.seq) for a in actual]) + + def test_tail_1(self): + self._do_test(1) + + def test_tail_2(self): + self._do_test(2) + + def test_tail_3(self): + self._do_test(3) + + def test_plus_zero(self): + """ + Test that +0 returns all sequences + """ + result = list(transform.tail(self.records, '+0')) + self.assertEqual([s.id for s in self.records], + [r.id for r in result]) + self.assertEqual([str(s.seq) for s in self.records], + [str(r.seq) for r in result]) + + def test_plus_values(self): + """ + Try specifying some plus 
values. + """ + for h in range(1, len(self.records) + 1): + result = list(transform.tail(self.records, '+{}'.format(h))) + self.assertEqual(len(self.records) + 1 - h, len(result)) + self.assertEqual([s.id for s in self.records[h-1:]], + [r.id for r in result]) + self.assertEqual([str(s.seq) for s in self.records[h-1:]], + [str(r.seq) for r in result]) + +class IsolateRegionTestCase(unittest.TestCase): + + def setUp(self): + self.sequences = [_alignment_record('--A--ACTGGACGTATTC-CCCC'), + _alignment_record('--AGCACTGGA---ATTC-CCCC')] + + def test_no_isolation(self): + result = list(transform.isolate_region(self.sequences, 0, + len(self.sequences[0]))) + + self.assertEqual(self.sequences, result) + + def test_single_loc(self): + start = 2 + end = 3 + result = list(transform.isolate_region(self.sequences, start, end)) + for seq in result: + self.assertEqual('--A--------------------', str(seq.seq)) + + def test_middle(self): + expected = ['--A--ACTGGA------------', '--AGCACTGGA------------'] + start = 1 + end = 11 + + actual = list(transform.isolate_region(self.sequences, start, end)) + actual = [str(s.seq) for s in actual] + self.assertEqual(expected, actual) + + def test_invalid(self): + self.assertRaises(ValueError, transform.isolate_region( + self.sequences, 5, 5).__next__) + self.assertRaises(ValueError, transform.isolate_region( + self.sequences, 10, 5).__next__) + +class MinUngapLengthTestCase(unittest.TestCase): + + def setUp(self): + self.sequences = [_alignment_record('--AAC--'), + _alignment_record('AAAA...'), + _alignment_record('-------'), + _alignment_record('ACGRAGT')] + + def test_none_pass(self): + result = list(transform.min_ungap_length_discard(self.sequences, 8)) + self.assertEqual([], result) + + def test_all_pass(self): + result = list(transform.min_ungap_length_discard(self.sequences, 0)) + self.assertEqual(self.sequences, result) + + def test_partial(self): + result = transform.min_ungap_length_discard(self.sequences, 4) + self.assertEqual( + 
[self.sequences[1].seq, self.sequences[3].seq], + [seq.seq for seq in result]) + + +class IncludeExcludeMixIn(object): + + def setUp(self): + ids = """sequenceid1 +sequenceid2 +sequenceid4 +""" + self.handle = StringIO(ids) + + self.sequences = [seqrecord("sequenceid1", "AAA"), + seqrecord("sequenceid2", "BBB"), + seqrecord("sequenceid3", "CCC"), + seqrecord("sequenceid4", "DDD", + description='sequence id 4'), + seqrecord("test", "EEE", + description='test sequence'), ] + + +class IncludeFromFileTestCase(IncludeExcludeMixIn, unittest.TestCase): + + def test_filter(self): + expected = [self.sequences[0], self.sequences[1], self.sequences[3]] + actual = list(transform.include_from_file(self.sequences, self.handle)) + self.assertEqual(3, len(actual)) + self.assertEqual(expected, actual) + +class ExcludeFromFileTestCase(IncludeExcludeMixIn, unittest.TestCase): + + def test_filter(self): + expected = [self.sequences[2], self.sequences[4]] + actual = list(transform.exclude_from_file(self.sequences, self.handle)) + self.assertEqual(2, len(actual)) + self.assertEqual(expected, actual) + +class NameIncludeTestCase(IncludeExcludeMixIn, unittest.TestCase): + + def test_filter_id(self): + expected = self.sequences[:2] + actual = list(transform.name_include(self.sequences, r'sequenceid[12]')) + self.assertEqual(2, len(actual)) + self.assertEqual(expected, actual) + + def test_filter_description(self): + expected = self.sequences[3:] + actual = list(transform.name_include(self.sequences, r'sequence id 4|test seq')) + self.assertEqual(2, len(actual)) + self.assertEqual(expected, actual) + +class NameExcludeTestCase(IncludeExcludeMixIn, unittest.TestCase): + + def test_filter_id(self): + expected = self.sequences[2:] + actual = list(transform.name_exclude(self.sequences, r'sequenceid[12]')) + self.assertEqual(3, len(actual)) + self.assertEqual(expected, actual) + + def test_filter_description(self): + expected = self.sequences[:3] + actual = 
list(transform.name_exclude(self.sequences, r'sequence id 4|test seq')) + self.assertEqual(expected, actual) + +class CutTestCase(unittest.TestCase): + + def setUp(self): + self.sequences = [SeqRecord(Seq("ABC"), id="sequenceid1"), + SeqRecord(Seq("BCD"), id="sequenceid2"), + SeqRecord(Seq("DEF"), id="sequence id 4"), + SeqRecord(Seq("EFG"), id="test sequence"), ] + + def test_no_sequences(self): + actual = list(transform._cut_sequences(self.sequences, slice(0, 0))) + for sequence in actual: + self.assertEqual(0, len(sequence)) + + def test_full_sequence(self): + actual = list(transform._cut_sequences(self.sequences, slice(0, 3))) + self.assertEqual(['ABC', 'BCD', 'DEF', 'EFG'], [str(s.seq) for s in + actual]) + + def test_cut_sequences(self): + actual = list(transform._cut_sequences(self.sequences, slice(0, 2))) + self.assertEqual(['AB', 'BC', 'DE', 'EF'], [str(s.seq) for s in + actual]) + actual = list(transform._cut_sequences(self.sequences, slice(1, None))) + self.assertEqual(['BC', 'CD', 'EF', 'FG'], [str(s.seq) for s in + actual]) + +class CodonWarningTableTestCase(unittest.TestCase): + + def warn(self, *args, **kwargs): + self.warnings.append((args, kwargs)) + + def setUp(self): + self.warnings = [] + self.warning_dict = transform.CodonWarningTable({'UUU': 'F'}) + self.old_warn = transform.logging.warning + transform.logging.warning = self.warn + + def tearDown(self): + transform.logging.warning = self.old_warn + + def test_nowarn(self): + actual = self.warning_dict['UUU'] + self.assertEqual('F', actual) + self.assertEqual([], self.warnings) + + def test_warn(self): + codon = 'UU-' + actual = self.warning_dict[codon] + self.assertEqual('X', actual) + self.assertEqual([(("Unknown Codon: %s", codon), {})], self.warnings) + +class TranslateTestCase(unittest.TestCase): + + def test_dna_protein_nogap(self): + sequences = [seqrecord('A', 'TTTTTATAA')] + expected = ['FL*'] + actual = transform.translate(sequences, 'dna2protein') + self.assertEqual(expected, 
[str(i.seq) for i in actual]) + + def test_dna_protein_nogap_stop(self): + sequences = [seqrecord('A', 'TTTTTATAA')] + expected = ['FL'] + actual = transform.translate(sequences, 'dna2proteinstop') + self.assertEqual(expected, [str(i.seq) for i in actual]) + + def test_dna_protein_gap(self): + sequences = [seqrecord('A', 'TTTTT-TAA')] + expected = ['FX*'] + actual = transform.translate(sequences, 'dna2protein') + self.assertEqual(expected, [str(i.seq) for i in actual]) + + def test_dna_protein_gap_stop(self): + sequences = [seqrecord('A', '---TTATAA')] + expected = ['-L'] + actual = transform.translate(sequences, 'dna2proteinstop') + self.assertEqual(expected, [str(i.seq) for i in actual]) + +class UngapSequencesTestCase(unittest.TestCase): + + def test_dot_gap(self): + sequences = [SeqRecord(Seq("AAA"), id="s1"), + SeqRecord(Seq("A.G"), id="s2"), + SeqRecord(Seq(".A."), id="s3"),] + + ungapped = list(transform.ungap_sequences(sequences)) + self.assertEqual(["AAA", "AG", "A"], [str(s.seq) for s in ungapped]) + + def test_dash_gap(self): + sequences = [SeqRecord(Seq("AAA"), id="s1"), + SeqRecord(Seq("A-G"), id="s2"), + SeqRecord(Seq("-A-"), id="s3"),] + + ungapped = list(transform.ungap_sequences(sequences)) + self.assertEqual(["AAA", "AG", "A"], [str(s.seq) for s in ungapped]) + +# Name Modification functions +class IdModifyMixin(object): + """ + Mixin to ease testing name prefix and suffix + """ + + def setUp(self): + self.input_fp = StringIO(self.initial_fasta) + self.output_fp = StringIO() + + def test_modify(self): + records = SeqIO.parse(self.input_fp, 'fasta') + records = self.__class__.modify_fn(records) + SeqIO.write(records, self.output_fp, 'fasta') + self.assertEqual(self.target_fasta, self.output_fp.getvalue().strip()) + +class NamePrefixTestCase(IdModifyMixin, unittest.TestCase): + initial_fasta = """>seq1 +ACGT +>gi|260674|gb|S52561.1| {long terminal repeat} [human immunodeficiency virus type] +ACGT""" + target_fasta = """>pre.seq1 +ACGT 
+>pre.gi|260674|gb|S52561.1| {long terminal repeat} [human immunodeficiency virus type] +ACGT""" + modify_fn = functools.partial(transform.name_insert_prefix, prefix="pre.") + +class NameSuffixTestCase(IdModifyMixin, unittest.TestCase): + initial_fasta = """>seq1 +ACGT +>gi|260674|gb|S52561.1| {long terminal repeat} [human immunodeficiency virus type] +ACGT""" + target_fasta = """>seq1.post +ACGT +>gi|260674|gb|S52561.1|.post {long terminal repeat} [human immunodeficiency virus type] +ACGT""" + modify_fn = functools.partial(transform.name_append_suffix, suffix=".post") + + +class MultiCutTestCase(unittest.TestCase): + def setUp(self): + self.inputs = [seqrecord("Sequence 1", "ACGT--TCAGA")] + + def test_multicut(self): + actual = list(transform.multi_cut_sequences(self.inputs, + [slice(None, 2), slice(8, None)])) + self.assertEqual(['ACAGA'], [str(s.seq) for s in actual]) + +class MultiMaskSequences(unittest.TestCase): + + def setUp(self): + self.sequences = [SeqRecord(Seq("AAA"), id="sequenceid1"), + SeqRecord(Seq("BBB"), id="sequenceid2"), + SeqRecord(Seq("DDDD"), id="sequence id 4"), + SeqRecord(Seq("EEE"), id="test sequence"), ] + + def test_mask_whole(self): + masks = [slice(0, 200)] + actual = list(transform.multi_mask_sequences(self.sequences, masks)) + self.assertEqual(len(self.sequences), len(actual)) + for e, a in zip(self.sequences, actual): + self.assertEqual(e.id, a.id) + self.assertEqual('-'*len(e), str(a.seq)) + + def test_mask(self): + masks = [slice(1, 2)] + actual = list(transform.multi_mask_sequences(self.sequences, masks)) + self.assertEqual(len(self.sequences), len(actual)) + self.assertEqual(['A-A', 'B-B', 'D-DD', 'E-E'], + [str(a.seq) for a in actual]) + +class RecordBufferTestCase(unittest.TestCase): + def setUp(self): + self.sequences = [SeqRecord(Seq("AAA"), id="s1"), + SeqRecord(Seq("A-G"), id="s2"), + SeqRecord(Seq("-A-"), id="s3"),] + self.seq_iter = iter(self.sequences) + + def _compare(self, records): + 
self.assertEqual(len(self.sequences), len(records)) + + for e, a in zip(self.sequences, records): + self.assertEqual(e.id, a.id) + self.assertEqual(e.description, a.description) + self.assertEqual(str(e.seq), str(a.seq)) + + def test_single_pass(self): + with transform._record_buffer(self.seq_iter) as iter_f: + records = list(iter_f()) + self._compare(records) + + def test_multi_pass(self): + with transform._record_buffer(self.seq_iter) as iter_f: + records = list(iter_f()) + self._compare(records) + + records = list(iter_f()) + self._compare(records) + +class DropColumnsTestCase(unittest.TestCase): + def setUp(self): + self.sequences = [SeqRecord(Seq("AAA"), id="s1"), + SeqRecord(Seq("A-G"), id="s2"), + SeqRecord(Seq("-A-"), id="s3"),] + + def test_basic(self): + r = list(transform.drop_columns(self.sequences, [slice(1, None)])) + self.assertEqual([i.id for i in self.sequences], + [i.id for i in r]) + self.assertEqual(['A', 'A', '-'], [str(i.seq) for i in r]) + + def test_multi(self): + r = list(transform.drop_columns(self.sequences, [slice(0, 1), slice(2, None)])) + self.assertEqual([i.id for i in self.sequences], + [i.id for i in r]) + self.assertEqual(['A', '-', 'A'], [str(i.seq) for i in r]) + +class DashesCleanupTestCase(unittest.TestCase): + def setUp(self): + self.sequences = [SeqRecord(Seq("A~-.?~GT"), id="s1"), + SeqRecord(Seq("A-GGGG?-"), id="s2"), + SeqRecord(Seq("-A-:ACA-"), id="s3"), + SeqRecord(Seq("ACTGGTCA"), id="s4"),] + + def test_basic(self): + actual = list(transform.dashes_cleanup(self.sequences)) + actual = [(i.id, str(i.seq)) for i in actual] + self.assertEqual( + [('s1', 'A-----GT'), + ('s2', 'A-GGGG--'), + ('s3', '-A--ACA-'), + ('s4', 'ACTGGTCA')], actual) diff --git a/seqmagick/transform.py b/seqmagick/transform.py new file mode 100644 index 0000000..3150d1d --- /dev/null +++ b/seqmagick/transform.py @@ -0,0 +1,810 @@ +""" +Functions to transform / filter sequences +""" +import collections +import contextlib +import pickle as pickle 
@contextlib.contextmanager
def _record_buffer(records, buffer_size=DEFAULT_BUFFER_SIZE):
    """
    Spool ``records`` so that they can be iterated more than once.

    All records are pickled into a ``tempfile.SpooledTemporaryFile`` created
    with ``buffer_size`` as its first argument. The context manager yields a
    zero-argument function; each call rewinds the spool and returns a fresh
    iterator over the unpickled records. The spool is closed when the
    ``with`` block exits.
    """
    with tempfile.SpooledTemporaryFile(buffer_size, mode='wb+') as spool:
        # A single Pickler writes every record, one dump per record.
        dump = pickle.Pickler(spool).dump
        for item in records:
            dump(item)

        def replay():
            spool.seek(0)
            # _file is used below because it implements the necessary methods
            # for pickle.Unpickler(), namely 'readinto' which is newly
            # required in 3.8. See
            # https://docs.python.org/3/library/tempfile.html#tempfile.SpooledTemporaryFile
            # for details on the _file attribute of
            # tempfile.SpooledTemporaryFile.
            load = pickle.Unpickler(spool._file).load
            try:
                while True:
                    yield load()
            except EOFError:
                # End of the spool: every record has been replayed.
                return

        yield replay
def deduplicate_taxa(records):
    """
    Remove any duplicate records with identical IDs, keep the first
    instance seen and discard additional occurrences.

    An ID containing ``|`` is keyed on the integer before the first ``|``
    when that prefix parses as an integer; otherwise (and for IDs without
    ``|``) the whole ID is the key.
    """
    logging.info('Applying _deduplicate_taxa generator: '
                 'removing any duplicate records with identical IDs.')
    taxa = set()
    for record in records:
        # Default to full ID, split if | is found.
        taxid = record.id
        if '|' in record.id:
            try:
                taxid = int(record.id.split("|")[0])
            except ValueError:
                # If we couldn't parse an integer from the ID, just fall back
                # on the ID
                logging.warning("Unable to parse integer taxid from %s",
                                taxid)
        if taxid in taxa:
            continue
        taxa.add(taxid)
        yield record
def multi_cut_sequences(records, slices):
    """
    Cut every record down to the concatenation of the given slices.

    ``slices`` is a collection of ``slice`` objects applied in order; each
    record is sliced once per slice and the pieces are joined with ``+``.
    With a single slice the one piece is yielded untouched, which is exactly
    what slicing the record directly gives (per-letter annotations included).
    With no slices nothing is selected, so an empty cut of each record is
    yielded.
    """
    slices = list(slices)
    for record in records:
        if not slices:
            # Nothing selected: reduce() has no pieces to join, so yield the
            # empty cut explicitly.
            yield record[:0]
            continue
        pieces = (record[s] for s in slices)
        # SeqRecords support addition as concatenation
        yield reduce(lambda x, y: x + y, pieces)
def cut_sequences_relative(records, slices, record_id):
    """
    Cut all records to ``slices``, where the slice positions count the
    non-gap positions of the record whose id is ``record_id``.

    Raises ValueError when no record has that id.
    """
    with _record_buffer(records) as replay:
        # First pass: locate the reference record.
        reference = next(
            (rec for rec in replay() if rec.id == record_id), None)
        if reference is None:
            raise ValueError("Record with id {0} not found.".format(record_id))

        # Second pass: cut every record with the slices translated by
        # _update_slices for the reference record.
        alignment_slices = _update_slices(reference, slices)
        yield from multi_cut_sequences(replay(), alignment_slices)
def prune_empty(records):
    """
    Drop every record whose sequence consists solely of '-' characters.

    A zero-length sequence contains no other character either, so it is
    dropped as well.
    """
    for record in records:
        # Stripping '-' from both ends leaves something only when the
        # sequence holds at least one character other than '-'.
        if str(record.seq).strip('-'):
            yield record
def reverse_sequences(records):
    """
    Yield, for each record, a new record whose sites are in reverse order.

    The id, name and description are carried over unchanged; annotations are
    transferred by ``_reverse_annotations``.
    """
    logging.info('Applying _reverse_sequences generator: '
                 'reversing the order of sites in sequences.')
    for original in records:
        flipped = SeqRecord(
            original.seq[::-1],
            id=original.id,
            name=original.name,
            description=original.description,
        )
        _reverse_annotations(original, flipped)
        yield flipped
+ """ + logging.info('Applying _ungap_sequences generator: removing all gap characters') + for record in records: + yield ungap_all(record, gap_chars) + + +def ungap_all(record, gap_chars=GAP_TABLE): + + record = SeqRecord( + Seq(str(record.seq).translate(gap_chars)), + id=record.id, description=record.description + ) + return record + + +def _update_id(record, new_id): + """ + Update a record id to new_id, also modifying the ID in record.description + """ + old_id = record.id + record.id = new_id + + # At least for FASTA, record ID starts the description + record.description = re.sub('^' + re.escape(old_id), new_id, record.description) + return record + + +def name_append_suffix(records, suffix): + """ + Given a set of sequences, append a suffix for each sequence's name. + """ + logging.info('Applying _name_append_suffix generator: ' + 'Appending suffix ' + suffix + ' to all ' + 'sequence IDs.') + for record in records: + new_id = record.id + suffix + _update_id(record, new_id) + yield record + + +def name_insert_prefix(records, prefix): + """ + Given a set of sequences, insert a prefix for each sequence's name. + """ + logging.info('Applying _name_insert_prefix generator: ' + 'Inserting prefix ' + prefix + ' for all ' + 'sequence IDs.') + for record in records: + new_id = prefix + record.id + _update_id(record, new_id) + yield record + + + +def name_include(records, filter_regex): + """ + Given a set of sequences, filter out any sequences with names + that do not match the specified regular expression. Ignore case. + """ + logging.info('Applying _name_include generator: ' + 'including only IDs matching ' + filter_regex + + ' in results.') + regex = re.compile(filter_regex) + for record in records: + if regex.search(record.id) or regex.search(record.description): + yield record + + +def name_exclude(records, filter_regex): + """ + Given a set of sequences, filter out any sequences with names + that match the specified regular expression. Ignore case. 
+ """ + logging.info('Applying _name_exclude generator: ' + 'excluding IDs matching ' + filter_regex + ' in results.') + regex = re.compile(filter_regex) + for record in records: + if not regex.search(record.id) and not regex.search(record.description): + yield record + + +def name_replace(records, search_regex, replace_pattern): + """ + Given a set of sequences, replace all occurrences of search_regex + with replace_pattern. Ignore case. + + If the ID and the first word of the description match, assume the + description is FASTA-like and apply the transform to the entire + description, then set the ID from the first word. If the ID and + the first word of the description do not match, apply the transform + to each individually. + """ + regex = re.compile(search_regex) + for record in records: + maybe_id = record.description.split(None, 1)[0] + if maybe_id == record.id: + record.description = regex.sub(replace_pattern, record.description) + record.id = record.description.split(None, 1)[0] + else: + record.id = regex.sub(replace_pattern, record.id) + record.description = regex.sub(replace_pattern, record.description) + yield record + + +def seq_include(records, filter_regex): + """ + Filter any sequences who's seq does not match the filter. Ignore case. + """ + regex = re.compile(filter_regex) + for record in records: + if regex.search(str(record.seq)): + yield record + + +def seq_exclude(records, filter_regex): + """ + Filter any sequences whose seq matches the filter. Ignore case. + """ + regex = re.compile(filter_regex) + for record in records: + if not regex.search(str(record.seq)): + yield record + + +def sample(records, k, random_seed=None): + """Choose a length-``k`` subset of ``records``, retaining the input + order. If k > len(records), all are returned. 
If an integer + ``random_seed`` is provided, sets ``random.seed()`` + + """ + + if random_seed is not None: + random.seed(random_seed) + + result = [] + for i, record in enumerate(records): + if len(result) < k: + result.append(record) + else: + r = random.randint(0, i) + if r < k: + result[r] = record + return result + + +def head(records, head): + """ + Limit results to the top N records. + With the leading `-', print all but the last N records. + """ + logging.info('Applying _head generator: ' + 'limiting results to top ' + head + ' records.') + + if head == '-0': + for record in records: + yield record + elif '-' in head: + with _record_buffer(records) as r: + record_count = sum(1 for record in r()) + end_index = max(record_count + int(head), 0) + for record in itertools.islice(r(), end_index): + yield record + else: + for record in itertools.islice(records, int(head)): + yield record + +def tail(records, tail): + """ + Limit results to the bottom N records. + Use +N to output records starting with the Nth. + """ + logging.info('Applying _tail generator: ' + 'limiting results to top ' + tail + ' records.') + + if tail == '+0': + for record in records: + yield record + elif '+' in tail: + tail = int(tail) - 1 + for record in itertools.islice(records, tail, None): + yield record + else: + with _record_buffer(records) as r: + record_count = sum(1 for record in r()) + start_index = max(record_count - int(tail), 0) + for record in itertools.islice(r(), start_index, None): + yield record + +# Squeeze-related +def gap_proportion(sequences, gap_chars='-'): + """ + Generates a list with the proportion of gaps by index in a set of + sequences. + """ + aln_len = None + gaps = [] + for i, sequence in enumerate(sequences): + if aln_len is None: + aln_len = len(sequence) + gaps = [0] * aln_len + else: + if not len(sequence) == aln_len: + raise ValueError(("Unexpected sequence length {0}. 
Is this " + "an alignment?").format(len(sequence))) + + # Update any gap positions in gap list + for j, char in enumerate(sequence.seq): + if char in gap_chars: + gaps[j] += 1 + + sequence_count = float(i + 1) + gap_props = [i / sequence_count for i in gaps] + return gap_props + + +def squeeze(records, gap_threshold=1.0): + """ + Remove any gaps that are present in the same position across all sequences + in an alignment. Takes a second sequence iterator for determining gap + positions. + """ + with _record_buffer(records) as r: + gap_proportions = gap_proportion(r()) + + keep_columns = [g < gap_threshold for g in gap_proportions] + + for record in r(): + sequence = str(record.seq) + # Trim + squeezed = itertools.compress(sequence, keep_columns) + yield SeqRecord(Seq(''.join(squeezed)), id=record.id, + description=record.description) + +def strip_range(records): + """ + Cut off trailing /- ranges from IDs. Ranges must be 1-indexed and + the stop integer must not be less than the start integer. + """ + logging.info('Applying _strip_range generator: ' + 'removing /- ranges from IDs') + # Split up and be greedy. + cut_regex = re.compile(r"(?P.*)\/(?P\d+)\-(?P\d+)") + for record in records: + name = record.id + match = cut_regex.match(str(record.id)) + if match: + sequence_id = match.group('id') + start = int(match.group('start')) + stop = int(match.group('stop')) + if start > 0 and start <= stop: + name = sequence_id + yield SeqRecord(record.seq, id=name, + description='') + + +def transcribe(records, transcribe): + """ + Perform transcription or back-transcription. 
+ transcribe must be one of the following: + dna2rna + rna2dna + """ + logging.info('Applying _transcribe generator: ' + 'operation to perform is ' + transcribe + '.') + for record in records: + sequence = str(record.seq) + description = record.description + name = record.id + if transcribe == 'dna2rna': + dna = Seq(sequence, IUPAC.ambiguous_dna) + rna = dna.transcribe() + yield SeqRecord(rna, id=name, description=description) + elif transcribe == 'rna2dna': + rna = Seq(sequence, IUPAC.ambiguous_rna) + dna = rna.back_transcribe() + yield SeqRecord(dna, id=name, description=description) + +# Translate-related functions +class CodonWarningTable(object): + """ + Translation table for codons tht prints a warning when an unknown + codon is requested, then returns the value passed as missing_char + """ + + def __init__(self, wrapped, missing_char='X'): + self.wrapped = wrapped + self.missing_char = missing_char + self.seen = set() + + def get(self, codon, missing=None): + try: + return self.__getitem__(codon) + except KeyError: + return missing + + def __getitem__(self, codon): + if codon == '---': + return '-' + elif '-' in codon: + if codon not in self.seen: + logging.warning("Unknown Codon: %s", codon) + self.seen.add(codon) + return self.missing_char + else: + return self.wrapped.__getitem__(codon) + + def __contains__(self, value): + return value in self.wrapped + + +def translate(records, translate): + """ + Perform translation from generic DNA/RNA to proteins. Bio.Seq + does not perform back-translation because the codons would + more-or-less be arbitrary. Option to translate only up until + reaching a stop codon. 
translate must be one of the following: + dna2protein + dna2proteinstop + rna2protein + rna2proteinstop + """ + logging.info('Applying translation generator: ' + 'operation to perform is ' + translate + '.') + + to_stop = translate.endswith('stop') + + source_type = translate[:3] + + # Get a translation table + table = {'dna': CodonTable.ambiguous_dna_by_name["Standard"], + 'rna': CodonTable.ambiguous_rna_by_name["Standard"]}[source_type] + + # Handle ambiguities by replacing ambiguous codons with 'X' + # TODO: this copy operation causes infinite recursion with python3.6 - + # not sure why it was here to begin with. + # table = copy.deepcopy(table) + table.forward_table = CodonWarningTable(table.forward_table) + + for record in records: + sequence = str(record.seq) + seq = Seq(sequence) + protein = seq.translate(table, to_stop=to_stop) + yield SeqRecord(protein, id=record.id, description=record.description) + + +def max_length_discard(records, max_length): + """ + Discard any records that are longer than max_length. + """ + logging.info('Applying _max_length_discard generator: ' + 'discarding records longer than ' + '.') + for record in records: + if len(record) > max_length: + # Discard + logging.debug('Discarding long sequence: %s, length=%d', + record.id, len(record)) + else: + yield record + + +def min_length_discard(records, min_length): + """ + Discard any records that are shorter than min_length. + """ + logging.info('Applying _min_length_discard generator: ' + 'discarding records shorter than %d.', min_length) + for record in records: + if len(record) < min_length: + logging.debug('Discarding short sequence: %s, length=%d', + record.id, len(record)) + else: + yield record + + +def min_ungap_length_discard(records, min_length): + """ + Discard any records that are shorter than min_length after removing gaps. 
+ """ + for record in records: + if len(ungap_all(record)) >= min_length: + yield record + + +def sort_length(source_file, source_file_type, direction=1): + """ + Sort sequences by length. 1 is ascending (default) and 0 is descending. + """ + direction_text = 'ascending' if direction == 1 else 'descending' + + logging.info('Indexing sequences by length: %s', direction_text) + + # Adapted from the Biopython tutorial example. + + # Get the lengths and ids, and sort on length + len_and_ids = sorted((len(rec), rec.id) + for rec in SeqIO.parse(source_file, source_file_type)) + + if direction == 0: + ids = reversed([seq_id for (length, seq_id) in len_and_ids]) + else: + ids = [seq_id for (length, seq_id) in len_and_ids] + del len_and_ids # free this memory + + # SeqIO.index does not handle gzip instances + if isinstance(source_file, gzip.GzipFile): + tmpfile = tempfile.NamedTemporaryFile() + source_file.seek(0) + tmpfile.write(source_file.read()) + tmpfile.seek(0) + source_file = tmpfile + + record_index = SeqIO.index(source_file.name, source_file_type) + + for seq_id in ids: + yield record_index[seq_id] + + +def sort_name(source_file, source_file_type, direction=1): + """ + Sort sequences by name. 1 is ascending (default) and 0 is descending. + """ + + direction_text = 'ascending' if direction == 1 else 'descending' + + logging.info("Indexing sequences by name: %s", direction_text) + + # Adapted from the Biopython tutorial example. 
+ + # Sort on id + ids = sorted((rec.id) for rec in SeqIO.parse(source_file, + source_file_type)) + + if direction == 0: + ids = reversed(ids) + + # SeqIO.index does not handle gzip instances + if isinstance(source_file, gzip.GzipFile): + tmpfile = tempfile.NamedTemporaryFile() + source_file.seek(0) + tmpfile.write(source_file.read()) + tmpfile.seek(0) + source_file = tmpfile + + record_index = SeqIO.index(source_file.name, source_file_type) + + for id in ids: + yield record_index[id] diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..c752a53 --- /dev/null +++ b/setup.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python + +import os +import sys +import subprocess +from setuptools import setup, find_packages + +subprocess.call( + ('mkdir -p seqmagick/data && ' + 'git describe --tags --dirty > seqmagick/data/ver.tmp ' + '&& mv seqmagick/data/ver.tmp seqmagick/data/ver ' + '|| rm -f seqmagick/data/ver.tmp'), + shell=True, stderr=open(os.devnull, "w")) + +# must import __version__ after call to 'git describe' above +from seqmagick import __version__ + +setup(name='seqmagick', + version=__version__, + description='Tools for converting and modifying sequence files ' + 'from the command-line', + url='http://github.com/fhcrc/seqmagick', + download_url='http://pypi.python.org/pypi/seqmagick', + author='Matsen Group', + # author_email='http://matsen.fhcrc.org/', + packages=find_packages(), + entry_points={ + 'console_scripts': [ + 'seqmagick = seqmagick.scripts.cli:main' + ]}, + package_data={ + 'seqmagick': ['data/*'], + 'seqmagick.test.integration': ['data/*'] + }, + setup_requires=['nose>=1.0'], + python_requires='>=3.5', + test_suite='nose.collector', + install_requires=['biopython>=1.78', 'pygtrie>=2.1'], + classifiers=[ + 'License :: OSI Approved :: GNU General Public License (GPL)', + 'Development Status :: 4 - Beta', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming 
Language :: Python :: 3.8', + 'Topic :: Scientific/Engineering :: Bio-Informatics', + ], + license="GPL V3") diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..1e005df --- /dev/null +++ b/tox.ini @@ -0,0 +1,15 @@ +[tox] +envlist = py27,pypy +[testenv] +deps = + numpy + nose + rednose + biopython +commands = nosetests --rednose [] + +[testenv:pypy] +deps = + nose + rednose + biopython -- cgit v1.2.3 From 5e1c29f801e36c69ed7445a2e5a0f2ee32ee91fd Mon Sep 17 00:00:00 2001 From: Andreas Tille Date: Wed, 22 Dec 2021 16:31:31 +0100 Subject: Import seqmagick_0.8.4-3.debian.tar.xz [dgit import tarball seqmagick 0.8.4-3 seqmagick_0.8.4-3.debian.tar.xz] --- changelog | 71 +++++++++++++++++++++ control | 45 ++++++++++++++ copyright | 24 +++++++ createmanpages | 22 +++++++ manpages | 1 + patches/biopython_1.71_dual_coding_support.patch | 32 ++++++++++ patches/pytest.patch | 79 ++++++++++++++++++++++++ patches/series | 2 + rules | 12 ++++ salsa-ci.yml | 4 ++ seqmagick.1 | 60 ++++++++++++++++++ source/format | 1 + tests/control | 4 ++ tests/run-unit-test | 24 +++++++ upstream/metadata | 13 ++++ watch | 5 ++ 16 files changed, 399 insertions(+) create mode 100644 changelog create mode 100644 control create mode 100644 copyright create mode 100755 createmanpages create mode 100644 manpages create mode 100644 patches/biopython_1.71_dual_coding_support.patch create mode 100644 patches/pytest.patch create mode 100644 patches/series create mode 100755 rules create mode 100644 salsa-ci.yml create mode 100644 seqmagick.1 create mode 100644 source/format create mode 100644 tests/control create mode 100644 tests/run-unit-test create mode 100644 upstream/metadata create mode 100644 watch diff --git a/changelog b/changelog new file mode 100644 index 0000000..1acb571 --- /dev/null +++ b/changelog @@ -0,0 +1,71 @@ +seqmagick (0.8.4-3) unstable; urgency=medium + + [ Étienne Mollier ] + * d/control,pytest.patch: replace nose by pytest. 
+ + -- Andreas Tille Wed, 22 Dec 2021 16:31:31 +0100 + +seqmagick (0.8.4-2) unstable; urgency=medium + + * d/watch: adjust link, following changes in Github + * d/control: updated maintainer address + * Standards-Version: 4.6.0 (routine-update) + * Set field Upstream-Name in debian/copyright. + + -- Étienne Mollier Wed, 01 Sep 2021 17:20:18 +0200 + +seqmagick (0.8.4-1) unstable; urgency=medium + + * New upstream version + * Standards-Version: 4.5.1 (routine-update) + * More verbose autopkgtest and minor fix in case several shell scripts are in + the same directory in the future. + + -- Étienne Mollier Wed, 09 Dec 2020 14:27:08 +0100 + +seqmagick (0.8.2-1) unstable; urgency=medium + + * Add myself to Uploaders. + * New upstream version + * Refresh patches: + - remove fix-myfunctions-example-file.patch, applied upstream; + - remove replace-biopython-trie-with-pygtrie.patch, applied upstream; + - refresh biopython_1.71_dual_coding_support.patch hunks. + + -- Étienne Mollier Sat, 14 Nov 2020 20:01:17 +0100 + +seqmagick (0.8.0-2) unstable; urgency=medium + + * Team Upload. + [ Steffen Moeller ] + * Remove autogenerated version string and egg-info + + [ Nilesh Patra ] + * Encode for adding support to utf-8 + * Replace Bio.trie with pygtrie (Closes: #963326) + * Update Depends + * Add autopkgtests + * Fix manpage + * Standards-Version: 4.5.0 (routine-update) + * debhelper-compat 13 (routine-update) + * Add salsa-ci file (routine-update) + * Rules-Requires-Root: no (routine-update) + * Set upstream metadata fields: Bug-Database, Bug-Submit, Repository, + Repository-Browse. + + -- Nilesh Patra Sun, 16 Aug 2020 20:45:50 +0530 + +seqmagick (0.8.0-1) unstable; urgency=medium + + * Team upload. 
+ * New upstream version + * debhelper-compat 12 (routine-update) + * Standards-Version: 4.4.1 (routine-update) + + -- Steffen Moeller Sat, 18 Jan 2020 22:22:51 +0100 + +seqmagick (0.7.0-1) unstable; urgency=medium + + * Initial release (Closes: #900394) + + -- Andreas Tille Wed, 30 May 2018 09:47:34 +0200 diff --git a/control b/control new file mode 100644 index 0000000..af4b0d9 --- /dev/null +++ b/control @@ -0,0 +1,45 @@ +Source: seqmagick +Maintainer: Debian Med Packaging Team +Uploaders: Andreas Tille , + Étienne Mollier +Section: science +Priority: optional +Build-Depends: debhelper-compat (= 13), + dh-python, + python3-all, + python3-setuptools, + python3-pytest , + python3-biopython, + python3-pygtrie +Standards-Version: 4.6.0 +Vcs-Browser: https://salsa.debian.org/med-team/seqmagick +Vcs-Git: https://salsa.debian.org/med-team/seqmagick.git +Homepage: https://github.com/fhcrc/seqmagick/ +Rules-Requires-Root: no + +Package: seqmagick +Architecture: all +Depends: ${python3:Depends}, + ${misc:Depends}, + python3-biopython, + python3-pygtrie +Description: imagemagick-like frontend to Biopython SeqIO + Seqmagick is a little utility to expose the file format conversion + in BioPython in a convenient way. + . + Features include: + . 
+ * Modifying sequences: + - Remove gaps + - Reverse & reverse complement + - Trim to a range of residues + - Change case + - Sort by length or ID + * Displaying information about sequence files + * Subsetting sequence files by: + - Position + - ID + - Deduplication + * Filtering sequences by quality score + * Trimming alignments to a region of interest defined by the forward + and reverse primers diff --git a/copyright b/copyright new file mode 100644 index 0000000..74fb941 --- /dev/null +++ b/copyright @@ -0,0 +1,24 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Source: https://github.com/fhcrc/seqmagick/releases +Upstream-Name: seqmagick + +Files: * +Copyright: 2014-2017 Matsen Group +License: GPL-3 + +Files: debian/* +Copyright: 2017 Andreas Tille +License: GPL-3 + +License: GPL-3 + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License. + . + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + . + On Debian systems you can find a copy of the full text of the GNU General + Public License version 3 at /usr/share/common-licenses/GPL-3. diff --git a/createmanpages b/createmanpages new file mode 100755 index 0000000..b143f88 --- /dev/null +++ b/createmanpages @@ -0,0 +1,22 @@ +#!/bin/sh +MANDIR=debian +mkdir -p $MANDIR + +VERSION=`dpkg-parsechangelog | awk '/^Version:/ {print $2}' | sed -e 's/^[0-9]*://' -e 's/-.*//' -e 's/[+~]dfsg$//'` + +AUTHOR=".SH AUTHOR\nThis manpage was written by $DEBFULLNAME for the Debian distribution and +can be used for any other usage of the program. 
+" + +progname=seqmagick +help2man --no-info --no-discard-stderr \ + --name='Manipulate sequence files' \ + --version-string="$VERSION" ${progname} > $MANDIR/${progname}.1 +echo $AUTHOR >> $MANDIR/${progname}.1 + +cat < +Date: Tue, 29 May 2018 15:05:22 +0100 +Subject: [PATCH] Fix for Biopython 1.71 dual coding support. +Origin: https://github.com/fhcrc/seqmagick/pull/76 + +This closes issue #73, indirectly triggered by changes +in Biopython to support recent NCBI codon tables with +codons which can be amino acids of stop codons. + +Previously the monkey-patched sub-class was breaking here: + +dual_coding = [c for c in stop_codons if c in forward_table] + +This change adds direct support for __contains__ by +forwarding this to the wrapped forward table. +--- + seqmagick/transform.py | 4 ++++ + 1 file changed, 4 insertions(+) + +--- seqmagick.orig/seqmagick/transform.py ++++ seqmagick/seqmagick/transform.py +@@ -670,6 +670,10 @@ + return value in self.wrapped + + ++ def __contains__(self, value): ++ return value in self.wrapped ++ ++ + def translate(records, translate): + """ + Perform translation from generic DNA/RNA to proteins. Bio.Seq diff --git a/patches/pytest.patch b/patches/pytest.patch new file mode 100644 index 0000000..2a39113 --- /dev/null +++ b/patches/pytest.patch @@ -0,0 +1,79 @@ +Description: replace nosetests by pytest + nosetests 1 is not maintained anymore and going to be removed from Debian. + See https://lists.debian.org/debian-python/2021/10/msg00060.html +Author: Étienne Mollier +Forwarded: not-needed +Last-Update: 2021-11-28 +--- +This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ +--- seqmagick.orig/.travis.yml ++++ seqmagick/.travis.yml +@@ -12,12 +12,12 @@ + # BioPython doesn't always play well with pip install. + install: + - "if [[ $TRAVIS_PYTHON_VERSION != 'pypy' ]]; then pip install -q numpy; fi" +- - "pip install -q biopython nose pygtrie" ++ - "pip install -q biopython pytest pygtrie" + - "pip install ." 
+ + script: + - seqmagick --version +- - nosetests ++ - pytest-3 + + notifications: + email: +--- seqmagick.orig/DEVELOPING.rst ++++ seqmagick/DEVELOPING.rst +@@ -50,7 +50,7 @@ + + Run tests, and make sure docs build without errors:: + +- nosetests ++ pytest-3 + (cd docs && make html) + + Push one last time to master to trigger tests on travis:: +--- seqmagick.orig/requirements.txt ++++ seqmagick/requirements.txt +@@ -4,6 +4,6 @@ + # for development + wheel + sphinx +-nose ++pytest + twine + ghp-import +--- seqmagick.orig/setup.py ++++ seqmagick/setup.py +@@ -32,9 +32,9 @@ + 'seqmagick': ['data/*'], + 'seqmagick.test.integration': ['data/*'] + }, +- setup_requires=['nose>=1.0'], ++ setup_requires=['pytest>=6.0'], + python_requires='>=3.5', +- test_suite='nose.collector', ++ test_suite='pytest.collect', + install_requires=['biopython>=1.78', 'pygtrie>=2.1'], + classifiers=[ + 'License :: OSI Approved :: GNU General Public License (GPL)', +--- seqmagick.orig/tox.ini ++++ seqmagick/tox.ini +@@ -3,13 +3,11 @@ + [testenv] + deps = + numpy +- nose +- rednose ++ pytest + biopython +-commands = nosetests --rednose [] ++commands = pytest-3 [] + + [testenv:pypy] + deps = +- nose +- rednose ++ pytest + biopython diff --git a/patches/series b/patches/series new file mode 100644 index 0000000..609c395 --- /dev/null +++ b/patches/series @@ -0,0 +1,2 @@ +biopython_1.71_dual_coding_support.patch +pytest.patch diff --git a/rules b/rules new file mode 100755 index 0000000..d2b1442 --- /dev/null +++ b/rules @@ -0,0 +1,12 @@ +#!/usr/bin/make -f + +# DH_VERBOSE := 1 +export LC_ALL=C.UTF-8 + +%: + dh $@ --with python3 --buildsystem=pybuild + +override_dh_auto_clean: + dh_auto_clean + rm -rf seqmagick/data + rm -rf seqmagick.egg-info diff --git a/salsa-ci.yml b/salsa-ci.yml new file mode 100644 index 0000000..33c3a64 --- /dev/null +++ b/salsa-ci.yml @@ -0,0 +1,4 @@ +--- +include: + - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/salsa-ci.yml + - 
https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/pipeline-jobs.yml diff --git a/seqmagick.1 b/seqmagick.1 new file mode 100644 index 0000000..fc3460b --- /dev/null +++ b/seqmagick.1 @@ -0,0 +1,60 @@ +.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.46.4. +.TH SEQMAGICK "1" "June 2017" "seqmagick 0.6.1" "User Commands" +.SH NAME +seqmagick \- Manipulate sequence files +.SH SYNOPSIS +usage: seqmagick [\-h] [\-V] [\-v] [\-q] +.IP +help, convert, info, mogrify, primer\-trim, quality\-filter, extract\-ids, backtrans\-align +\&... +.SH DESCIPTION +Seqmagick is a little utility to expose the file format conversion +in BioPython in a convenient way. +.SH OPTIONS +.SS "positional arguments:" +.IP +help, convert, info, mogrify, primer\-trim, quality\-filter, extract\-ids, backtrans\-align +.TP +help +Detailed help for actions using help +.TP +convert +Convert between sequence formats +.TP +info +Info action +.TP +mogrify +Modify sequence file(s) in place. +.TP +primer\-trim +Find a primer sequence in a gapped alignment, trim to +amplicon +.TP +quality\-filter +Filter reads based on quality scores +.TP +extract\-ids +Extract the sequence IDs from a file +.TP +backtrans\-align +Given a protein alignment and unaligned nucleotides, +align the nucleotides using the protein alignment. +Protein and nucleotide sequence files must contain the +same number of sequences, in the same order, with the +same IDs. +.SS "optional arguments:" +.TP +\fB\-h\fR, \fB\-\-help\fR +show this help message and exit +.TP +\fB\-V\fR, \fB\-\-version\fR +Print the version number and exit +.TP +\fB\-v\fR, \fB\-\-verbose\fR +Be more verbose. Specify \fB\-vv\fR or \fB\-vvv\fR for even more +.TP +\fB\-q\fR, \fB\-\-quiet\fR +Suppress output +.SH AUTHOR +This manpage was written by Andreas Tille for the Debian distribution and can be used for any other usage of the program. 
diff --git a/source/format b/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/tests/control b/tests/control new file mode 100644 index 0000000..f2c3b32 --- /dev/null +++ b/tests/control @@ -0,0 +1,4 @@ +Tests: run-unit-test +Depends: @ +Restrictions: allow-stderr + diff --git a/tests/run-unit-test b/tests/run-unit-test new file mode 100644 index 0000000..90d7db5 --- /dev/null +++ b/tests/run-unit-test @@ -0,0 +1,24 @@ +#!/bin/bash + +pkg=seqmagick +TEST_DIR=`pwd` + +if [ "${AUTOPKGTEST_TMP}" = "" ] ; then + AUTOPKGTEST_TMP=$(mktemp -d /tmp/${pkg}-test.XXXXXX) + trap "rm -rf ${AUTOPKGTEST_TMP}" 0 INT QUIT ABRT PIPE TERM +fi + +cp ${TEST_DIR}/examples -a "${AUTOPKGTEST_TMP}" + +cd "${AUTOPKGTEST_TMP}" +gunzip -r * + +set -x +for f in `find examples/ -name '*.sh'` +do + cd `dirname $f` + /bin/bash `basename $f` + cd - +done +echo 'PASS' + diff --git a/upstream/metadata b/upstream/metadata new file mode 100644 index 0000000..0fccaf4 --- /dev/null +++ b/upstream/metadata @@ -0,0 +1,13 @@ +Bug-Database: https://github.com/fhcrc/seqmagick/issues +Bug-Submit: https://github.com/fhcrc/seqmagick/issues/new +Registry: + - Name: bio.tools + Entry: NA + - Name: OMICtools + Entry: OMICS_11290 + - Name: conda:bioconda + Entry: seqmagick + - Name: SciCrunch + Entry: NA +Repository: https://github.com/fhcrc/seqmagick.git +Repository-Browse: https://github.com/fhcrc/seqmagick diff --git a/watch b/watch new file mode 100644 index 0000000..087c98b --- /dev/null +++ b/watch @@ -0,0 +1,5 @@ +version=4 + +opts="filenamemangle=s%(?:.*?)?v?(\d[\d.]*)\.tar\.gz%@PACKAGE@-$1.tar.gz%" \ +https://github.com/fhcrc/seqmagick/tags \ +(?:.*?/)?v?(\d[\d.]*)\.tar\.gz debian uupdate -- cgit v1.2.3 From feb0deabc9fbf9999b6ddb8fbc4359b75a0d770a Mon Sep 17 00:00:00 2001 From: peterjc Date: Tue, 29 May 2018 15:05:22 +0100 Subject: [PATCH] Fix for Biopython 1.71 dual coding support. 
Origin: https://github.com/fhcrc/seqmagick/pull/76 This closes issue #73, indirectly triggered by changes in Biopython to support recent NCBI codon tables with codons which can be amino acids of stop codons. Previously the monkey-patched sub-class was breaking here: dual_coding = [c for c in stop_codons if c in forward_table] This change adds direct support for __contains__ by forwarding this to the wrapped forward table. Gbp-Pq: Name biopython_1.71_dual_coding_support.patch --- seqmagick/transform.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/seqmagick/transform.py b/seqmagick/transform.py index 3150d1d..4478786 100644 --- a/seqmagick/transform.py +++ b/seqmagick/transform.py @@ -670,6 +670,10 @@ class CodonWarningTable(object): return value in self.wrapped + def __contains__(self, value): + return value in self.wrapped + + def translate(records, translate): """ Perform translation from generic DNA/RNA to proteins. Bio.Seq -- cgit v1.2.3 From 02b3c9912cbc739c820ba4914c9a4eab3079556d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20Mollier?= Date: Wed, 22 Dec 2021 16:31:31 +0100 Subject: replace nosetests by pytest Forwarded: not-needed Last-Update: 2021-11-28 nosetests 1 is not maintained anymore and going to be removed from Debian. See https://lists.debian.org/debian-python/2021/10/msg00060.html Last-Update: 2021-11-28 Gbp-Pq: Name pytest.patch --- .travis.yml | 4 ++-- DEVELOPING.rst | 2 +- requirements.txt | 2 +- setup.py | 4 ++-- tox.ini | 8 +++----- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/.travis.yml b/.travis.yml index 562f71b..a7497c7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,12 +12,12 @@ python: # BioPython doesn't always play well with pip install. install: - "if [[ $TRAVIS_PYTHON_VERSION != 'pypy' ]]; then pip install -q numpy; fi" - - "pip install -q biopython nose pygtrie" + - "pip install -q biopython pytest pygtrie" - "pip install ." 
script: - seqmagick --version - - nosetests + - pytest-3 notifications: email: diff --git a/DEVELOPING.rst b/DEVELOPING.rst index eada8dd..57f8857 100644 --- a/DEVELOPING.rst +++ b/DEVELOPING.rst @@ -50,7 +50,7 @@ First, make sure you have committed all changes. Run tests, and make sure docs build without errors:: - nosetests + pytest-3 (cd docs && make html) Push one last time to master to trigger tests on travis:: diff --git a/requirements.txt b/requirements.txt index 34a4604..96ad5f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,6 @@ pygtrie>=2.1 # for development wheel sphinx -nose +pytest twine ghp-import diff --git a/setup.py b/setup.py index c752a53..ce76dce 100755 --- a/setup.py +++ b/setup.py @@ -32,9 +32,9 @@ setup(name='seqmagick', 'seqmagick': ['data/*'], 'seqmagick.test.integration': ['data/*'] }, - setup_requires=['nose>=1.0'], + setup_requires=['pytest>=6.0'], python_requires='>=3.5', - test_suite='nose.collector', + test_suite='pytest.collect', install_requires=['biopython>=1.78', 'pygtrie>=2.1'], classifiers=[ 'License :: OSI Approved :: GNU General Public License (GPL)', diff --git a/tox.ini b/tox.ini index 1e005df..e7260f5 100644 --- a/tox.ini +++ b/tox.ini @@ -3,13 +3,11 @@ envlist = py27,pypy [testenv] deps = numpy - nose - rednose + pytest biopython -commands = nosetests --rednose [] +commands = pytest-3 [] [testenv:pypy] deps = - nose - rednose + pytest biopython -- cgit v1.2.3 From 159c630dd628bd546dfc279e9ec25b25b9dbbf6a Mon Sep 17 00:00:00 2001 From: peterjc Date: Tue, 29 May 2018 15:05:22 +0100 Subject: [PATCH] Fix for Biopython 1.71 dual coding support. Origin: https://github.com/fhcrc/seqmagick/pull/76 This closes issue #73, indirectly triggered by changes in Biopython to support recent NCBI codon tables with codons which can be amino acids of stop codons. 
Previously the monkey-patched sub-class was breaking here: dual_coding = [c for c in stop_codons if c in forward_table] This change adds direct support for __contains__ by forwarding this to the wrapped forward table. Gbp-Pq: Name biopython_1.71_dual_coding_support.patch --- seqmagick/transform.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/seqmagick/transform.py b/seqmagick/transform.py index 3150d1d..4478786 100644 --- a/seqmagick/transform.py +++ b/seqmagick/transform.py @@ -670,6 +670,10 @@ class CodonWarningTable(object): return value in self.wrapped + def __contains__(self, value): + return value in self.wrapped + + def translate(records, translate): """ Perform translation from generic DNA/RNA to proteins. Bio.Seq -- cgit v1.2.3 From 4de27a053fcaf7354b69510ede20dd0440dbaadd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20Mollier?= Date: Sun, 11 Jun 2023 11:11:40 +0200 Subject: replace nosetests by pytest Forwarded: not-needed Last-Update: 2021-11-28 nosetests 1 is not maintained anymore and going to be removed from Debian. See https://lists.debian.org/debian-python/2021/10/msg00060.html Last-Update: 2021-11-28 Gbp-Pq: Name pytest.patch --- .travis.yml | 4 ++-- DEVELOPING.rst | 2 +- requirements.txt | 2 +- setup.py | 4 ++-- tox.ini | 8 +++----- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/.travis.yml b/.travis.yml index 562f71b..a7497c7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,12 +12,12 @@ python: # BioPython doesn't always play well with pip install. install: - "if [[ $TRAVIS_PYTHON_VERSION != 'pypy' ]]; then pip install -q numpy; fi" - - "pip install -q biopython nose pygtrie" + - "pip install -q biopython pytest pygtrie" - "pip install ." script: - seqmagick --version - - nosetests + - pytest-3 notifications: email: diff --git a/DEVELOPING.rst b/DEVELOPING.rst index eada8dd..57f8857 100644 --- a/DEVELOPING.rst +++ b/DEVELOPING.rst @@ -50,7 +50,7 @@ First, make sure you have committed all changes. 
Run tests, and make sure docs build without errors:: - nosetests + pytest-3 (cd docs && make html) Push one last time to master to trigger tests on travis:: diff --git a/requirements.txt b/requirements.txt index 34a4604..96ad5f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,6 @@ pygtrie>=2.1 # for development wheel sphinx -nose +pytest twine ghp-import diff --git a/setup.py b/setup.py index c752a53..ce76dce 100755 --- a/setup.py +++ b/setup.py @@ -32,9 +32,9 @@ setup(name='seqmagick', 'seqmagick': ['data/*'], 'seqmagick.test.integration': ['data/*'] }, - setup_requires=['nose>=1.0'], + setup_requires=['pytest>=6.0'], python_requires='>=3.5', - test_suite='nose.collector', + test_suite='pytest.collect', install_requires=['biopython>=1.78', 'pygtrie>=2.1'], classifiers=[ 'License :: OSI Approved :: GNU General Public License (GPL)', diff --git a/tox.ini b/tox.ini index 1e005df..e7260f5 100644 --- a/tox.ini +++ b/tox.ini @@ -3,13 +3,11 @@ envlist = py27,pypy [testenv] deps = numpy - nose - rednose + pytest biopython -commands = nosetests --rednose [] +commands = pytest-3 [] [testenv:pypy] deps = - nose - rednose + pytest biopython -- cgit v1.2.3