diff options
author | Michael R. Crusoe <michael.crusoe@gmail.com> | 2018-04-02 18:47:32 +0200 |
---|---|---|
committer | Michael R. Crusoe <michael.crusoe@gmail.com> | 2018-04-02 18:47:32 +0200 |
commit | 92f4514af4d8bfb50daf300a2fe8825c5552702a (patch) | |
tree | e57a5f8b7d788e0ed12c6c00a88c0904b3331fb2 |
Import bamtools_2.5.1+dfsg.orig.tar.gz
[dgit import orig bamtools_2.5.1+dfsg.orig.tar.gz]
147 files changed, 32429 insertions, 0 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..e2b96c3 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,99 @@ +# ========================== +# BamTools CMakeLists.txt +# (c) 2010 Derek Barnett +# +# top-level +# ========================== + +# CMake requirements +cmake_minimum_required( VERSION 3.0 ) + +# allow setting project version in project() +# https://cmake.org/cmake/help/v3.0/policy/CMP0048.html#policy:CMP0048 +cmake_policy( SET CMP0048 NEW ) + +# set project name and version +project( BamTools LANGUAGES CXX VERSION 2.5.1 ) + +# on macOS, MACOSX_RPATH is enabled by default on more recent versions +# of CMake. Disable this behaviour, and let user enable it if need be. +cmake_policy( SET CMP0042 OLD ) + +# Set Release type for builds where CMAKE_BUILD_TYPE is unset +# This is usually a good default as this implictly enables +# +# CXXFLAGS = -O3 -DNDEBUG +# +if( NOT CMAKE_BUILD_TYPE ) + set( CMAKE_BUILD_TYPE "Release" ) +endif() + +# Adhere to GNU filesystem layout conventions +include( GNUInstallDirs ) + +# Force the build directory to be different from source directory +macro( ENSURE_OUT_OF_SOURCE_BUILD MSG ) + string( COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}" insource ) + get_filename_component( PARENTDIR ${CMAKE_SOURCE_DIR} PATH ) + string( COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${PARENTDIR}" insourcesubdir ) + IF( insource OR insourcesubdir ) + message( FATAL_ERROR "${MSG}" ) + ENDIF( insource OR insourcesubdir ) +endmacro( ENSURE_OUT_OF_SOURCE_BUILD ) + +ensure_out_of_source_build( " + ${PROJECT_NAME} requires an out of source build. + $ mkdir build + $ cd build + $ cmake .. 
+ $ make +(or the Windows equivalent)\n" ) + +# define compiler flags for all code, copied from Autoconf's AC_SYS_LARGEFILE +if( NOT WIN32 ) + add_definitions( -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE ) + add_compile_options( -Wall ) +endif() + +# ----------------------------------------------- +# handle platform-/environment-specific defines + +# By default build bamtools as a static library +# Most users will prefer static libraries, distributions +# can always switch the standard CMake variable over to ON. +set( BUILD_SHARED_LIBS OFF CACHE BOOL "Build all libraries as shared" ) + +# If planning to run in Node.js environment, run: +# cmake -DEnableNodeJS=true +if( EnableNodeJS ) + add_definitions( -DSYSTEM_NODEJS=1 ) +endif() + +# If running on SunOS +if( "${CMAKE_SYSTEM_NAME}" MATCHES "SunOS" ) + add_definitions( -DSUN_OS ) +endif() + +# find system JsonCpp +find_package( PkgConfig ) +pkg_search_module( JSONCPP jsoncpp>=1 ) + +set( BAMTOOLS_PRIVATE_DEPS "zlib" ) + +if( JSONCPP_FOUND ) + message( "Found system JsonCpp, not using bundled version" ) + set( BAMTOOLS_PRIVATE_DEPS "${BAMTOOLS_PRIVATE_DEPS} jsoncpp" ) +else() + message( "Did NOT find system JsonCpp, instead using bundled version" ) + set( JSONCPP_LDFLAGS jsoncpp ) + set( JSONCPP_INCLUDE_DIRS ${BamTools_SOURCE_DIR}/src/third_party/jsoncpp ) +endif() + + +# ------------------------------------------- + +# add our includes root path +include_directories( src ) + +# list subdirectories to build in +add_subdirectory( src ) @@ -0,0 +1,22 @@ +The MIT License + +Copyright (c) 2009-2010 Derek Barnett, Erik Garrison, Gabor Marth, Michael Stromberg + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to 
permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + @@ -0,0 +1,60 @@ +-------------------------------------------------------------------------------- +README : BAMTOOLS +-------------------------------------------------------------------------------- + +BamTools provides both a programmer's API and an end-user's toolkit for handling +BAM files. + +I. Learn More + +II. License + +III. Acknowledgements + +IV. Contact + +-------------------------------------------------------------------------------- +I. Learn More: +-------------------------------------------------------------------------------- + +Installation steps, tutorial, API documentation, etc. are all now available +through the BamTools project wiki: + +https://github.com/pezmaster31/bamtools/wiki + +Join the mailing list(s) to stay informed of updates or get involved with +contributing: + +https://github.com/pezmaster31/bamtools/wiki/Mailing-lists + +-------------------------------------------------------------------------------- +II. License : +-------------------------------------------------------------------------------- + +Both the BamTools API and toolkit are released under the MIT License. +Copyright (c) 2009-2010 Derek Barnett, Erik Garrison, Gabor Marth, + Michael Stromberg + +See included file LICENSE for details. 
+ +-------------------------------------------------------------------------------- +III. Acknowledgements : +-------------------------------------------------------------------------------- + + * Aaron Quinlan for several key feature ideas and bug fix contributions + * Baptiste Lepilleur for the public-domain JSON parser (JsonCPP) + * Heng Li, author of SAMtools - the original C-language BAM API/toolkit. + +-------------------------------------------------------------------------------- +IV. Contact : +-------------------------------------------------------------------------------- + +Feel free to contact me with any questions, comments, suggestions, bug reports, + etc. + +Derek Barnett +Marth Lab +Biology Dept., Boston College + +Email: derekwbarnett@gmail.com +Project Website: http://github.com/pezmaster31/bamtools diff --git a/docs/Doxyfile b/docs/Doxyfile new file mode 100644 index 0000000..ee7ba2d --- /dev/null +++ b/docs/Doxyfile @@ -0,0 +1,1605 @@ +# Doxyfile 1.6.3 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project +# +# All text after a hash (#) is considered a comment and will be ignored +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (" ") + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# http://www.gnu.org/software/libiconv for the list of possible encodings. 
+ +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded +# by quotes) that should identify the project. + +PROJECT_NAME = BamTools + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. +# This could be handy for archiving the generated documentation or +# if some version control system is used. + +PROJECT_NUMBER = 2.5.1 + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. +# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. + +OUTPUT_DIRECTORY = + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. +# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, +# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English +# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, +# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak, +# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. 
+ +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to JavaDoc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. +# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. 
+ +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = NO + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of +# the path mentioned in the documentation of a class, which tells +# the reader which header file to include in order to use a class. +# If left blank only the name of the header file containing the class +# definition is used. Otherwise one should specify the include paths that +# are normally passed to the compiler using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter +# (but less readable) file names. This can be useful if your file system +# doesn't support long names like on DOS, Mac, or CD-ROM. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen +# will interpret the first line (until the first dot) of a JavaDoc-style +# comment as the brief description. If set to NO, the JavaDoc +# comments will behave just like regular Qt-style comments +# (thus requiring an explicit @brief command for a brief description.) + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then Doxygen will +# interpret the first line (until the first dot) of a Qt-style +# comment as the brief description. 
If set to NO, the comments +# will behave just like regular Qt-style comments (thus requiring +# an explicit \brief command for a brief description.) + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen +# treat a multi-line C++ special comment block (i.e. a block of //! or /// +# comments) as a brief description. This used to be the default behaviour. +# The new default is to treat a multi-line C++ comment block as a detailed +# description. Set this tag to YES if you prefer the old behaviour instead. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented +# member inherits the documentation from any documented member that it +# re-implements. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce +# a new page for each member. If set to NO, the documentation of a member will +# be part of the file/class/namespace that contains it. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. +# Doxygen uses this value to replace tabs by spaces in code fragments. + +TAB_SIZE = 1 + +# This tag can be used to specify a number of aliases that acts +# as commands in the documentation. An alias has the form "name=value". +# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user-defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. + +ALIASES = samSpecURL=http://samtools.sourceforge.net/SAM1.pdf + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C +# sources only. Doxygen will then generate output that is more tailored for C. +# For instance, some of the names that are used will be different. The list +# of all members will be omitted, etc. 
+ +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java +# sources only. Doxygen will then generate output that is more tailored for +# Java. For instance, namespaces will be presented as packages, qualified +# scopes will look different, etc. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources only. Doxygen will then generate output that is more tailored for +# Fortran. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for +# VHDL. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it parses. +# With this tag you can assign which parser to use for a given extension. +# Doxygen has a built-in mapping, but you can override or extend it using this tag. +# The format is ext=language, where ext is a file extension, and language is one of +# the parsers supported by doxygen: IDL, Java, Javascript, C#, C, C++, D, PHP, +# Objective-C, Python, Fortran, VHDL, C, C++. For instance to make doxygen treat +# .inc files as Fortran files (default is PHP), and .f files as C (default is Fortran), +# use: inc=Fortran f=C. Note that for custom extensions you also need to set +# FILE_PATTERNS otherwise the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should +# set this tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. +# func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. 
+ +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. +# Doxygen will parse them like normal C++ but will assume all classes use public +# instead of private inheritance when no explicit protection keyword is present. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate getter +# and setter methods for a property. Setting this option to YES (the default) +# will make doxygen to replace the get and set methods by a property in the +# documentation. This will only work if the methods are indeed getting or +# setting a simple type. If this is not the case, or you want to show the +# methods anyway, you should set this option to NO. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. + +DISTRIBUTE_GROUP_DOC = NO + +# Set the SUBGROUPING tag to YES (the default) to allow class member groups of +# the same type (for instance a group of public functions) to be put as a +# subgroup of that type (e.g. under the Public Functions section). Set it to +# NO to prevent subgrouping. Alternatively, this can be done per class using +# the \nosubgrouping command. + +SUBGROUPING = YES + +# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum +# is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. 
This can typically +# be useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. + +TYPEDEF_HIDES_STRUCT = NO + +# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to +# determine which symbols to keep in memory and which to flush to disk. +# When the cache is full, less often used symbols will be written to disk. +# For small to medium size projects (<1000 input files) the default value is +# probably good enough. For larger projects a too small cache size can cause +# doxygen to be busy swapping symbols to and from disk most of the time +# causing a significant performance penalty. +# If the system has enough physical memory increasing the cache will improve the +# performance by keeping more symbols in memory. Note that the value works on +# a logarithmic scale so increasing the size by one will roughly double the +# memory usage. The cache size is given by this formula: +# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, +# corresponding to a cache size of 2^16 = 65536 symbols + +SYMBOL_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. +# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. 
+ +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base +# name of the file that contains the anonymous namespace. By default +# anonymous namespace are hidden. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. +# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. 
+# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. + +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen +# will list include files with double quotes in the documentation +# rather than with sharp brackets. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. 
+ +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen +# will sort the (brief and detailed) documentation of class members so that +# constructors and destructors are listed first. If set to NO (the default) +# the constructors will appear in the respective orders defined by +# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. +# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO +# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the +# hierarchy of group names into alphabetical order. If set to NO (the default) +# the group names will appear in their defined order. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the +# alphabetical list. + +SORT_BY_SCOPE_NAME = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. 
This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional +# documentation sections, marked by \if sectionname ... \endif. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines +# the initial value of a variable or define consists of for it to appear in +# the documentation. If the initializer consists of more lines than specified +# here it will be hidden. Use a value of 0 to hide initializers completely. +# The appearance of the initializer of individual variables and defines in the +# documentation can be controlled using \showinitializer or \hideinitializer +# command in the documentation regardless of this setting. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated +# at the bottom of the documentation of classes and structs. If set to YES the +# list will mention the files that were used to generate the documentation. + +SHOW_USED_FILES = YES + +# If the sources in your project are distributed over multiple directories +# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy +# in the documentation. The default is NO. + +SHOW_DIRECTORIES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. +# This will remove the Files entry from the Quick Index and from the +# Folder Tree View (if specified). The default is YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the +# Namespaces page. This will remove the Namespaces entry from the Quick Index +# and from the Folder Tree View (if specified). The default is YES. 
+ +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command <command> <input-file>, where <command> is the value of +# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file +# provided by doxygen. Whatever the program writes to standard output +# is used as the file version. See the manual for examples. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed by +# doxygen. The layout file controls the global structure of the generated output files +# in an output format independent way. The create the layout file that represents +# doxygen's defaults, run doxygen with the -l option. You can optionally specify a +# file name after the option, if omitted DoxygenLayout.xml will be used as the name +# of the layout file. + +LAYOUT_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated +# by doxygen. Possible values are YES and NO. If left blank NO is used. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated by doxygen. Possible values are YES and NO. If left blank +# NO is used. + +WARNINGS = YES + +# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings +# for undocumented members. If EXTRACT_ALL is set to YES then this flag will +# automatically be disabled. 
+ +WARN_IF_UNDOCUMENTED = YES + +# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some +# parameters in a documented function, or documenting parameters that +# don't exist or using markup commands wrongly. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for +# functions that are documented, but have no documentation for their parameters +# or return value. If set to NO (the default) doxygen will only warn about +# wrong or incomplete parameter documentation, but not about the absence of +# documentation. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that +# doxygen can produce. The string should contain the $file, $line, and $text +# tags, which will be replaced by the file and line number from which the +# warning originated and the warning text. Optionally the format may contain +# $version, which will be replaced by the version of the file (if it could +# be obtained via FILE_VERSION_FILTER) + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning +# and error messages should be written. If left blank the output is written +# to stderr. + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag can be used to specify the files and/or directories that contain +# documented source files. You may enter file names like "myfile.cpp" or +# directories like "/usr/src/myproject". Separate the files or directories +# with spaces. + +INPUT = src/api + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is +# also the default input encoding. 
Doxygen uses libiconv (or the iconv built +# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for +# the list of possible encodings. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank the following patterns are tested: +# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx +# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90 + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.d \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.idl \ + *.odl \ + *.cs \ + *.php \ + *.php3 \ + *.inc \ + *.m \ + *.mm \ + *.dox \ + *.py \ + *.f90 \ + *.f \ + *.vhd \ + *.vhdl + +# The RECURSIVE tag can be used to specify whether or not subdirectories +# should be searched for input files as well. Possible values are YES and NO. +# If left blank NO is used. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. + +EXCLUDE = src/api/internal + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix filesystem feature) are excluded +# from the input. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. 
Note that the wildcards are matched +# against the file with absolute path, so to exclude all test directories +# for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test + +EXCLUDE_SYMBOLS = BamTools::Internal \ + BamTools::BamAlignment::BamAlignmentSupportData \ + BamTools::RaiiBuffer \ + UsesCharData \ + sort_helper \ + AlignmentSortBase + +# The EXAMPLE_PATH tag can be used to specify one or more files or +# directories that contain example code fragments that are included (see +# the \include command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank all files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude +# commands irrespective of the value of the RECURSIVE tag. +# Possible values are YES and NO. If left blank NO is used. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or +# directories that contain images that are included in the documentation (see +# the \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command <filter> <input-file>, where <filter> +# is the value of the INPUT_FILTER tag, and <input-file> is the name of an +# input file. 
Doxygen will then use the output that the filter program writes +# to standard output. If FILTER_PATTERNS is specified, this tag will be +# ignored. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: +# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further +# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER +# is applied to all files. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will be used to filter the input files when producing source +# files to browse (i.e. when SOURCE_BROWSER is set to YES). + +FILTER_SOURCE_FILES = NO + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C and C++ comments will always remain visible. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES +# then for each documented function all documented +# functions referencing it will be listed. 
+ +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES +# then for each documented function all documented entities +# called/used by that function will be listed. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) +# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from +# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will +# link to the source code. Otherwise they will link to the documentation. + +REFERENCES_LINK_SOURCE = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser. The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. + +ALPHABETICAL_INDEX = NO + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. 
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES (the default) Doxygen will +# generate HTML output. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `html' will be used as the default path. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for +# each generated HTML page (for example: .htm,.php,.asp). If it is left blank +# doxygen will generate files with .html extension. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a personal HTML header for +# each generated HTML page. If it is left blank doxygen will generate a +# standard header. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. If the tag is left blank doxygen +# will generate a default style sheet. Note that doxygen will try to copy +# the style sheet file to the HTML output directory, so don't put your own +# stylesheet in the HTML output directory as well, or it will be erased! + +HTML_STYLESHEET = + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. 
Setting +# this to NO can help when comparing the output of multiple runs. + +HTML_TIMESTAMP = YES + +# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, +# files or namespaces will be aligned in HTML using tables. If set to +# NO a bullet list will be used. + +HTML_ALIGN_MEMBERS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. For this to work a browser that supports +# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox +# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). + +HTML_DYNAMIC_SECTIONS = YES + +# If the GENERATE_DOCSET tag is set to YES, additional index files +# will be generated that can be used as input for Apple's Xcode 3 +# integrated development environment, introduced with OSX 10.5 (Leopard). +# To create a documentation set, doxygen will generate a Makefile in the +# HTML output directory. Running make will produce the docset in that +# directory and running "make install" will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find +# it at startup. +# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more information. + +GENERATE_DOCSET = NO + +# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the +# feed. A documentation feed provides an umbrella under which multiple +# documentation sets from a single provider (such as a company or product suite) +# can be grouped. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that +# should uniquely identify the documentation set bundle. This should be a +# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen +# will append .docset to the name. 
+ +DOCSET_BUNDLE_ID = org.doxygen.Project + +# If the GENERATE_HTMLHELP tag is set to YES, additional index files +# will be generated that can be used as input for tools like the +# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) +# of the generated HTML documentation. + +GENERATE_HTMLHELP = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. + +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. + +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). + +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING +# is used to encode HtmlHelp index (hhk), content (hhc) and project file +# content. + +CHM_INDEX_ENCODING = + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and QHP_VIRTUAL_FOLDER +# are set, an additional index file will be generated that can be used as input for +# Qt's qhelpgenerator to generate a Qt Compressed Help (.qch) of the generated +# HTML documentation. 
+ +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can +# be used to specify the file name of the resulting .qch file. +# The path specified is relative to the HTML output folder. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#namespace + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#virtual-folders + +QHP_VIRTUAL_FOLDER = doc + +# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to add. +# For more information please see +# http://doc.trolltech.com/qthelpproject.html#custom-filters + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the custom filter to add. For more information please see +# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">Qt Help Project / Custom Filters</a>. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this project's +# filter section matches. +# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">Qt Help Project / Filter Attributes</a>. + +QHP_SECT_FILTER_ATTRS = + +# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can +# be used to specify the location of Qt's qhelpgenerator. +# If non-empty doxygen will try to run qhelpgenerator on the generated +# .qhp file. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files +# will be generated, which together with the HTML files, form an Eclipse help +# plugin. 
To install this plugin and make it available under the help contents +# menu in Eclipse, the contents of the directory containing the HTML and XML +# files needs to be copied into the plugins directory of eclipse. The name of +# the directory within the plugins directory should be the same as +# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before +# the help appears. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have +# this name. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index at +# top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it. + +DISABLE_INDEX = NO + +# This tag can be used to set the number of enum values (range [1..20]) +# that doxygen will group on one line in the generated HTML documentation. + +ENUM_VALUES_PER_LINE = 4 + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. +# If the tag value is set to YES, a side panel will be generated +# containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). +# Windows users are probably better off using the HTML help feature. + +GENERATE_TREEVIEW = NO + +# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, +# and Class Hierarchy pages using a tree view instead of an ordered list. + +USE_INLINE_TREES = NO + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. 
+ +TREEVIEW_WIDTH = 250 + +# Use this tag to change the font size of Latex formulas included +# as images in the HTML documentation. The default is 10. Note that +# when you change the font size after a successful doxygen run you need +# to manually remove any form_*.png images from the HTML output directory +# to force them to be regenerated. + +FORMULA_FONTSIZE = 10 + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box +# for the HTML output. The underlying search engine uses javascript +# and DHTML and should work on any modern browser. Note that when using +# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets +# (GENERATE_DOCSET) there is already a search function so this one should +# typically be disabled. For large projects the javascript based search engine +# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. + +SEARCHENGINE = YES + +# When the SERVER_BASED_SEARCH tag is enabled the search engine will be +# implemented using a PHP enabled web server instead of at the web client +# using Javascript. Doxygen will generate the search PHP script and index +# file to put on the web server. The advantage of the server +# based approach is that it scales better to large projects and allows +# full text search. The disadvantage is that it is more difficult to setup +# and does not have live searching capabilities. + +SERVER_BASED_SEARCH = NO + +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. + +GENERATE_LATEX = NO + +# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. 
+ +LATEX_OUTPUT = latex + +# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be +# invoked. If left blank `latex' will be used as the default command name. +# Note that when enabling USE_PDFLATEX this option is only used for +# generating bitmaps for formulas in the HTML output, but not in the +# Makefile that is written to the output directory. + +LATEX_CMD_NAME = latex + +# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to +# generate index for LaTeX. If left blank `makeindex' will be used as the +# default command name. + +MAKEINDEX_CMD_NAME = makeindex + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# LaTeX documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = NO + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer. Possible values are: a4, a4wide, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = a4wide + +# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX +# packages that should be included in the LaTeX output. + +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal LaTeX header for +# the generated latex document. The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header. Notice: only use this tag if you know what you are doing! + +LATEX_HEADER = + +# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated +# is prepared for conversion to pdf (using ps2pdf). The pdf file will +# contain links (just like the HTML output) instead of page references +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = YES + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. Set this option to YES to get a +# higher quality PDF documentation. 
+ +USE_PDFLATEX = YES + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. +# command to the generated LaTeX files. This will instruct LaTeX to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +# If LATEX_HIDE_INDICES is set to YES then doxygen will not +# include the index chapters (such as File Index, Compound Index, etc.) +# in the output. + +LATEX_HIDE_INDICES = NO + +# If LATEX_SOURCE_CODE is set to YES then doxygen will include +# source code with syntax highlighting in the LaTeX output. +# Note that which sources are shown also depends on other settings +# such as SOURCE_BROWSER. + +LATEX_SOURCE_CODE = NO + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimized for Word 97 and may not look very pretty with +# other RTF readers or editors. + +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. + +RTF_OUTPUT = rtf + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. 
+ +RTF_HYPERLINKS = NO + +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. + +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. + +MAN_OUTPUT = man + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = .3 + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = NO + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. + +XML_OUTPUT = xml + +# The XML_SCHEMA tag can be used to specify an XML schema, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_SCHEMA = + +# The XML_DTD tag can be used to specify an XML DTD, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_DTD = + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. + +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. + +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. 
+ +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. This is useful +# if you want to understand what is going on. On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = NO + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = NO + +# If the SEARCH_INCLUDES tag is set to YES (the default) the include files +# in the INCLUDE_PATH (see below) will be searched if a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. 
+ +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. + +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. + +PREDEFINED = + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition. + +EXPAND_AS_DEFINED = + +# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then +# doxygen's preprocessor will remove all function-like macros that are alone +# on a line, have an all uppercase name, and do not end with a semicolon. Such +# function macros are typically used for boiler-plate code, and will confuse +# the parser if not removed. + +SKIP_FUNCTION_MACROS = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- + +# The TAGFILES option can be used to specify one or more tagfiles. +# Optionally an initial location of the external documentation +# can be added for each tagfile. The format of a tag file without +# this location is as follows: +# TAGFILES = file1 file2 ... 
+# Adding location for the tag files is done as follows: +# TAGFILES = file1=loc1 "file2 = loc2" ... +# where "loc1" and "loc2" can be relative or absolute paths or +# URLs. If a location is present for each tag, the installdox tool +# does not have to be run to correct the links. +# Note that each tag file must have a unique name +# (where the name does NOT include the path) +# If a tag file is not located in the directory in which doxygen +# is run, you must also specify the path to the tagfile here. + +TAGFILES = + +# When a file name is specified after GENERATE_TAGFILE, doxygen will create +# a tag file that is based on the input files it reads. + +GENERATE_TAGFILE = + +# If the ALLEXTERNALS tag is set to YES all external classes will be listed +# in the class index. If set to NO only the inherited external classes +# will be listed. + +ALLEXTERNALS = NO + +# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed +# in the modules index. If set to NO, only the current project's groups will +# be listed. + +EXTERNAL_GROUPS = NO + +# The PERL_PATH should be the absolute path and name of the perl script +# interpreter (i.e. the result of `which perl'). + +PERL_PATH = /usr/bin/perl + +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- + +# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will +# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base +# or super classes. Setting the tag to NO turns the diagrams off. Note that +# this option is superseded by the HAVE_DOT option below. This is only a +# fallback. It is recommended to install and use dot, since it yields more +# powerful graphs. + +CLASS_DIAGRAMS = NO + +# You can define message sequence charts within doxygen comments using the \msc +# command. 
Doxygen will then run the mscgen tool (see +# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the +# documentation. The MSCGEN_PATH tag allows you to specify the directory where +# the mscgen tool resides. If left empty the tool is assumed to be found in the +# default search path. + +MSCGEN_PATH = + +# If set to YES, the inheritance and collaboration graphs will hide +# inheritance and usage relations if the target is undocumented +# or is not a class. + +HIDE_UNDOC_RELATIONS = YES + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = NO + +# By default doxygen will write a font called FreeSans.ttf to the output +# directory and reference it in all dot files that doxygen generates. This +# font does not include all possible unicode characters however, so when you need +# these (or just want a differently looking font) you can specify the font name +# using DOT_FONTNAME. You need to make sure dot is able to find the font, +# which can be done by putting it in a standard location or by setting the +# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory +# containing the font. + +# DOT_FONTNAME = FreeSans + +# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. +# The default size is 10pt. + +DOT_FONTSIZE = 10 + +# By default doxygen will tell dot to use the output directory to look for the +# FreeSans.ttf font (which doxygen will put there itself). If you specify a +# different font using DOT_FONTNAME you can set the path where dot +# can find it using this tag. 
+ +DOT_FONTPATH = + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# CLASS_DIAGRAMS tag to NO. + +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = YES + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct group dependencies + +GROUP_GRAPHS = YES + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = NO + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances. + +TEMPLATE_RELATIONS = NO + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. + +INCLUDE_GRAPH = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and +# HAVE_DOT tags are set to YES then doxygen will generate a graph for each +# documented header file showing the documented files that directly or +# indirectly include this file. + +INCLUDED_BY_GRAPH = YES + +# If the CALL_GRAPH and HAVE_DOT options are set to YES then +# doxygen will generate a call dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. 
So in most cases it will be better to enable call graphs +# for selected functions only using the \callgraph command. + +CALL_GRAPH = NO + +# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then +# doxygen will generate a caller dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable caller +# graphs for selected functions only using the \callergraph command. + +CALLER_GRAPH = NO + +# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen +# will generate a graphical hierarchy of all classes instead of a textual one. + +GRAPHICAL_HIERARCHY = YES + +# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES +# then doxygen will show the dependencies a directory has on other directories +# in a graphical way. The dependency relations are determined by the #include +# relations between the files in the directories. + +DIRECTORY_GRAPH = YES + +# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images +# generated by dot. Possible values are png, jpg, or gif. +# If left blank png will be used. + +DOT_IMAGE_FORMAT = png + +# The tag DOT_PATH can be used to specify the path where the dot tool can be +# found. If left blank, it is assumed the dot tool can be found in the path. + +DOT_PATH = + +# The DOTFILE_DIRS tag can be used to specify one or more directories that +# contain dot files that are included in the documentation (see the +# \dotfile command). + +DOTFILE_DIRS = + +# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of +# nodes that will be shown in the graph. If the number of nodes in a graph +# becomes larger than this value, doxygen will truncate the graph, which is +# visualized by representing a node as a red box. 
Note that if the +# number of direct children of the root node in a graph is already larger than +# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note +# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. + +DOT_GRAPH_MAX_NODES = 50 + +# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the +# graphs generated by dot. A depth value of 3 means that only nodes reachable +# from the root by following a path via at most 3 edges will be shown. Nodes +# that lie further from the root node will be omitted. Note that setting this +# option to 1 or 2 may greatly reduce the computation time needed for large +# code bases. Also note that the size of a graph can be further restricted by +# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. + +MAX_DOT_GRAPH_DEPTH = 0 + +# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent +# background. This is disabled by default, because dot on Windows does not +# seem to support this out of the box. Warning: Depending on the platform used, +# enabling this option may lead to badly anti-aliased labels on the edges of +# a graph (i.e. they become hard to read). + +DOT_TRANSPARENT = NO + +# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output +# files in one run (i.e. multiple -o and -T options on the command line). This +# makes dot run faster, but since only newer versions of dot (>1.8.10) +# support this, this feature is disabled by default. + +DOT_MULTI_TARGETS = NO + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. + +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. 
+ +DOT_CLEANUP = YES diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..04ecf6e --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,20 @@ +# ========================== +# BamTools CMakeLists.txt +# (c) 2010 Derek Barnett +# +# src/ +# ========================== + +add_subdirectory( api ) +add_subdirectory( third_party ) +add_subdirectory( toolkit ) +add_subdirectory( utils ) + +# export shared headers +include( ExportHeader.cmake ) +set( SharedIncludeDir "shared" ) +ExportHeader( SharedHeaders shared/bamtools_global.h ${SharedIncludeDir} ) + +# configure and install pkg-config file +configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/bamtools.pc.in ${CMAKE_CURRENT_BINARY_DIR}/bamtools-1.pc @ONLY ) +install( FILES ${CMAKE_CURRENT_BINARY_DIR}/bamtools-1.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig ) diff --git a/src/ExportHeader.cmake b/src/ExportHeader.cmake new file mode 100644 index 0000000..d62a5bc --- /dev/null +++ b/src/ExportHeader.cmake @@ -0,0 +1,27 @@ +# +# ExportHeader +# + +function( ExportHeader MODULE FILE DEST ) + + # if haven't defined our custom 'build target' + # not exactly a build target, but lets this command get + # checked any time build step happens + if( NOT TARGET ${MODULE} ) + add_custom_target( ${MODULE} ALL COMMENT "Exporting ${MODULE}" ) + endif( NOT TARGET ${MODULE} ) + + # get the filename (without path) + get_filename_component( FILENAME "${FILE}" NAME ) + + # copy header to destination + add_custom_command( TARGET ${MODULE} COMMAND + ${CMAKE_COMMAND} -E copy_if_different + "${CMAKE_CURRENT_SOURCE_DIR}/${FILE}" + "${CMAKE_CURRENT_BINARY_DIR}/include/${DEST}/${FILENAME}" ) + + # make sure files are properly 'installed' + install( FILES "${FILE}" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/bamtools/${DEST}" ) + +endfunction( ExportHeader ) + diff --git a/src/api/BamAlgorithms.h b/src/api/BamAlgorithms.h new file mode 100644 index 0000000..7f4b36f --- /dev/null +++ b/src/api/BamAlgorithms.h @@ -0,0 +1,21 @@ 
+// *************************************************************************** +// BamAlgorithms.h (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides generic algorithms that are intended to work with BamTools data +// structures. Where possible, these are intended to be STL-compatible. +// *************************************************************************** + +#ifndef BAMALGORITHMS_H +#define BAMALGORITHMS_H + +#include "api/algorithms/Sort.h" + +/*! \namespace BamTools::Algorithms + \brief Provides convenient classes & methods for working with BAM data +*/ + +#endif // BAMALGORITHMS_H diff --git a/src/api/BamAlignment.cpp b/src/api/BamAlignment.cpp new file mode 100644 index 0000000..8173dcf --- /dev/null +++ b/src/api/BamAlignment.cpp @@ -0,0 +1,1127 @@ +// *************************************************************************** +// BamAlignment.cpp (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 4 December 2012 (DB) +// --------------------------------------------------------------------------- +// Provides the BamAlignment data structure +// *************************************************************************** + +#include "api/BamAlignment.h" +#include "api/BamConstants.h" +using namespace BamTools; + +#include <cstddef> + +/*! \class BamTools::BamAlignment + \brief The main BAM alignment data structure. + + Provides methods to query/modify BAM alignment data fields. +*/ +/*! \var BamAlignment::Name + \brief read name +*/ +/*! \var BamAlignment::Length + \brief length of query sequence +*/ +/*! 
\var BamAlignment::QueryBases + \brief 'original' sequence (as reported from sequencing machine) + + \note Setting this field to "*" indicates that the sequence is not to be stored on output. + In this case, the contents of the Qualities field should be invalidated as well (cleared or marked as "*"). +*/ +/*! \var BamAlignment::AlignedBases + \brief 'aligned' sequence (includes any indels, padding, clipping) + + This field will be completely empty after reading from BamReader/BamMultiReader when + QueryBases is empty. +*/ +/*! \var BamAlignment::Qualities + \brief FASTQ qualities (ASCII characters, not numeric values) + + \note Setting this field to "*" indicates to BamWriter that the quality scores are not to be stored, + but instead will be output as a sequence of '0xFF'. Otherwise, QueryBases must not be a "*" and + the length of this field should equal the length of QueryBases. +*/ +/*! \var BamAlignment::TagData + \brief tag data (use the provided methods to query/modify) +*/ +/*! \var BamAlignment::RefID + \brief ID number for reference sequence +*/ +/*! \var BamAlignment::Position + \brief position (0-based) where alignment starts +*/ +/*! \var BamAlignment::Bin + \brief BAM (standard) index bin number for this alignment +*/ +/*! \var BamAlignment::MapQuality + \brief mapping quality score +*/ +/*! \var BamAlignment::AlignmentFlag + \brief alignment bit-flag (use the provided methods to query/modify) +*/ +/*! \var BamAlignment::CigarData + \brief CIGAR operations for this alignment +*/ +/*! \var BamAlignment::MateRefID + \brief ID number for reference sequence where alignment's mate was aligned +*/ +/*! \var BamAlignment::MatePosition + \brief position (0-based) where alignment's mate starts +*/ +/*! \var BamAlignment::InsertSize + \brief mate-pair insert size +*/ +/*! \var BamAlignment::Filename + \brief name of BAM file which this alignment comes from +*/ + +/*! 
\fn BamAlignment::BamAlignment() + \brief constructor +*/ +BamAlignment::BamAlignment() + : Length(0) + , RefID(-1) + , Position(-1) + , Bin(0) + , MapQuality(0) + , AlignmentFlag(0) + , MateRefID(-1) + , MatePosition(-1) + , InsertSize(0) +{} + +/*! \fn BamAlignment::BamAlignment(const BamAlignment& other) + \brief copy constructor +*/ +BamAlignment::BamAlignment(const BamAlignment& other) + : Name(other.Name) + , Length(other.Length) + , QueryBases(other.QueryBases) + , AlignedBases(other.AlignedBases) + , Qualities(other.Qualities) + , TagData(other.TagData) + , RefID(other.RefID) + , Position(other.Position) + , Bin(other.Bin) + , MapQuality(other.MapQuality) + , AlignmentFlag(other.AlignmentFlag) + , CigarData(other.CigarData) + , MateRefID(other.MateRefID) + , MatePosition(other.MatePosition) + , InsertSize(other.InsertSize) + , Filename(other.Filename) + , SupportData(other.SupportData) +{} + +/*! \fn BamAlignment::~BamAlignment() + \brief destructor +*/ +BamAlignment::~BamAlignment() {} + +/*! \fn bool BamAlignment::BuildCharData() + \brief Populates alignment string fields (read name, bases, qualities, tag data). + + An alignment retrieved using BamReader::GetNextAlignmentCore() lacks this data. + Using that method makes parsing much quicker when only positional data is required. + + However, if you later want to access the character data fields from such an alignment, + use this method to populate those fields. Provides ability to do 'lazy evaluation' of + alignment parsing. 
+ + \return \c true if character data populated successfully (or was already available to begin with) +*/ +bool BamAlignment::BuildCharData() +{ + + // skip if char data already parsed + if (!SupportData.HasCoreOnly) return true; + + // check system endianness + bool IsBigEndian = BamTools::SystemIsBigEndian(); + + // calculate character lengths/offsets + const unsigned int dataLength = SupportData.BlockLength - Constants::BAM_CORE_SIZE; + const unsigned int seqDataOffset = + SupportData.QueryNameLength + (SupportData.NumCigarOperations * 4); + const unsigned int qualDataOffset = seqDataOffset + (SupportData.QuerySequenceLength + 1) / 2; + const unsigned int tagDataOffset = qualDataOffset + SupportData.QuerySequenceLength; + const unsigned int tagDataLength = dataLength - tagDataOffset; + + // check offsets to see what char data exists + const bool hasSeqData = (seqDataOffset < qualDataOffset); + const bool hasQualData = (qualDataOffset < tagDataOffset); + const bool hasTagData = (tagDataOffset < dataLength); + + // store alignment name (relies on null char in name as terminator) + Name.assign(SupportData.AllCharData.data()); + + // save query sequence + QueryBases.clear(); + if (hasSeqData) { + const char* seqData = SupportData.AllCharData.data() + seqDataOffset; + QueryBases.reserve(SupportData.QuerySequenceLength); + for (std::size_t i = 0; i < SupportData.QuerySequenceLength; ++i) { + const char singleBase = + Constants::BAM_DNA_LOOKUP[((seqData[(i / 2)] >> (4 * (1 - (i % 2)))) & 0xf)]; + QueryBases.append(1, singleBase); + } + } + + // save qualities + + Qualities.clear(); + if (hasQualData) { + const char* qualData = SupportData.AllCharData.data() + qualDataOffset; + + // if marked as unstored (sequence of 0xFF) - don't do conversion, just fill with 0xFFs + if (qualData[0] == (char)0xFF) + Qualities.resize(SupportData.QuerySequenceLength, (char)0xFF); + + // otherwise convert from numeric QV to 'FASTQ-style' ASCII character + else { + 
Qualities.reserve(SupportData.QuerySequenceLength); + for (std::size_t i = 0; i < SupportData.QuerySequenceLength; ++i) + Qualities.append(1, qualData[i] + 33); + } + } + + // clear previous AlignedBases + AlignedBases.clear(); + + // if QueryBases has data, build AlignedBases using CIGAR data + // otherwise, AlignedBases will remain empty (this case IS allowed) + if (!QueryBases.empty() && QueryBases != "*") { + + // resize AlignedBases + AlignedBases.reserve(SupportData.QuerySequenceLength); + + // iterate over CigarOps + int k = 0; + std::vector<CigarOp>::const_iterator cigarIter = CigarData.begin(); + std::vector<CigarOp>::const_iterator cigarEnd = CigarData.end(); + for (; cigarIter != cigarEnd; ++cigarIter) { + const CigarOp& op = (*cigarIter); + + switch (op.Type) { + + // for 'M', 'I', '=', 'X' - write bases + case (Constants::BAM_CIGAR_MATCH_CHAR): + case (Constants::BAM_CIGAR_INS_CHAR): + case (Constants::BAM_CIGAR_SEQMATCH_CHAR): + case (Constants::BAM_CIGAR_MISMATCH_CHAR): + AlignedBases.append(QueryBases.substr(k, op.Length)); + // fall through + + // for 'S' - soft clip, do not write bases + // but increment placeholder 'k' + case (Constants::BAM_CIGAR_SOFTCLIP_CHAR): + k += op.Length; + break; + + // for 'D' - write gap character + case (Constants::BAM_CIGAR_DEL_CHAR): + AlignedBases.append(op.Length, Constants::BAM_DNA_DEL); + break; + + // for 'P' - write padding character + case (Constants::BAM_CIGAR_PAD_CHAR): + AlignedBases.append(op.Length, Constants::BAM_DNA_PAD); + break; + + // for 'N' - write N's, skip bases in original query sequence + case (Constants::BAM_CIGAR_REFSKIP_CHAR): + AlignedBases.append(op.Length, Constants::BAM_DNA_N); + break; + + // for 'H' - hard clip, do nothing to AlignedBases, move to next op + case (Constants::BAM_CIGAR_HARDCLIP_CHAR): + break; + + // invalid CIGAR op-code + default: + const std::string message = + std::string("invalid CIGAR operation type: ") + op.Type; + SetErrorString("BamAlignment::BuildCharData", 
message); + return false; + } + } + } + + // save tag data + TagData.clear(); + if (hasTagData) { + + char* tagData = (((char*)SupportData.AllCharData.data()) + tagDataOffset); + + if (IsBigEndian) { + std::size_t i = 0; + while (i < tagDataLength) { + + i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.) + const char type = tagData[i]; // get tag type at position i + ++i; // move i past tag type + + switch (type) { + + case (Constants::BAM_TAG_TYPE_ASCII): + case (Constants::BAM_TAG_TYPE_INT8): + case (Constants::BAM_TAG_TYPE_UINT8): + // no endian swapping necessary for single-byte data + ++i; + break; + + case (Constants::BAM_TAG_TYPE_INT16): + case (Constants::BAM_TAG_TYPE_UINT16): + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + + case (Constants::BAM_TAG_TYPE_FLOAT): + case (Constants::BAM_TAG_TYPE_INT32): + case (Constants::BAM_TAG_TYPE_UINT32): + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + + case (Constants::BAM_TAG_TYPE_HEX): + case (Constants::BAM_TAG_TYPE_STRING): + // no endian swapping necessary for hex-string/string data + while (tagData[i]) + ++i; + // increment one more for null terminator + ++i; + break; + + case (Constants::BAM_TAG_TYPE_ARRAY): + + { + // read array type + const char arrayType = tagData[i]; + ++i; + + // swap endian-ness of number of elements in place, then retrieve for loop + BamTools::SwapEndian_32p(&tagData[i]); + uint32_t numElements; + memcpy(&numElements, &tagData[i], sizeof(uint32_t)); + i += sizeof(uint32_t); + + // swap endian-ness of array elements + for (std::size_t j = 0; j < numElements; ++j) { + switch (arrayType) { + case (Constants::BAM_TAG_TYPE_INT8): + case (Constants::BAM_TAG_TYPE_UINT8): + // no endian-swapping necessary + ++i; + break; + case (Constants::BAM_TAG_TYPE_INT16): + case (Constants::BAM_TAG_TYPE_UINT16): + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + case (Constants::BAM_TAG_TYPE_FLOAT): + 
case (Constants::BAM_TAG_TYPE_INT32): + case (Constants::BAM_TAG_TYPE_UINT32): + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + default: + const std::string message = + std::string("invalid binary array type: ") + arrayType; + SetErrorString("BamAlignment::BuildCharData", message); + return false; + } + } + + break; + } + + // invalid tag type-code + default: + const std::string message = std::string("invalid tag type: ") + type; + SetErrorString("BamAlignment::BuildCharData", message); + return false; + } + } + } + + // store tagData in alignment + TagData.resize(tagDataLength); + memcpy((char*)(TagData.data()), tagData, tagDataLength); + } + + // clear core-only flag & return success + SupportData.HasCoreOnly = false; + return true; +} + +/*! \fn bool BamAlignment::FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) const + \internal + + Searches for requested tag in BAM tag data. + + \param[in] tag requested 2-character tag name + \param[in,out] pTagData pointer to current position in BamAlignment::TagData + \param[in] tagDataLength length of BamAlignment::TagData + \param[in,out] numBytesParsed number of bytes parsed so far + + \return \c true if found + + \post If \a tag is found, \a pTagData will point to the byte where the tag data begins. + \a numBytesParsed will correspond to the position in the full TagData string. 
+ +*/ +bool BamAlignment::FindTag(const std::string& tag, char*& pTagData, + const unsigned int& tagDataLength, unsigned int& numBytesParsed) const +{ + + while (numBytesParsed < tagDataLength) { + + const char* pTagType = pTagData; + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + + // check the current tag, return true on match + if (strncmp(pTagType, tag.c_str(), 2) == 0) return true; + + // get the storage class and find the next tag + if (*pTagStorageType == '\0') return false; + if (!SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed)) return false; + if (*pTagData == '\0') return false; + } + + // checked all tags, none match + return false; +} + +/*! \fn bool BamAlignment::GetArrayTagType(const std::string& tag, char& type) const + \brief Retrieves the BAM tag type-code for the array elements associated with requested tag name. + + \param[in] tag 2-character tag name + \param[out] type retrieved (1-character) type-code + + \return \c true if found. False if not found, or if tag is not an array type. + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::GetArrayTagType(const std::string& tag, char& type) const +{ + + // skip if alignment is core-only + if (SupportData.HasCoreOnly) { + // TODO: set error string? + return false; + } + + // skip if no tags present + if (TagData.empty()) { + // TODO: set error string? + return false; + } + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag not found, return failure + if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) { + // TODO: set error string? 
+ return false; + } + + // check that tag type code is array + type = *(pTagData - 1); + if (type != Constants::BAM_TAG_TYPE_ARRAY) { + // TODO: set error string + return false; + } + + // fetch element type + const char elementType = *pTagData; + switch (elementType) { + + // allowable types + case (Constants::BAM_TAG_TYPE_INT8): + case (Constants::BAM_TAG_TYPE_UINT8): + case (Constants::BAM_TAG_TYPE_INT16): + case (Constants::BAM_TAG_TYPE_UINT16): + case (Constants::BAM_TAG_TYPE_INT32): + case (Constants::BAM_TAG_TYPE_UINT32): + case (Constants::BAM_TAG_TYPE_FLOAT): + type = elementType; + break; + + default: + //TODO: set error string + return false; + } + + // if we get here, return success + return true; +} + +/*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool closedInterval = false) const + \brief Calculates alignment end position, based on its starting position and CIGAR data. + + \warning The position returned now represents a zero-based, HALF-OPEN interval. + In previous versions of BamTools (0.x & 1.x) all intervals were treated + as zero-based, CLOSED. + + \param[in] usePadded Allow inserted bases to affect the reported position. Default is + false, so that reported position stays synced with reference + coordinates. + \param[in] closedInterval Setting this to true will return a 0-based end coordinate. Default is + false, so that his value represents a standard, half-open interval. 
+ + \return alignment end position +*/ +int BamAlignment::GetEndPosition(bool usePadded, bool closedInterval) const +{ + + // initialize alignment end to starting position + int alignEnd = Position; + + // iterate over cigar operations + std::vector<CigarOp>::const_iterator cigarIter = CigarData.begin(); + std::vector<CigarOp>::const_iterator cigarEnd = CigarData.end(); + for (; cigarIter != cigarEnd; ++cigarIter) { + const CigarOp& op = (*cigarIter); + + switch (op.Type) { + + // increase end position on CIGAR chars [DMXN=] + case Constants::BAM_CIGAR_DEL_CHAR: + case Constants::BAM_CIGAR_MATCH_CHAR: + case Constants::BAM_CIGAR_MISMATCH_CHAR: + case Constants::BAM_CIGAR_REFSKIP_CHAR: + case Constants::BAM_CIGAR_SEQMATCH_CHAR: + alignEnd += op.Length; + break; + + // increase end position on insertion, only if @usePadded is true + case Constants::BAM_CIGAR_INS_CHAR: + if (usePadded) alignEnd += op.Length; + break; + + // all other CIGAR chars do not affect end position + default: + break; + } + } + + // adjust for closedInterval, if requested + if (closedInterval) alignEnd -= 1; + + // return result + return alignEnd; +} + +/*! \fn std::string BamAlignment::GetErrorString() const + \brief Returns a human-readable description of the last error that occurred + + This method allows elimination of STDERR pollution. Developers of client code + may choose how the messages are displayed to the user, if at all. + + \return error description +*/ +std::string BamAlignment::GetErrorString() const +{ + return ErrorString; +} + +/*! \fn bool BamAlignment::GetSoftClips(std::vector<int>& clipSizes, std::vector<int>& readPositions, std::vector<int>& genomePositions, bool usePadded = false) const + \brief Identifies if an alignment has a soft clip. If so, identifies the + sizes of the soft clips, as well as their positions in the read and reference. 
+ + \param[out] clipSizes vector of the sizes of each soft clip in the alignment + \param[out] readPositions vector of the 0-based read locations of each soft clip in the alignment. + These positions are basically indexes within the read, not genomic positions. + \param[out] genomePositions vector of the 0-based genome locations of each soft clip in the alignment + \param[in] usePadded inserted bases affect reported position. Default is false, so that + reported position stays 'sync-ed' with reference coordinates. + + \return \c true if any soft clips were found in the alignment +*/ +bool BamAlignment::GetSoftClips(std::vector<int>& clipSizes, std::vector<int>& readPositions, + std::vector<int>& genomePositions, bool usePadded) const +{ + // initialize positions & flags + int refPosition = Position; + int readPosition = 0; + bool softClipFound = false; + bool firstCigarOp = true; + + // iterate over cigar operations + std::vector<CigarOp>::const_iterator cigarIter = CigarData.begin(); + std::vector<CigarOp>::const_iterator cigarEnd = CigarData.end(); + for (; cigarIter != cigarEnd; ++cigarIter) { + const CigarOp& op = (*cigarIter); + + switch (op.Type) { + + // increase both read & genome positions on CIGAR chars [DMXN=] + case Constants::BAM_CIGAR_DEL_CHAR: + case Constants::BAM_CIGAR_MATCH_CHAR: + case Constants::BAM_CIGAR_MISMATCH_CHAR: + case Constants::BAM_CIGAR_REFSKIP_CHAR: + case Constants::BAM_CIGAR_SEQMATCH_CHAR: + refPosition += op.Length; + readPosition += op.Length; + break; + + // increase read position on insertion, genome position only if @usePadded is true + case Constants::BAM_CIGAR_INS_CHAR: + readPosition += op.Length; + if (usePadded) refPosition += op.Length; + break; + + case Constants::BAM_CIGAR_SOFTCLIP_CHAR: + + softClipFound = true; + + ////////////////////////////////////////////////////////////////////////////// + // if we are dealing with the *first* CIGAR operation + // for this alignment, we increment the read position so that + // 
the read and genome position of the clip are referring to the same base. + // For example, in the alignment below, the ref position would be 4, yet + // the read position would be 0. Thus, to "sync" the two, + // we need to increment the read position by the length of the + // soft clip. + // Read: ATCGTTTCGTCCCTGC + // Ref: GGGATTTCGTCCCTGC + // Cigar: SSSSMMMMMMMMMMMM + // + // NOTE: This only needs to be done if the soft clip is the _first_ CIGAR op. + ////////////////////////////////////////////////////////////////////////////// + if (firstCigarOp) readPosition += op.Length; + + // track the soft clip's size, read position, and genome position + clipSizes.push_back(op.Length); + readPositions.push_back(readPosition); + genomePositions.push_back(refPosition); + + // any other CIGAR operations have no effect + default: + break; + } + + // clear our "first pass" flag + firstCigarOp = false; + } + + // return whether any soft clips found + return softClipFound; +} + +/*! \fn std::vector<std::string> BamAlignment::GetTagNames() const + \brief Retrieves the BAM tag names. + + When paired with GetTagType() and GetTag(), this method allows you + to iterate over an alignment's tag data without knowing the names (or types) + beforehand. + + \return \c vector containing all tag names found (empty if none available) + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. 
+*/ +std::vector<std::string> BamAlignment::GetTagNames() const +{ + + std::vector<std::string> result; + if (SupportData.HasCoreOnly || TagData.empty()) return result; + + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + while (numBytesParsed < tagDataLength) { + + // get current tag name & type + const char* pTagName = pTagData; + const char* pTagType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + + // store tag name + result.push_back(std::string(pTagName, 2)); + + // find the next tag + if (*pTagType == '\0') break; + if (!SkipToNextTag(*pTagType, pTagData, numBytesParsed)) break; + if (*pTagData == '\0') break; + } + + return result; +} + +/*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const + \brief Retrieves the BAM tag type-code associated with requested tag name. + + \param[in] tag 2-character tag name + \param[out] type retrieved (1-character) type-code + + \return \c true if found + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::GetTagType(const std::string& tag, char& type) const +{ + + // skip if alignment is core-only + if (SupportData.HasCoreOnly) { + // TODO: set error string? + return false; + } + + // skip if no tags present + if (TagData.empty()) { + // TODO: set error string? + return false; + } + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag not found, return failure + if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) { + // TODO: set error string? 
+ return false; + } + + // otherwise, retrieve & validate tag type code + type = *(pTagData - 1); + switch (type) { + case (Constants::BAM_TAG_TYPE_ASCII): + case (Constants::BAM_TAG_TYPE_INT8): + case (Constants::BAM_TAG_TYPE_UINT8): + case (Constants::BAM_TAG_TYPE_INT16): + case (Constants::BAM_TAG_TYPE_UINT16): + case (Constants::BAM_TAG_TYPE_INT32): + case (Constants::BAM_TAG_TYPE_UINT32): + case (Constants::BAM_TAG_TYPE_FLOAT): + case (Constants::BAM_TAG_TYPE_STRING): + case (Constants::BAM_TAG_TYPE_HEX): + case (Constants::BAM_TAG_TYPE_ARRAY): + return true; + + // unknown tag type + default: + const std::string message = std::string("invalid tag type: ") + type; + SetErrorString("BamAlignment::GetTagType", message); + return false; + } +} + +/*! \fn bool BamAlignment::HasTag(const std::string& tag) const + \brief Returns true if alignment has a record for requested tag. + + \param[in] tag 2-character tag name + \return \c true if alignment has a record for tag +*/ +bool BamAlignment::HasTag(const std::string& tag) const +{ + + // return false if no tag data present + if (SupportData.HasCoreOnly || TagData.empty()) return false; + + // localize the tag data for lookup + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if result of tag lookup + return FindTag(tag, pTagData, tagDataLength, numBytesParsed); +} + +/*! \fn bool BamAlignment::IsDuplicate() const + \return \c true if this read is a PCR duplicate +*/ +bool BamAlignment::IsDuplicate() const +{ + return ((AlignmentFlag & Constants::BAM_ALIGNMENT_DUPLICATE) != 0); +} + +/*! \fn bool BamAlignment::IsFailedQC() const + \return \c true if this read failed quality control +*/ +bool BamAlignment::IsFailedQC() const +{ + return ((AlignmentFlag & Constants::BAM_ALIGNMENT_QC_FAILED) != 0); +} + +/*! 
\fn bool BamAlignment::IsFirstMate() const + \return \c true if alignment is first mate on paired-end read +*/ +bool BamAlignment::IsFirstMate() const +{ + return ((AlignmentFlag & Constants::BAM_ALIGNMENT_READ_1) != 0); +} + +/*! \fn bool BamAlignment::IsMapped() const + \return \c true if alignment is mapped +*/ +bool BamAlignment::IsMapped() const +{ + return ((AlignmentFlag & Constants::BAM_ALIGNMENT_UNMAPPED) == 0); +} + +/*! \fn bool BamAlignment::IsMateMapped() const + \return \c true if alignment's mate is mapped +*/ +bool BamAlignment::IsMateMapped() const +{ + return ((AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_UNMAPPED) == 0); +} + +/*! \fn bool BamAlignment::IsMateReverseStrand() const + \return \c true if alignment's mate mapped to reverse strand +*/ +bool BamAlignment::IsMateReverseStrand() const +{ + return ((AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND) != 0); +} + +/*! \fn bool BamAlignment::IsPaired() const + \return \c true if alignment part of paired-end read +*/ +bool BamAlignment::IsPaired() const +{ + return ((AlignmentFlag & Constants::BAM_ALIGNMENT_PAIRED) != 0); +} + +/*! \fn bool BamAlignment::IsPrimaryAlignment() const + \return \c true if reported position is primary alignment +*/ +bool BamAlignment::IsPrimaryAlignment() const +{ + return ((AlignmentFlag & Constants::BAM_ALIGNMENT_SECONDARY) == 0); +} + +/*! \fn bool BamAlignment::IsProperPair() const + \return \c true if alignment is part of read that satisfied paired-end resolution +*/ +bool BamAlignment::IsProperPair() const +{ + return ((AlignmentFlag & Constants::BAM_ALIGNMENT_PROPER_PAIR) != 0); +} + +/*! \fn bool BamAlignment::IsReverseStrand() const + \return \c true if alignment mapped to reverse strand +*/ +bool BamAlignment::IsReverseStrand() const +{ + return ((AlignmentFlag & Constants::BAM_ALIGNMENT_REVERSE_STRAND) != 0); +} + +/*! 
\fn bool BamAlignment::IsSecondMate() const + \return \c true if alignment is second mate on read +*/ +bool BamAlignment::IsSecondMate() const +{ + return ((AlignmentFlag & Constants::BAM_ALIGNMENT_READ_2) != 0); +} + +/*! \fn bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const + \internal + + Checks that tag name & type strings are expected sizes. + + \param tag[in] BAM tag name + \param type[in] BAM tag type-code + \return \c true if both input strings are valid sizes +*/ +bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const +{ + return (tag.size() == Constants::BAM_TAG_TAGSIZE) && + (type.size() == Constants::BAM_TAG_TYPESIZE); +} + +/*! \fn void BamAlignment::RemoveTag(const std::string& tag) + \brief Removes field from BAM tags. + + \param[in] tag 2-character name of field to remove +*/ +void BamAlignment::RemoveTag(const std::string& tag) +{ + + // if char data not populated, do that first + if (SupportData.HasCoreOnly) BuildCharData(); + + // skip if no tags available + if (TagData.empty()) return; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // skip if tag not found + if (!FindTag(tag, pTagData, originalTagDataLength, numBytesParsed)) return; + + // otherwise, remove it + RaiiBuffer newTagData(originalTagDataLength); + + // copy original tag data up til desired tag + pTagData -= 3; + numBytesParsed -= 3; + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData.Buffer, pOriginalTagData, numBytesParsed); + + // attemp to skip to next tag + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + if (SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed)) { + + // squeeze remaining tag 
data + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagDataLength = + originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData.Buffer + beginningTagDataLength, pTagData, endTagDataLength); + + // save modified tag data in alignment + TagData.assign(newTagData.Buffer, beginningTagDataLength + endTagDataLength); + } +} + +/*! \fn void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const + \internal + + Sets a formatted error string for this alignment. + + \param[in] where class/method where error occurred + \param[in] what description of error +*/ +void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const +{ + static const std::string SEPARATOR(": "); + ErrorString = where + SEPARATOR + what; +} + +/*! \fn void BamAlignment::SetIsDuplicate(bool ok) + \brief Sets value of "PCR duplicate" flag to \a ok. +*/ +void BamAlignment::SetIsDuplicate(bool ok) +{ + if (ok) + AlignmentFlag |= Constants::BAM_ALIGNMENT_DUPLICATE; + else + AlignmentFlag &= ~Constants::BAM_ALIGNMENT_DUPLICATE; +} + +/*! \fn void BamAlignment::SetIsFailedQC(bool ok) + \brief Sets "failed quality control" flag to \a ok. +*/ +void BamAlignment::SetIsFailedQC(bool ok) +{ + if (ok) + AlignmentFlag |= Constants::BAM_ALIGNMENT_QC_FAILED; + else + AlignmentFlag &= ~Constants::BAM_ALIGNMENT_QC_FAILED; +} + +/*! \fn void BamAlignment::SetIsFirstMate(bool ok) + \brief Sets "alignment is first mate" flag to \a ok. +*/ +void BamAlignment::SetIsFirstMate(bool ok) +{ + if (ok) + AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_1; + else + AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_1; +} + +/*! \fn void BamAlignment::SetIsMapped(bool ok) + \brief Sets "alignment is mapped" flag to \a ok. 
+*/ +void BamAlignment::SetIsMapped(bool ok) +{ + if (ok) + AlignmentFlag &= ~Constants::BAM_ALIGNMENT_UNMAPPED; + else + AlignmentFlag |= Constants::BAM_ALIGNMENT_UNMAPPED; +} + +/*! \fn void BamAlignment::SetIsMateMapped(bool ok) + \brief Sets "alignment's mate is mapped" flag to \a ok. +*/ +void BamAlignment::SetIsMateMapped(bool ok) +{ + if (ok) + AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_UNMAPPED; + else + AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_UNMAPPED; +} + +/*! \fn void BamAlignment::SetIsMateReverseStrand(bool ok) + \brief Sets "alignment's mate mapped to reverse strand" flag to \a ok. +*/ +void BamAlignment::SetIsMateReverseStrand(bool ok) +{ + if (ok) + AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND; + else + AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND; +} + +/*! \fn void BamAlignment::SetIsPaired(bool ok) + \brief Sets "alignment part of paired-end read" flag to \a ok. +*/ +void BamAlignment::SetIsPaired(bool ok) +{ + if (ok) + AlignmentFlag |= Constants::BAM_ALIGNMENT_PAIRED; + else + AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PAIRED; +} + +/*! \fn void BamAlignment::SetIsPrimaryAlignment(bool ok) + \brief Sets "position is primary alignment" flag to \a ok. +*/ +void BamAlignment::SetIsPrimaryAlignment(bool ok) +{ + if (ok) + AlignmentFlag &= ~Constants::BAM_ALIGNMENT_SECONDARY; + else + AlignmentFlag |= Constants::BAM_ALIGNMENT_SECONDARY; +} + +/*! \fn void BamAlignment::SetIsProperPair(bool ok) + \brief Sets "alignment is part of read that satisfied paired-end resolution" flag to \a ok. +*/ +void BamAlignment::SetIsProperPair(bool ok) +{ + if (ok) + AlignmentFlag |= Constants::BAM_ALIGNMENT_PROPER_PAIR; + else + AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PROPER_PAIR; +} + +/*! \fn void BamAlignment::SetIsReverseStrand(bool ok) + \brief Sets "alignment mapped to reverse strand" flag to \a ok. 
+*/ +void BamAlignment::SetIsReverseStrand(bool ok) +{ + if (ok) + AlignmentFlag |= Constants::BAM_ALIGNMENT_REVERSE_STRAND; + else + AlignmentFlag &= ~Constants::BAM_ALIGNMENT_REVERSE_STRAND; +} + +/*! \fn void BamAlignment::SetIsSecondMate(bool ok) + \brief Sets "alignment is second mate on read" flag to \a ok. +*/ +void BamAlignment::SetIsSecondMate(bool ok) +{ + if (ok) + AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_2; + else + AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_2; +} + +/*! \fn bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) const + \internal + + Moves to next available tag in tag data string + + \param[in] storageType BAM tag type-code that determines how far to move cursor + \param[in,out] pTagData pointer to current position (cursor) in tag string + \param[in,out] numBytesParsed report of how many bytes were parsed (cumulatively) + + \return \c if storageType was a recognized BAM tag type + + \post \a pTagData will point to the byte where the next tag data begins. + \a numBytesParsed will correspond to the cursor's position in the full TagData string. 
+*/ +bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData, + unsigned int& numBytesParsed) const +{ + switch (storageType) { + + case (Constants::BAM_TAG_TYPE_ASCII): + case (Constants::BAM_TAG_TYPE_INT8): + case (Constants::BAM_TAG_TYPE_UINT8): + ++numBytesParsed; + ++pTagData; + break; + + case (Constants::BAM_TAG_TYPE_INT16): + case (Constants::BAM_TAG_TYPE_UINT16): + numBytesParsed += sizeof(uint16_t); + pTagData += sizeof(uint16_t); + break; + + case (Constants::BAM_TAG_TYPE_FLOAT): + case (Constants::BAM_TAG_TYPE_INT32): + case (Constants::BAM_TAG_TYPE_UINT32): + numBytesParsed += sizeof(uint32_t); + pTagData += sizeof(uint32_t); + break; + + case (Constants::BAM_TAG_TYPE_STRING): + case (Constants::BAM_TAG_TYPE_HEX): + while (*pTagData) { + ++numBytesParsed; + ++pTagData; + } + // increment for null-terminator + ++numBytesParsed; + ++pTagData; + break; + + case (Constants::BAM_TAG_TYPE_ARRAY): + + { + // read array type + const char arrayType = *pTagData; + ++numBytesParsed; + ++pTagData; + + // read number of elements + int32_t numElements; + memcpy(&numElements, pTagData, sizeof(uint32_t)); // already endian-swapped, if needed + numBytesParsed += sizeof(uint32_t); + pTagData += sizeof(uint32_t); + + // calculate number of bytes to skip + int bytesToSkip = 0; + switch (arrayType) { + case (Constants::BAM_TAG_TYPE_INT8): + case (Constants::BAM_TAG_TYPE_UINT8): + bytesToSkip = numElements; + break; + case (Constants::BAM_TAG_TYPE_INT16): + case (Constants::BAM_TAG_TYPE_UINT16): + bytesToSkip = numElements * sizeof(uint16_t); + break; + case (Constants::BAM_TAG_TYPE_FLOAT): + case (Constants::BAM_TAG_TYPE_INT32): + case (Constants::BAM_TAG_TYPE_UINT32): + bytesToSkip = numElements * sizeof(uint32_t); + break; + default: + const std::string message = + std::string("invalid binary array type: ") + arrayType; + SetErrorString("BamAlignment::SkipToNextTag", message); + return false; + } + + // skip binary array contents + numBytesParsed += 
bytesToSkip; + pTagData += bytesToSkip; + break; + } + + default: + const std::string message = std::string("invalid tag type: ") + storageType; + SetErrorString("BamAlignment::SkipToNextTag", message); + return false; + } + + // if we get here, tag skipped OK - return success + return true; +} diff --git a/src/api/BamAlignment.h b/src/api/BamAlignment.h new file mode 100644 index 0000000..6491807 --- /dev/null +++ b/src/api/BamAlignment.h @@ -0,0 +1,644 @@ +// *************************************************************************** +// BamAlignment.h (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 25 July 2013 (DB) +// --------------------------------------------------------------------------- +// Provides the BamAlignment data structure +// *************************************************************************** + +#ifndef BAMALIGNMENT_H +#define BAMALIGNMENT_H + +#include <cstddef> +#include <cstdlib> +#include <cstring> +#include <string> +#include <vector> +#include "api/BamAux.h" +#include "api/BamConstants.h" +#include "api/api_global.h" + +namespace BamTools { + +//! \cond +// forward declaration of BamAlignment's "friends" +namespace Internal { +class BamReaderPrivate; +class BamWriterPrivate; +} // namespace Internal +//! 
\endcond + +// BamAlignment data structure +class API_EXPORT BamAlignment +{ + + // constructors & destructor +public: + BamAlignment(); + BamAlignment(const BamAlignment& other); + ~BamAlignment(); + + // queries against alignment flags +public: + bool IsDuplicate() const; // returns true if this read is a PCR duplicate + bool IsFailedQC() const; // returns true if this read failed quality control + bool IsFirstMate() const; // returns true if alignment is first mate on read + bool IsMapped() const; // returns true if alignment is mapped + bool IsMateMapped() const; // returns true if alignment's mate is mapped + bool IsMateReverseStrand() const; // returns true if alignment's mate mapped to reverse strand + bool IsPaired() const; // returns true if alignment part of paired-end read + bool IsPrimaryAlignment() const; // returns true if reported position is primary alignment + bool IsProperPair() + const; // returns true if alignment is part of read that satisfied paired-end resolution + bool IsReverseStrand() const; // returns true if alignment mapped to reverse strand + bool IsSecondMate() const; // returns true if alignment is second mate on read + + // manipulate alignment flags +public: + void SetIsDuplicate(bool ok); // sets value of "PCR duplicate" flag + void SetIsFailedQC(bool ok); // sets value of "failed quality control" flag + void SetIsFirstMate(bool ok); // sets value of "alignment is first mate" flag + void SetIsMapped(bool ok); // sets value of "alignment is mapped" flag + void SetIsMateMapped(bool ok); // sets value of "alignment's mate is mapped" flag + void SetIsMateReverseStrand( + bool ok); // sets value of "alignment's mate mapped to reverse strand" flag + void SetIsPaired(bool ok); // sets value of "alignment part of paired-end read" flag + void SetIsPrimaryAlignment(bool ok); // sets value of "position is primary alignment" flag + void SetIsProperPair( + bool + ok); // sets value of "alignment is part of read that satisfied paired-end 
resolution" flag + void SetIsReverseStrand(bool ok); // sets value of "alignment mapped to reverse strand" flag + void SetIsSecondMate(bool ok); // sets value of "alignment is second mate on read" flag + + // tag data access methods +public: + // add a new tag + template <typename T> + bool AddTag(const std::string& tag, const std::string& type, const T& value); + template <typename T> + bool AddTag(const std::string& tag, const std::vector<T>& values); + + // edit (or append) tag + template <typename T> + bool EditTag(const std::string& tag, const std::string& type, const T& value); + template <typename T> + bool EditTag(const std::string& tag, const std::vector<T>& values); + + // retrieves tag data + template <typename T> + bool GetTag(const std::string& tag, T& destination) const; + template <typename T> + bool GetTag(const std::string& tag, std::vector<T>& destination) const; + + // retrieves all current tag names + std::vector<std::string> GetTagNames() const; + + // retrieves the SAM/BAM type-code for requested tag name + bool GetTagType(const std::string& tag, char& type) const; + + // retrieves the SAM/BAM type-code for the data elements in an array tag + bool GetArrayTagType(const std::string& tag, char& type) const; + + // returns true if alignment has a record for this tag name + bool HasTag(const std::string& tag) const; + + // removes a tag + void RemoveTag(const std::string& tag); + + // additional methods +public: + // populates alignment string fields + bool BuildCharData(); + + // calculates alignment end position + int GetEndPosition(bool usePadded = false, bool closedInterval = false) const; + + // returns a description of the last error that occurred + std::string GetErrorString() const; + + // retrieves the size, read locations and reference locations of soft-clip operations + bool GetSoftClips(std::vector<int>& clipSizes, std::vector<int>& readPositions, + std::vector<int>& genomePositions, bool usePadded = false) const; + + // public data 
fields +public: + std::string Name; // read name + int32_t Length; // length of query sequence + std::string QueryBases; // 'original' sequence (contained in BAM file) + std::string + AlignedBases; // 'aligned' sequence (QueryBases plus deletion, padding, clipping chars) + std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values) + std::string TagData; // tag data (use provided methods to query/modify) + int32_t RefID; // ID number for reference sequence + int32_t Position; // position (0-based) where alignment starts + uint16_t Bin; // BAM (standard) index bin number for this alignment + uint16_t MapQuality; // mapping quality score + uint32_t AlignmentFlag; // alignment bit-flag (use provided methods to query/modify) + std::vector<CigarOp> CigarData; // CIGAR operations for this alignment + int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned + int32_t MatePosition; // position (0-based) where alignment's mate starts + int32_t InsertSize; // mate-pair insert size + std::string Filename; // name of BAM file which this alignment comes from + + //! 
\internal + // internal utility methods +private: + bool FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength, + unsigned int& numBytesParsed) const; + bool IsValidSize(const std::string& tag, const std::string& type) const; + void SetErrorString(const std::string& where, const std::string& what) const; + bool SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) const; + + // internal data +private: + struct BamAlignmentSupportData + { + + // data members + std::string AllCharData; + uint32_t BlockLength; + uint32_t NumCigarOperations; + uint32_t QueryNameLength; + uint32_t QuerySequenceLength; + bool HasCoreOnly; + + // constructor + BamAlignmentSupportData() + : BlockLength(0) + , NumCigarOperations(0) + , QueryNameLength(0) + , QuerySequenceLength(0) + , HasCoreOnly(false) + {} + }; + BamAlignmentSupportData SupportData; + friend class Internal::BamReaderPrivate; + friend class Internal::BamWriterPrivate; + + mutable std::string ErrorString; // mutable to allow updates even in logically const methods + //! \endinternal +}; + +// --------------------------------------------------------- +// BamAlignment tag access methods + +/*! \fn bool AddTag(const std::string& tag, const std::string& type, const T& value) + \brief Adds a field to the BAM tags. + + Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + + \param[in] tag 2-character tag name + \param[in] type 1-character tag type + \param[in] value data to store + \return \c true if the \b new tag was added successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +template <typename T> +inline bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const T& value) +{ + + // if char data not populated, do that first + if (SupportData.HasCoreOnly) BuildCharData(); + + // check tag/type size + if (!IsValidSize(tag, type)) { + // TODO: set error string? 
+ return false; + } + + // check that storage type code is OK for T + if (!TagTypeHelper<T>::CanConvertTo(type.at(0))) { + // TODO: set error string? + return false; + } + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) { + // TODO: set error string? + return false; + } + + // otherwise, convert value to string + union + { + T value; + char valueBuffer[sizeof(T)]; + } un; + un.value = value; + + // copy original tag data to temp buffer + const std::string newTag = tag + type; + const std::size_t newTagDataLength = + tagDataLength + newTag.size() + sizeof(T); // leave room for new T + RaiiBuffer originalTagData(newTagDataLength); + memcpy(originalTagData.Buffer, TagData.c_str(), + tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData.Buffer + tagDataLength, newTag.data()); + memcpy(originalTagData.Buffer + tagDataLength + newTag.size(), un.valueBuffer, sizeof(T)); + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData.Buffer; + TagData.assign(newTagData, newTagDataLength); + return true; +} + +template <> +inline bool BamAlignment::AddTag<std::string>(const std::string& tag, const std::string& type, + const std::string& value) +{ + // if char data not populated, do that first + if (SupportData.HasCoreOnly) BuildCharData(); + + // check tag/type size + if (!IsValidSize(tag, type)) { + // TODO: set error string? + return false; + } + + // check that storage type code is OK for string + if (!TagTypeHelper<std::string>::CanConvertTo(type.at(0))) { + // TODO: set error string? 
+ return false; + } + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) { + // TODO: set error string? + return false; + } + + // otherwise, copy tag data to temp buffer + const std::string newTag = tag + type + value; + const std::size_t newTagDataLength = + tagDataLength + newTag.size() + 1; // leave room for null-term + RaiiBuffer originalTagData(newTagDataLength); + memcpy(originalTagData.Buffer, TagData.c_str(), + tagDataLength + 1); // '+1' for TagData null-term + + // append newTag (removes original null-term, then appends newTag + null-term) + strcat(originalTagData.Buffer + tagDataLength, newTag.data()); + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData.Buffer; + TagData.assign(newTagData, newTagDataLength); + return true; +} + +/*! \fn template<typename T> bool AddTag(const std::string& tag, const std::vector<T>& values) + \brief Adds a numeric array field to the BAM tags. + + Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + + \param[in] tag 2-character tag name + \param[in] values vector of data values to store + \return \c true if the \b new tag was added successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. 
+*/ +template <typename T> +inline bool BamAlignment::AddTag(const std::string& tag, const std::vector<T>& values) +{ + + // if char data not populated, do that first + if (SupportData.HasCoreOnly) BuildCharData(); + + // check for valid tag name length + if (tag.size() != Constants::BAM_TAG_TAGSIZE) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) { + // TODO: set error string? + return false; + } + + // build new tag's base information + char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE]; + memcpy(newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE); + newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY; + newTagBase[3] = TagTypeHelper<T>::TypeCode(); + + // add number of array elements to newTagBase + const int32_t numElements = values.size(); + memcpy(newTagBase + 4, &numElements, sizeof(int32_t)); + + // copy current TagData string to temp buffer, leaving room for new tag's contents + const std::size_t newTagDataLength = + tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE + numElements * sizeof(T); + RaiiBuffer originalTagData(newTagDataLength); + memcpy(originalTagData.Buffer, TagData.c_str(), + tagDataLength + 1); // '+1' for TagData's null-term + + // write newTagBase (removes old null term) + strcat(originalTagData.Buffer + tagDataLength, (const char*)newTagBase); + + // add vector elements to tag + int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE; + for (int i = 0; i < numElements; ++i) { + const T& value = values.at(i); + memcpy(originalTagData.Buffer + elementsBeginOffset + i * sizeof(T), &value, sizeof(T)); + } + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData.Buffer; + TagData.assign(newTagData, newTagDataLength); + return 
true; +} + +/*! \fn template<typename T> bool EditTag(const std::string& tag, const std::string& type, const T& value) + \brief Edits a BAM tag field. + + If \a tag does not exist, a new entry is created. + + \param tag[in] 2-character tag name + \param type[in] 1-character tag type (must be "Z" or "H") + \param value[in] new data value + + \return \c true if the tag was modified/created successfully + + \sa BamAlignment::RemoveTag() + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +template <typename T> +inline bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const T& value) +{ + + // if char data not populated, do that first + if (SupportData.HasCoreOnly) BuildCharData(); + + // remove existing tag if present, then append tag with new value + if (HasTag(tag)) RemoveTag(tag); + return AddTag(tag, type, value); +} + +/*! \fn template<typename T> bool EditTag(const std::string& tag, const std::vector<T>& values) + \brief Edits a BAM tag field containing a numeric array. + + If \a tag does not exist, a new entry is created. + + \param tag[in] 2-character tag name + \param value[in] vector of data values + + \return \c true if the tag was modified/created successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +template <typename T> +inline bool BamAlignment::EditTag(const std::string& tag, const std::vector<T>& values) +{ + + // if char data not populated, do that first + if (SupportData.HasCoreOnly) BuildCharData(); + + // remove existing tag if present, then append tag with new values + if (HasTag(tag)) RemoveTag(tag); + return AddTag(tag, values); +} + +/*! \fn template<typename T> bool GetTag(const std::string& tag, T& destination) const + \brief Retrieves the value associated with a BAM tag. 
+ + \param tag[in] 2-character tag name + \param destination[out] retrieved value + \return \c true if found +*/ +template <typename T> +inline bool BamAlignment::GetTag(const std::string& tag, T& destination) const +{ + + // skip if alignment is core-only + if (SupportData.HasCoreOnly) { + // TODO: set error string? + return false; + } + + // skip if no tags present + if (TagData.empty()) { + // TODO: set error string? + return false; + } + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // return failure if tag not found + if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) { + // TODO: set error string? + return false; + } + + // fetch data type + const char type = *(pTagData - 1); + if (!TagTypeHelper<T>::CanConvertFrom(type)) { + // TODO: set error string ? + return false; + } + + // determine data length + int destinationLength = 0; + switch (type) { + + // 1 byte data + case (Constants::BAM_TAG_TYPE_ASCII): + case (Constants::BAM_TAG_TYPE_INT8): + case (Constants::BAM_TAG_TYPE_UINT8): + destinationLength = 1; + break; + + // 2 byte data + case (Constants::BAM_TAG_TYPE_INT16): + case (Constants::BAM_TAG_TYPE_UINT16): + destinationLength = 2; + break; + + // 4 byte data + case (Constants::BAM_TAG_TYPE_INT32): + case (Constants::BAM_TAG_TYPE_UINT32): + case (Constants::BAM_TAG_TYPE_FLOAT): + destinationLength = 4; + break; + + // var-length types not supported for numeric destination + case (Constants::BAM_TAG_TYPE_STRING): + case (Constants::BAM_TAG_TYPE_HEX): + case (Constants::BAM_TAG_TYPE_ARRAY): + SetErrorString("BamAlignment::GetTag", + "cannot store variable length tag data into a numeric destination"); + return false; + + // unrecognized tag type + default: + const std::string message = std::string("invalid tag type: ") + type; + SetErrorString("BamAlignment::GetTag", message); + return false; + } + + // store data in destination + 
destination = 0; + memcpy(&destination, pTagData, destinationLength); + + // return success + return true; +} + +template <> +inline bool BamAlignment::GetTag<std::string>(const std::string& tag, + std::string& destination) const +{ + // skip if alignment is core-only + if (SupportData.HasCoreOnly) { + // TODO: set error string? + return false; + } + + // skip if no tags present + if (TagData.empty()) { + // TODO: set error string? + return false; + } + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // return failure if tag not found + if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) { + // TODO: set error string? + return false; + } + + // otherwise copy data into destination + const unsigned int dataLength = strlen(pTagData); + destination.clear(); + destination.resize(dataLength); + memcpy((char*)destination.data(), pTagData, dataLength); + + // return success + return true; +} + +/*! \fn template<typename T> bool GetTag(const std::string& tag, std::vector<T>& destination) const + \brief Retrieves the numeric array associated with a BAM tag. + + \param tag[in] 2-character tag name + \param destination[out] retrieved values + \return \c true if found +*/ +template <typename T> +inline bool BamAlignment::GetTag(const std::string& tag, std::vector<T>& destination) const +{ + + // skip if alignment is core-only + if (SupportData.HasCoreOnly) { + // TODO: set error string? + return false; + } + + // skip if no tags present + if (TagData.empty()) { + // TODO: set error string? + return false; + } + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // return false if tag not found + if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) { + // TODO: set error string? 
+ return false; + } + + // check that tag is array type + const char tagType = *(pTagData - 1); + if (tagType != Constants::BAM_TAG_TYPE_ARRAY) { + SetErrorString("BamAlignment::GetTag", "cannot store a non-array tag in array destination"); + return false; + } + + // fetch element type + const char elementType = *pTagData; + if (!TagTypeHelper<T>::CanConvertFrom(elementType)) { + // TODO: set error string ? + return false; + } + ++pTagData; + + // calculate length of each element in tag's array + switch (elementType) { + case (Constants::BAM_TAG_TYPE_ASCII): + case (Constants::BAM_TAG_TYPE_INT8): + case (Constants::BAM_TAG_TYPE_UINT8): + break; + + case (Constants::BAM_TAG_TYPE_INT16): + case (Constants::BAM_TAG_TYPE_UINT16): + break; + + case (Constants::BAM_TAG_TYPE_INT32): + case (Constants::BAM_TAG_TYPE_UINT32): + case (Constants::BAM_TAG_TYPE_FLOAT): + break; + + // var-length types not supported for numeric destination + case (Constants::BAM_TAG_TYPE_STRING): + case (Constants::BAM_TAG_TYPE_HEX): + case (Constants::BAM_TAG_TYPE_ARRAY): + SetErrorString("BamAlignment::GetTag", + "invalid array data, variable-length elements are not allowed"); + return false; + + // unknown tag type + default: + const std::string message = std::string("invalid array element type: ") + elementType; + SetErrorString("BamAlignment::GetTag", message); + return false; + } + + // get number of elements + int32_t numElements; + memcpy(&numElements, pTagData, sizeof(int32_t)); + pTagData += 4; + destination.clear(); + destination.reserve(numElements); + + // read in elements + T value; + for (int i = 0; i < numElements; ++i) { + memcpy(&value, pTagData, sizeof(T)); + pTagData += sizeof(T); + destination.push_back(value); + } + + // return success + return true; +} + +typedef std::vector<BamAlignment> BamAlignmentVector; + +} // namespace BamTools + +#endif // BAMALIGNMENT_H diff --git a/src/api/BamAux.h b/src/api/BamAux.h new file mode 100644 index 0000000..e0f48f9 --- /dev/null +++ 
b/src/api/BamAux.h @@ -0,0 +1,519 @@ +// *************************************************************************** +// BamAux.h (c) 2009 Derek Barnett, Michael Str�mberg +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 25 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides data structures & utility methods that are used throughout the API. +// *************************************************************************** + +#ifndef BAMAUX_H +#define BAMAUX_H + +#include <cstddef> +#include <cstring> +#include <fstream> +#include <iostream> +#include <string> +#include <vector> +#include "api/api_global.h" + +/*! \file BamAux.h + + Provides data structures & utility methods that are used throughout the API. +*/ + +/*! \namespace BamTools + \brief Contains all BamTools classes & methods. + + The BamTools API contained in this namespace contains classes and methods + for reading, writing, and manipulating BAM alignment files. +*/ +namespace BamTools { + +// ---------------------------------------------------------------- +// CigarOp + +/*! \struct BamTools::CigarOp + \brief Represents a CIGAR alignment operation. + + \sa \samSpecURL for more details on using CIGAR operations. +*/ +struct API_EXPORT CigarOp +{ + + char Type; //!< CIGAR operation type (MIDNSHPX=) + uint32_t Length; //!< CIGAR operation length (number of bases) + + //! constructor + CigarOp(const char type = '\0', const uint32_t& length = 0) + : Type(type) + , Length(length) + {} +}; + +// ---------------------------------------------------------------- +// RefData + +/*! \struct BamTools::RefData + \brief Represents a reference sequence entry +*/ +struct API_EXPORT RefData +{ + + std::string RefName; //!< name of reference sequence + int32_t RefLength; //!< length of reference sequence + + //! 
constructor + RefData(const std::string& name = std::string(), const int32_t& length = 0) + : RefName(name) + , RefLength(length) + {} +}; + +//! convenience typedef for vector of RefData entries +typedef std::vector<RefData> RefVector; + +// ---------------------------------------------------------------- +// BamRegion + +/*! \struct BamTools::BamRegion + \brief Represents a sequential genomic region + + Allowed to span multiple (sequential) references. + + \warning BamRegion now represents a zero-based, HALF-OPEN interval. + In previous versions of BamTools (0.x & 1.x) all intervals were treated + as zero-based, CLOSED. +*/ +struct API_EXPORT BamRegion +{ + + int LeftRefID; //!< reference ID for region's left boundary + int LeftPosition; //!< position for region's left boundary + int RightRefID; //!< reference ID for region's right boundary + int RightPosition; //!< position for region's right boundary + + //! constructor + BamRegion(const int& leftID = -1, const int& leftPos = -1, const int& rightID = -1, + const int& rightPos = -1) + : LeftRefID(leftID) + , LeftPosition(leftPos) + , RightRefID(rightID) + , RightPosition(rightPos) + {} + + //! copy constructor + BamRegion(const BamRegion& other) + : LeftRefID(other.LeftRefID) + , LeftPosition(other.LeftPosition) + , RightRefID(other.RightRefID) + , RightPosition(other.RightPosition) + {} + + //! Clears region boundaries + void clear() + { + LeftRefID = -1; + LeftPosition = -1; + RightRefID = -1; + RightPosition = -1; + } + + //! Returns true if region has a left boundary + bool isLeftBoundSpecified() const + { + return (LeftRefID >= 0 && LeftPosition >= 0); + } + + //! Returns true if region boundaries are not defined + bool isNull() const + { + return (!isLeftBoundSpecified() && !isRightBoundSpecified()); + } + + //! 
Returns true if region has a right boundary + bool isRightBoundSpecified() const + { + return (RightRefID >= 0 && RightPosition >= 1); + } +}; + +struct CustomHeaderTag +{ + std::string TagName; + std::string TagValue; +}; + +// ---------------------------------------------------------------- +// General utility methods + +/*! \fn bool FileExists(const std::string& filename) + \brief returns true if the file exists +*/ +API_EXPORT inline bool FileExists(const std::string& filename) +{ + std::ifstream f(filename.c_str(), std::ifstream::in); + return !f.fail(); +} + +/*! \fn void SwapEndian_16(int16_t& x) + \brief swaps endianness of signed 16-bit integer, in place +*/ +API_EXPORT inline void SwapEndian_16(int16_t& x) +{ + x = ((x >> 8) | (x << 8)); +} + +/*! \fn void SwapEndian_16(uint16_t& x) + \brief swaps endianness of unsigned 16-bit integer, in place +*/ +API_EXPORT inline void SwapEndian_16(uint16_t& x) +{ + x = ((x >> 8) | (x << 8)); +} + +/*! \fn void SwapEndian_32(int32_t& x) + \brief swaps endianness of signed 32-bit integer, in place +*/ +API_EXPORT inline void SwapEndian_32(int32_t& x) +{ + x = ((x >> 24) | ((x << 8) & 0x00FF0000) | ((x >> 8) & 0x0000FF00) | (x << 24)); +} + +/*! \fn void SwapEndian_32(uint32_t& x) + \brief swaps endianness of unsigned 32-bit integer, in place +*/ +API_EXPORT inline void SwapEndian_32(uint32_t& x) +{ + x = ((x >> 24) | ((x << 8) & 0x00FF0000) | ((x >> 8) & 0x0000FF00) | (x << 24)); +} + +/*! \fn void SwapEndian_64(int64_t& x) + \brief swaps endianness of signed 64-bit integer, in place +*/ +API_EXPORT inline void SwapEndian_64(int64_t& x) +{ + x = ((x >> 56) | ((x << 40) & 0x00FF000000000000ll) | ((x << 24) & 0x0000FF0000000000ll) | + ((x << 8) & 0x000000FF00000000ll) | ((x >> 8) & 0x00000000FF000000ll) | + ((x >> 24) & 0x0000000000FF0000ll) | ((x >> 40) & 0x000000000000FF00ll) | (x << 56)); +} + +/*! 
\fn void SwapEndian_64(uint64_t& x) + \brief swaps endianness of unsigned 64-bit integer, in place +*/ +API_EXPORT inline void SwapEndian_64(uint64_t& x) +{ + x = ((x >> 56) | ((x << 40) & 0x00FF000000000000ll) | ((x << 24) & 0x0000FF0000000000ll) | + ((x << 8) & 0x000000FF00000000ll) | ((x >> 8) & 0x00000000FF000000ll) | + ((x >> 24) & 0x0000000000FF0000ll) | ((x >> 40) & 0x000000000000FF00ll) | (x << 56)); +} + +/*! \fn void SwapEndian_16p(char* data) + \brief swaps endianness of the next 2 bytes in a buffer, in place +*/ +API_EXPORT inline void SwapEndian_16p(char* data) +{ + uint16_t& value = (uint16_t&)*data; + SwapEndian_16(value); +} + +/*! \fn void SwapEndian_32p(char* data) + \brief swaps endianness of the next 4 bytes in a buffer, in place +*/ +API_EXPORT inline void SwapEndian_32p(char* data) +{ + uint32_t& value = (uint32_t&)*data; + SwapEndian_32(value); +} + +/*! \fn void SwapEndian_64p(char* data) + \brief swaps endianness of the next 8 bytes in a buffer, in place +*/ +API_EXPORT inline void SwapEndian_64p(char* data) +{ + uint64_t& value = (uint64_t&)*data; + SwapEndian_64(value); +} + +/*! \fn bool SystemIsBigEndian() + \brief checks host architecture's byte order + \return \c true if system uses big-endian ordering +*/ +API_EXPORT inline bool SystemIsBigEndian() +{ + const uint16_t one = 0x0001; + return ((*(char*)&one) == 0); +} + +/*! \fn void PackUnsignedInt(char* buffer, unsigned int value) + \brief stores unsigned integer value in a byte buffer + + \param[out] buffer destination buffer + \param[in] value value to 'pack' in buffer +*/ +API_EXPORT inline void PackUnsignedInt(char* buffer, unsigned int value) +{ + buffer[0] = (char)value; + buffer[1] = (char)(value >> 8); + buffer[2] = (char)(value >> 16); + buffer[3] = (char)(value >> 24); +} + +/*! 
\fn void PackUnsignedShort(char* buffer, unsigned short value) + \brief stores unsigned short integer value in a byte buffer + + \param[out] buffer destination buffer + \param[in] value value to 'pack' in buffer +*/ +API_EXPORT inline void PackUnsignedShort(char* buffer, unsigned short value) +{ + buffer[0] = (char)value; + buffer[1] = (char)(value >> 8); +} + +/*! \fn double UnpackDouble(const char* buffer) + \brief reads a double value from byte buffer + + \param[in] buffer source byte buffer + \return the (double) value read from the buffer +*/ +API_EXPORT inline double UnpackDouble(const char* buffer) +{ + union + { + double value; + unsigned char valueBuffer[sizeof(double)]; + } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + un.valueBuffer[4] = buffer[4]; + un.valueBuffer[5] = buffer[5]; + un.valueBuffer[6] = buffer[6]; + un.valueBuffer[7] = buffer[7]; + return un.value; +} + +/*! \fn double UnpackDouble(char* buffer) + \brief reads a double value from byte buffer + + This is an overloaded function. + + \param[in] buffer source byte buffer + \return the (double) value read from the buffer +*/ +API_EXPORT inline double UnpackDouble(char* buffer) +{ + return UnpackDouble((const char*)buffer); +} + +/*! \fn double UnpackFloat(const char* buffer) + \brief reads a float value from byte buffer + + \param[in] buffer source byte buffer + \return the (float) value read from the buffer +*/ +API_EXPORT inline float UnpackFloat(const char* buffer) +{ + union + { + float value; + unsigned char valueBuffer[sizeof(float)]; + } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +/*! \fn double UnpackFloat(char* buffer) + \brief reads a float value from byte buffer + + This is an overloaded function. 
+ + \param[in] buffer source byte buffer + \return the (float) value read from the buffer +*/ +API_EXPORT inline float UnpackFloat(char* buffer) +{ + return UnpackFloat((const char*)buffer); +} + +/*! \fn signed int UnpackSignedInt(const char* buffer) + \brief reads a signed integer value from byte buffer + + \param[in] buffer source byte buffer + \return the (signed int) value read from the buffer +*/ +API_EXPORT inline signed int UnpackSignedInt(const char* buffer) +{ + union + { + signed int value; + unsigned char valueBuffer[sizeof(signed int)]; + } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +/*! \fn signed int UnpackSignedInt(char* buffer) + \brief reads a signed integer value from byte buffer + + This is an overloaded function. + + \param[in] buffer source byte buffer + \return the (signed int) value read from the buffer +*/ +API_EXPORT inline signed int UnpackSignedInt(char* buffer) +{ + return UnpackSignedInt((const char*)buffer); +} + +/*! \fn signed short UnpackSignedShort(const char* buffer) + \brief reads a signed short integer value from byte buffer + + \param[in] buffer source byte buffer + \return the (signed short) value read from the buffer +*/ +API_EXPORT inline signed short UnpackSignedShort(const char* buffer) +{ + union + { + signed short value; + unsigned char valueBuffer[sizeof(signed short)]; + } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + return un.value; +} + +/*! \fn signed short UnpackSignedShort(char* buffer) + \brief reads a signed short integer value from byte buffer + + This is an overloaded function. + + \param[in] buffer source byte buffer + \return the (signed short) value read from the buffer +*/ +API_EXPORT inline signed short UnpackSignedShort(char* buffer) +{ + return UnpackSignedShort((const char*)buffer); +} + +/*! 
\fn unsigned int UnpackUnsignedInt(const char* buffer) + \brief reads an unsigned integer value from byte buffer + + \param[in] buffer source byte buffer + \return the (unsigned int) value read from the buffer +*/ +API_EXPORT inline unsigned int UnpackUnsignedInt(const char* buffer) +{ + union + { + unsigned int value; + unsigned char valueBuffer[sizeof(unsigned int)]; + } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +/*! \fn unsigned int UnpackUnsignedInt(char* buffer) + \brief reads an unsigned integer value from byte buffer + + This is an overloaded function. + + \param[in] buffer source byte buffer + \return the (unsigned int) value read from the buffer +*/ +API_EXPORT inline unsigned int UnpackUnsignedInt(char* buffer) +{ + return UnpackUnsignedInt((const char*)buffer); +} + +/*! \fn unsigned short UnpackUnsignedShort(const char* buffer) + \brief reads an unsigned short integer value from byte buffer + + \param[in] buffer source byte buffer + \return the (unsigned short) value read from the buffer +*/ +API_EXPORT inline unsigned short UnpackUnsignedShort(const char* buffer) +{ + union + { + unsigned short value; + unsigned char valueBuffer[sizeof(unsigned short)]; + } un; + un.value = 0; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + un.valueBuffer[0] = buffer[1]; + un.valueBuffer[1] = buffer[0]; +#else +#error "Unsupported hardware" +#endif + return un.value; +} + +/*! \fn unsigned short UnpackUnsignedShort(char* buffer) + \brief reads an unsigned short integer value from byte buffer + + This is an overloaded function. 
+ + \param[in] buffer source byte buffer + \return the (unsigned short) value read from the buffer +*/ +API_EXPORT inline unsigned short UnpackUnsignedShort(char* buffer) +{ + return UnpackUnsignedShort((const char*)buffer); +} + +// ---------------------------------------------------------------- +// 'internal' helper structs + +/*! \struct RaiiBuffer + \internal +*/ +struct RaiiBuffer +{ + + // data members + char* Buffer; + const std::size_t NumBytes; + + // ctor & dtor + RaiiBuffer(const std::size_t n) + : Buffer(new char[n]()) + , NumBytes(n) + {} + + ~RaiiBuffer() + { + delete[] Buffer; + } + + // add'l methods + void Clear() + { + memset(Buffer, 0, NumBytes); + } +}; + +} // namespace BamTools + +#endif // BAMAUX_H diff --git a/src/api/BamConstants.h b/src/api/BamConstants.h new file mode 100644 index 0000000..973c13b --- /dev/null +++ b/src/api/BamConstants.h @@ -0,0 +1,323 @@ +// *************************************************************************** +// BamConstants.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 16 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides basic constants for handling BAM files. +// *************************************************************************** + +#ifndef BAM_CONSTANTS_H +#define BAM_CONSTANTS_H + +#include <cassert> +#include <string> +#include "api/api_global.h" + +/*! \namespace BamTools::Constants + \brief Provides basic constants for handling BAM files. 
+*/ + +namespace BamTools { +namespace Constants { + +const uint8_t BAM_SIZEOF_INT = 4; + +// header magic number +const char* const BAM_HEADER_MAGIC = "BAM\1"; +const uint8_t BAM_HEADER_MAGIC_LENGTH = 4; + +// BAM alignment core size +const uint8_t BAM_CORE_SIZE = 32; +const uint8_t BAM_CORE_BUFFER_SIZE = 8; + +// BAM alignment flags +const int BAM_ALIGNMENT_PAIRED = 0x0001; +const int BAM_ALIGNMENT_PROPER_PAIR = 0x0002; +const int BAM_ALIGNMENT_UNMAPPED = 0x0004; +const int BAM_ALIGNMENT_MATE_UNMAPPED = 0x0008; +const int BAM_ALIGNMENT_REVERSE_STRAND = 0x0010; +const int BAM_ALIGNMENT_MATE_REVERSE_STRAND = 0x0020; +const int BAM_ALIGNMENT_READ_1 = 0x0040; +const int BAM_ALIGNMENT_READ_2 = 0x0080; +const int BAM_ALIGNMENT_SECONDARY = 0x0100; +const int BAM_ALIGNMENT_QC_FAILED = 0x0200; +const int BAM_ALIGNMENT_DUPLICATE = 0x0400; + +// CIGAR constants +const char* const BAM_CIGAR_LOOKUP = "MIDNSHP=X"; +const uint8_t BAM_CIGAR_MATCH = 0; +const uint8_t BAM_CIGAR_INS = 1; +const uint8_t BAM_CIGAR_DEL = 2; +const uint8_t BAM_CIGAR_REFSKIP = 3; +const uint8_t BAM_CIGAR_SOFTCLIP = 4; +const uint8_t BAM_CIGAR_HARDCLIP = 5; +const uint8_t BAM_CIGAR_PAD = 6; +const uint8_t BAM_CIGAR_SEQMATCH = 7; +const uint8_t BAM_CIGAR_MISMATCH = 8; + +const char BAM_CIGAR_MATCH_CHAR = 'M'; +const char BAM_CIGAR_INS_CHAR = 'I'; +const char BAM_CIGAR_DEL_CHAR = 'D'; +const char BAM_CIGAR_REFSKIP_CHAR = 'N'; +const char BAM_CIGAR_SOFTCLIP_CHAR = 'S'; +const char BAM_CIGAR_HARDCLIP_CHAR = 'H'; +const char BAM_CIGAR_PAD_CHAR = 'P'; +const char BAM_CIGAR_SEQMATCH_CHAR = '='; +const char BAM_CIGAR_MISMATCH_CHAR = 'X'; + +const int BAM_CIGAR_SHIFT = 4; +const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1); + +// BAM tag types & sizes +const char BAM_TAG_TYPE_ASCII = 'A'; +const char BAM_TAG_TYPE_INT8 = 'c'; +const char BAM_TAG_TYPE_UINT8 = 'C'; +const char BAM_TAG_TYPE_INT16 = 's'; +const char BAM_TAG_TYPE_UINT16 = 'S'; +const char BAM_TAG_TYPE_INT32 = 'i'; +const char BAM_TAG_TYPE_UINT32 = 
'I'; +const char BAM_TAG_TYPE_FLOAT = 'f'; +const char BAM_TAG_TYPE_STRING = 'Z'; +const char BAM_TAG_TYPE_HEX = 'H'; +const char BAM_TAG_TYPE_ARRAY = 'B'; + +const uint8_t BAM_TAG_TAGSIZE = 2; +const uint8_t BAM_TAG_TYPESIZE = 1; +const uint8_t BAM_TAG_ARRAYBASE_SIZE = 8; + +// DNA bases +const char* const BAM_DNA_LOOKUP = "=ACMGRSVTWYHKDBN"; +const uint8_t BAM_BASECODE_EQUAL = 0; +const uint8_t BAM_BASECODE_A = 1; +const uint8_t BAM_BASECODE_C = 2; +const uint8_t BAM_BASECODE_M = 3; +const uint8_t BAM_BASECODE_G = 4; +const uint8_t BAM_BASECODE_R = 5; +const uint8_t BAM_BASECODE_S = 6; +const uint8_t BAM_BASECODE_V = 7; +const uint8_t BAM_BASECODE_T = 8; +const uint8_t BAM_BASECODE_W = 9; +const uint8_t BAM_BASECODE_Y = 10; +const uint8_t BAM_BASECODE_H = 11; +const uint8_t BAM_BASECODE_K = 12; +const uint8_t BAM_BASECODE_D = 13; +const uint8_t BAM_BASECODE_B = 14; +const uint8_t BAM_BASECODE_N = 15; + +const char BAM_DNA_EQUAL = '='; +const char BAM_DNA_A = 'A'; +const char BAM_DNA_C = 'C'; +const char BAM_DNA_M = 'M'; +const char BAM_DNA_G = 'G'; +const char BAM_DNA_R = 'R'; +const char BAM_DNA_S = 'S'; +const char BAM_DNA_V = 'V'; +const char BAM_DNA_T = 'T'; +const char BAM_DNA_W = 'W'; +const char BAM_DNA_Y = 'Y'; +const char BAM_DNA_H = 'H'; +const char BAM_DNA_K = 'K'; +const char BAM_DNA_D = 'D'; +const char BAM_DNA_B = 'B'; +const char BAM_DNA_N = 'N'; +const char BAM_DNA_DEL = '-'; +const char BAM_DNA_PAD = '*'; + +// zlib & BGZF constants +const char GZIP_ID1 = 31; +const char GZIP_ID2 = static_cast<char>(139); +const char CM_DEFLATE = 8; +const char FLG_FEXTRA = 4; +const char OS_UNKNOWN = static_cast<char>(255); +const char BGZF_XLEN = 6; +const char BGZF_ID1 = 66; +const char BGZF_ID2 = 67; +const char BGZF_LEN = 2; + +const int8_t GZIP_WINDOW_BITS = -15; +const int8_t Z_DEFAULT_MEM_LEVEL = 8; +const uint8_t BGZF_BLOCK_HEADER_LENGTH = 18; +const uint8_t BGZF_BLOCK_FOOTER_LENGTH = 8; +const uint32_t BGZF_MAX_BLOCK_SIZE = 65536; +const uint32_t 
BGZF_DEFAULT_BLOCK_SIZE = 65536; + +} // namespace Constants + +//! \cond +// ------------------------- +// tag-type helper structs +// ------------------------- + +// fail on any types not specified below +template <typename T> +struct TagTypeHelper +{ + static bool CanConvertFrom(const char) + { + assert(false); + return false; + } + static bool CanConvertTo(const char) + { + assert(false); + return false; + } + static char TypeCode() + { + assert(false); + return 0; + } +}; + +template <> +struct TagTypeHelper<uint8_t> +{ + static bool CanConvertFrom(const char c) + { + return (c == Constants::BAM_TAG_TYPE_ASCII || c == Constants::BAM_TAG_TYPE_UINT8); + } + static bool CanConvertTo(const char c) + { + return (c == Constants::BAM_TAG_TYPE_ASCII || c == Constants::BAM_TAG_TYPE_UINT8 || + c == Constants::BAM_TAG_TYPE_UINT16 || c == Constants::BAM_TAG_TYPE_UINT32); + } + + static char TypeCode() + { + return Constants::BAM_TAG_TYPE_UINT8; + } +}; + +template <> +struct TagTypeHelper<int8_t> +{ + static bool CanConvertFrom(const char c) + { + return (c == Constants::BAM_TAG_TYPE_ASCII || c == Constants::BAM_TAG_TYPE_INT8); + } + static bool CanConvertTo(const char c) + { + return (c == Constants::BAM_TAG_TYPE_ASCII || c == Constants::BAM_TAG_TYPE_INT8 || + c == Constants::BAM_TAG_TYPE_INT16 || c == Constants::BAM_TAG_TYPE_INT32); + } + static char TypeCode() + { + return Constants::BAM_TAG_TYPE_INT8; + } +}; + +template <> +struct TagTypeHelper<uint16_t> +{ + static bool CanConvertFrom(const char c) + { + return (c == Constants::BAM_TAG_TYPE_ASCII || c == Constants::BAM_TAG_TYPE_UINT8 || + c == Constants::BAM_TAG_TYPE_UINT16); + } + static bool CanConvertTo(const char c) + { + return (c == Constants::BAM_TAG_TYPE_UINT16 || c == Constants::BAM_TAG_TYPE_UINT32); + } + static char TypeCode() + { + return Constants::BAM_TAG_TYPE_UINT16; + } +}; + +template <> +struct TagTypeHelper<int16_t> +{ + static bool CanConvertFrom(const char c) + { + return (c == 
Constants::BAM_TAG_TYPE_ASCII || c == Constants::BAM_TAG_TYPE_INT8 || + c == Constants::BAM_TAG_TYPE_INT16); + } + static bool CanConvertTo(const char c) + { + return (c == Constants::BAM_TAG_TYPE_INT16 || c == Constants::BAM_TAG_TYPE_INT32); + } + static char TypeCode() + { + return Constants::BAM_TAG_TYPE_INT16; + } +}; + +template <> +struct TagTypeHelper<uint32_t> +{ + static bool CanConvertFrom(const char c) + { + return (c == Constants::BAM_TAG_TYPE_ASCII || c == Constants::BAM_TAG_TYPE_UINT8 || + c == Constants::BAM_TAG_TYPE_UINT16 || c == Constants::BAM_TAG_TYPE_UINT32); + } + static bool CanConvertTo(const char c) + { + return (c == Constants::BAM_TAG_TYPE_UINT32); + } + static char TypeCode() + { + return Constants::BAM_TAG_TYPE_UINT32; + } +}; + +template <> +struct TagTypeHelper<int32_t> +{ + static bool CanConvertFrom(const char c) + { + return (c == Constants::BAM_TAG_TYPE_ASCII || c == Constants::BAM_TAG_TYPE_INT8 || + c == Constants::BAM_TAG_TYPE_INT16 || c == Constants::BAM_TAG_TYPE_INT32); + } + static bool CanConvertTo(const char c) + { + return (c == Constants::BAM_TAG_TYPE_INT32); + } + static char TypeCode() + { + return Constants::BAM_TAG_TYPE_INT32; + } +}; + +template <> +struct TagTypeHelper<float> +{ + static bool CanConvertFrom(const char c) + { + return (c == Constants::BAM_TAG_TYPE_ASCII || c == Constants::BAM_TAG_TYPE_UINT8 || + c == Constants::BAM_TAG_TYPE_INT8 || c == Constants::BAM_TAG_TYPE_UINT16 || + c == Constants::BAM_TAG_TYPE_INT16 || c == Constants::BAM_TAG_TYPE_UINT32 || + c == Constants::BAM_TAG_TYPE_INT32 || c == Constants::BAM_TAG_TYPE_FLOAT); + } + static bool CanConvertTo(const char c) + { + return (c == Constants::BAM_TAG_TYPE_FLOAT); + } + static char TypeCode() + { + return Constants::BAM_TAG_TYPE_FLOAT; + } +}; + +template <> +struct TagTypeHelper<std::string> +{ + static bool CanConvertFrom(const char c) + { + return (c == Constants::BAM_TAG_TYPE_HEX || c == Constants::BAM_TAG_TYPE_STRING); + } + static bool 
CanConvertTo(const char c) + { + return (c == Constants::BAM_TAG_TYPE_HEX || c == Constants::BAM_TAG_TYPE_STRING); + } + static char TypeCode() + { + return Constants::BAM_TAG_TYPE_STRING; + } +}; + +//! \endcond + +} // namespace BamTools + +#endif // BAM_CONSTANTS_H diff --git a/src/api/BamIndex.h b/src/api/BamIndex.h new file mode 100644 index 0000000..ccf3036 --- /dev/null +++ b/src/api/BamIndex.h @@ -0,0 +1,98 @@ +// *************************************************************************** +// BamIndex.h (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides basic BAM index interface +// *************************************************************************** + +#ifndef BAM_INDEX_H +#define BAM_INDEX_H + +#include <string> +#include "api/BamAux.h" +#include "api/api_global.h" + +namespace BamTools { + +namespace Internal { +class BamReaderPrivate; +} // namespace Internal + +/*! \class BamTools::BamIndex + \brief Provides methods for generating & loading BAM index files. + + This class straddles the line between public API and internal + implementation detail. Most client code should never have to use this + class directly. + + It is exposed to the public API to allow advanced users to implement + their own custom indexing schemes. 
*/

class API_EXPORT BamIndex
{

    // enums
public:
    // list of supported BamIndex types
    enum IndexType
    {
        BAMTOOLS = 0,
        STANDARD
    };

    // ctor & dtor
public:
    // stores the reader pointer as-is; the index does not take ownership of it
    BamIndex(Internal::BamReaderPrivate* reader)
        : m_reader(reader)
    {}
    virtual ~BamIndex() {}

    // index interface
public:
    // builds index from associated BAM file & writes out to index file
    virtual bool Create() = 0;

    // returns a human-readable description of the last error encountered
    // (the text most recently stored by SetErrorString(), formatted "where: what")
    std::string GetErrorString()
    {
        return m_errorString;
    }

    // returns whether reference has alignments or not
    virtual bool HasAlignments(const int& referenceID) const = 0;

    // attempts to use index data to jump to @region, returns success/fail
    // a "successful" jump indicates no error, but not whether this region has data
    // * thus, the method sets a flag to indicate whether there are alignments
    //   available after the jump position
    virtual bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) = 0;

    // loads existing data from file into memory
    virtual bool Load(const std::string& filename) = 0;

    // returns the 'type' enum for derived index format
    virtual BamIndex::IndexType Type() const = 0;

    //! \cond

    // internal methods
protected:
    // records the last error as "<where>: <what>"; callable from const methods
    // because m_errorString is declared mutable
    void SetErrorString(const std::string& where, const std::string& what) const
    {
        m_errorString = where + ": " + what;
    }

    // data members
protected:
    Internal::BamReaderPrivate* m_reader;  // copy, not owned
    mutable std::string m_errorString;     // last error text, see SetErrorString()

    //! \endcond
};

}  // namespace BamTools

#endif  // BAM_INDEX_H
diff --git a/src/api/BamMultiReader.cpp b/src/api/BamMultiReader.cpp new file mode 100644 index 0000000..82a98a5 --- /dev/null +++ b/src/api/BamMultiReader.cpp @@ -0,0 +1,442 @@
// ***************************************************************************
// BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
// Last modified: 14 January 2013 (DB)
// ---------------------------------------------------------------------------
// Convenience class for reading multiple BAM files.
//
// This functionality allows applications to work on very large sets of files
// without requiring intermediate merge, sort, and index steps for each file
// subset. It also improves the performance of our merge system as it
// precludes the need to sort merged files.
// ***************************************************************************

#include "api/BamMultiReader.h"
#include "api/internal/bam/BamMultiReader_p.h"
using namespace BamTools;

#include <string>
#include <vector>

/*! \class BamTools::BamMultiReader
    \brief Convenience class for reading multiple BAM files.
*/
/*! \enum BamMultiReader::MergeOrder
    \brief Used to describe the merge strategy of the BamMultiReader.

    The merge strategy determines which alignment is 'next' from across
    all opened BAM files.
*/
/*! \var BamMultiReader::MergeOrder BamMultiReader::RoundRobinMerge
    \brief Merge strategy when BAM files are unsorted, or their sorted status is either unknown or ignored
*/
/*! \var BamMultiReader::MergeOrder BamMultiReader::MergeByCoordinate
    \brief Merge strategy when BAM files are sorted by position ('coordinate')
*/
/*! \var BamMultiReader::MergeOrder BamMultiReader::MergeByName
    \brief Merge strategy when BAM files are sorted by read name ('queryname')
*/

/*!
 \fn BamMultiReader::BamMultiReader()
    \brief constructor

    Allocates the private implementation object that every other method
    forwards to.
*/
BamMultiReader::BamMultiReader()
    : d(new Internal::BamMultiReaderPrivate)
{}

/*! \fn BamMultiReader::~BamMultiReader()
    \brief destructor

    Releases the private implementation object.
*/
BamMultiReader::~BamMultiReader()
{
    delete d;
    d = 0;
}

/*! \fn bool BamMultiReader::Close()
    \brief Closes all open BAM files.

    Also clears out all header and reference data.

    \return the result reported by the private implementation's Close()
    \sa CloseFile(), IsOpen(), Open(), BamReader::Close()
*/
bool BamMultiReader::Close()
{
    return d->Close();
}

/*! \fn bool BamMultiReader::CloseFile(const std::string& filename)
    \brief Closes requested BAM file.

    Leaves any other file(s) open, along with header and reference data.

    \param[in] filename name of specific BAM file to close
    \return the result reported by the private implementation's CloseFile()

    \sa Close(), IsOpen(), Open(), BamReader::Close()
*/
bool BamMultiReader::CloseFile(const std::string& filename)
{
    return d->CloseFile(filename);
}

/*! \fn bool BamMultiReader::CreateIndexes(const BamIndex::IndexType& type)
    \brief Creates index files for the current BAM files.

    \param[in] type file format to create, see BamIndex::IndexType for available formats
    \return \c true if index files created OK
    \sa LocateIndexes(), OpenIndexes(), BamReader::CreateIndex()
*/
bool BamMultiReader::CreateIndexes(const BamIndex::IndexType& type)
{
    return d->CreateIndexes(type);
}

/*! \fn const std::vector<std::string> BamMultiReader::Filenames() const
    \brief Returns list of filenames for all open BAM files.

    Retrieved filenames will contain whatever was passed via Open().
    If you need full directory paths here, be sure to include them
    when you open the BAM files.

    \returns names of open BAM files. If no files are open, returns an empty vector.
    \sa IsOpen(), BamReader::GetFilename()
*/
const std::vector<std::string> BamMultiReader::Filenames() const
{
    return d->Filenames();
}

/*!
 \fn std::string BamMultiReader::GetErrorString() const
    \brief Returns a human-readable description of the last error that occurred

    This method allows elimination of STDERR pollution. Developers of client code
    may choose how the messages are displayed to the user, if at all.

    \return error description
*/
std::string BamMultiReader::GetErrorString() const
{
    return d->GetErrorString();
}

/*! \fn SamHeader BamMultiReader::GetHeader() const
    \brief Returns unified SAM-format header for all files

    \note Modifying the retrieved text does NOT affect the current
    BAM files. These files have been opened in a read-only mode. However,
    your modified header text can be used in conjunction with BamWriter
    to generate a new BAM file with the appropriate header information.

    \returns header data wrapped in SamHeader object
    \sa GetHeaderText(), BamReader::GetHeader()
*/
SamHeader BamMultiReader::GetHeader() const
{
    return d->GetHeader();
}

/*! \fn std::string BamMultiReader::GetHeaderText() const
    \brief Returns unified SAM-format header text for all files

    \note Modifying the retrieved text does NOT affect the current
    BAM files. These files have been opened in a read-only mode. However,
    your modified header text can be used in conjunction with BamWriter
    to generate a new BAM file with the appropriate header information.

    \returns SAM-formatted header text
    \sa GetHeader(), BamReader::GetHeaderText()
*/
std::string BamMultiReader::GetHeaderText() const
{
    return d->GetHeaderText();
}

/*! \fn BamMultiReader::MergeOrder BamMultiReader::GetMergeOrder() const
    \brief Returns current merge order strategy.

    \returns current merge order enum value
    \sa BamMultiReader::MergeOrder, SetExplicitMergeOrder()
*/
BamMultiReader::MergeOrder BamMultiReader::GetMergeOrder() const
{
    return d->GetMergeOrder();
}

/*! \fn bool BamMultiReader::GetNextAlignment(BamAlignment& alignment)
    \brief Retrieves next available alignment.
+ + Equivalent to BamReader::GetNextAlignment() with respect to what is a valid + overlapping alignment and what data gets populated. + + This method takes care of determining which alignment actually is 'next' + across multiple files, depending on their sort order. + + \param[out] alignment destination for alignment record data + \returns \c true if a valid alignment was found + \sa GetNextAlignmentCore(), SetExplicitMergeOrder(), SetRegion(), BamReader::GetNextAlignment() +*/ +bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) +{ + return d->GetNextAlignment(nextAlignment); +} + +/*! \fn bool BamMultiReader::GetNextAlignmentCore(BamAlignment& alignment) + \brief Retrieves next available alignment. + + Equivalent to BamReader::GetNextAlignmentCore() with respect to what is a valid + overlapping alignment and what data gets populated. + + This method takes care of determining which alignment actually is 'next' + across multiple files, depending on their sort order. + + \param[out] alignment destination for alignment record data + \returns \c true if a valid alignment was found + \sa GetNextAlignment(), SetExplicitMergeOrder(), SetRegion(), BamReader::GetNextAlignmentCore() +*/ +bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) +{ + return d->GetNextAlignmentCore(nextAlignment); +} + +/*! \fn int BamMultiReader::GetReferenceCount() const + \brief Returns number of reference sequences. + \sa BamReader::GetReferenceCount() +*/ +int BamMultiReader::GetReferenceCount() const +{ + return d->GetReferenceCount(); +} + +/*! \fn const RefVector& BamMultiReader::GetReferenceData() const + \brief Returns all reference sequence entries. + \sa RefData, BamReader::GetReferenceData() +*/ +const BamTools::RefVector BamMultiReader::GetReferenceData() const +{ + return d->GetReferenceData(); +} + +/*! \fn int BamMultiReader::GetReferenceID(const std::string& refName) const + \brief Returns the ID of the reference with this name. 
+ + If \a refName is not found, returns -1. + + \param[in] refName name of reference to look up + \sa BamReader::GetReferenceID() +*/ +int BamMultiReader::GetReferenceID(const std::string& refName) const +{ + return d->GetReferenceID(refName); +} + +/*! \fn bool BamMultiReader::HasIndexes() const + \brief Returns \c true if all BAM files have index data available. + \sa BamReader::HasIndex() +*/ +bool BamMultiReader::HasIndexes() const +{ + return d->HasIndexes(); +} + +/*! \fn bool BamMultiReader::HasOpenReaders() const + \brief Returns \c true if there are any open BAM files. +*/ +bool BamMultiReader::HasOpenReaders() const +{ + return d->HasOpenReaders(); +} + +/*! \fn bool BamMultiReader::Jump(int refID, int position) + \brief Performs a random-access jump within current BAM files. + + This is a convenience method, equivalent to calling SetRegion() + with only a left boundary specified. + + \param[in] refID ID of reference to jump to + \param[in] position (0-based) left boundary + + \returns \c true if jump was successful + \sa HasIndexes(), BamReader::Jump() +*/ + +bool BamMultiReader::Jump(int refID, int position) +{ + return d->Jump(refID, position); +} + +/*! \fn bool BamMultiReader::LocateIndexes(const BamIndex::IndexType& preferredType) + \brief Looks for index files that match current BAM files. + + Use this function when you need index files, and perhaps have a + preferred index format, but do not depend heavily on which indexes + actually get loaded at runtime. + + For each BAM file, this function will defer to your \a preferredType + whenever possible. However, if an index file of \a preferredType can + not be found, then it will look for any other index file that matches + that BAM file. + + An example case would look like this: + \code + BamMultiReader reader; + + // do setup... 
+ + // ensure that all files have an index + if ( !reader.LocateIndexes() ) // opens any existing index files that match our BAM files + reader.CreateIndexes(); // creates index files for any BAM files that still lack one + + // do interesting stuff using random-access... + + \endcode + + If you want precise control over which index files are loaded, use OpenIndexes() + with the desired index filenames. If that function returns false, you can use + CreateIndexes() to then build index files of the exact requested format. + + \param[in] preferredType desired index file format, see BamIndex::IndexType for available formats + \returns \c true if index files could be found for \b ALL open BAM files + \sa BamReader::LocateIndex() +*/ +bool BamMultiReader::LocateIndexes(const BamIndex::IndexType& preferredType) +{ + return d->LocateIndexes(preferredType); +} + +/*! \fn bool BamMultiReader::Open(const std::vector<std::string>& filenames) + \brief Opens BAM files. + + \note Opening BAM files will invalidate any current region set on the multireader. + All file pointers will be returned to the beginning of the alignment data. Follow + this with Jump() or SetRegion() to establish a region of interest. + + \param[in] filenames list of BAM filenames to open + \returns \c true if BAM files were opened successfully + \sa Close(), HasOpenReaders(), OpenFile(), OpenIndexes(), BamReader::Open() +*/ +bool BamMultiReader::Open(const std::vector<std::string>& filenames) +{ + return d->Open(filenames); +} + +/*! \fn bool BamMultiReader::OpenFile(const std::string& filename) + \brief Opens a single BAM file. + + Adds another BAM file to multireader "on-the-fly". + + \note Opening a BAM file will invalidate any current region set on the multireader. + All file pointers will be returned to the beginning of the alignment data. Follow + this with Jump() or SetRegion() to establish a region of interest. 
+ + \param[in] filename BAM filename to open + \returns \c true if BAM file was opened successfully + \sa Close(), HasOpenReaders(), Open(), OpenIndexes(), BamReader::Open() +*/ +bool BamMultiReader::OpenFile(const std::string& filename) +{ + return d->OpenFile(filename); +} + +/*! \fn bool BamMultiReader::OpenIndexes(const std::vector<std::string>& indexFilenames) + \brief Opens index files for current BAM files. + + \note Currently assumes that index filenames match the order (and number) of + BAM files passed to Open(). + + \param[in] indexFilenames list of BAM index file names + \returns \c true if BAM index file was opened & data loaded successfully + \sa LocateIndex(), Open(), SetIndex(), BamReader::OpenIndex() +*/ +bool BamMultiReader::OpenIndexes(const std::vector<std::string>& indexFilenames) +{ + return d->OpenIndexes(indexFilenames); +} + +/*! \fn bool BamMultiReader::Rewind() + \brief Returns the internal file pointers to the beginning of alignment records. + + Useful for performing multiple sequential passes through BAM files. + Calling this function clears any prior region that may have been set. + + \returns \c true if rewind operation was successful + \sa Jump(), SetRegion(), BamReader::Rewind() +*/ +bool BamMultiReader::Rewind() +{ + return d->Rewind(); +} + +/*! \fn bool BamMultiReader::SetExplicitMergeOrder(BamMultiReader::MergeOrder order) + \brief Sets an explicit merge order, regardless of the BAM files' SO header tag. + + The default behavior of the BamMultiReader is to check the SO tag in the BAM files' + SAM header text to determine the merge strategy. The merge strategy is used to + determine from which BAM file the next alignment should come when either + GetNextAlignment() or GetNextAlignmentCore() are called. If files share a + 'coordinate' or 'queryname' value for this tag, then the merge strategy is + selected accordingly.
 If any of them do not match, or if any file is marked as + 'unsorted', then the merge strategy is simply a round-robin. + + This method allows client code to explicitly override the lookup behavior. This + method can be useful when you know, for example, that your BAM files are sorted + by coordinate but upstream processes did not set the header tag properly. + + \note This method should \b not be called while reading alignments via + GetNextAlignment() or GetNextAlignmentCore(). For proper results, you should + call this method before (or immediately after) opening files, rewinding, + jumping, etc. but \b not once alignment fetching has started. There is + nothing in the API to prevent you from doing so, but the results may be + unexpected. + + \returns \c true if merge order could be successfully applied + \sa BamMultiReader::MergeOrder, GetMergeOrder(), GetNextAlignment(), GetNextAlignmentCore() +*/ +bool BamMultiReader::SetExplicitMergeOrder(BamMultiReader::MergeOrder order) +{ + return d->SetExplicitMergeOrder(order); +} + +/*! \fn bool BamMultiReader::SetRegion(const BamRegion& region) + \brief Sets a target region of interest + + Equivalent to calling BamReader::SetRegion() on all open BAM files. + + \warning BamRegion now represents a zero-based, HALF-OPEN interval. + In previous versions of BamTools (0.x & 1.x) all intervals were treated + as zero-based, CLOSED. + + \param[in] region desired region-of-interest to activate + \returns \c true if ALL readers set the region successfully + \sa HasIndexes(), Jump(), BamReader::SetRegion() +*/ +bool BamMultiReader::SetRegion(const BamRegion& region) +{ + return d->SetRegion(region); +} + +/*! \fn bool BamMultiReader::SetRegion(const int& leftRefID, + const int& leftPosition, + const int& rightRefID, + const int& rightPosition) + \brief Sets a target region of interest + + This is an overloaded function. Equivalent to calling BamReader::SetRegion() on all open BAM files. 
+ + \warning This function now expects a zero-based, HALF-OPEN interval. + In previous versions of BamTools (0.x & 1.x) all intervals were treated + as zero-based, CLOSED. + + \param[in] leftRefID referenceID of region's left boundary + \param[in] leftPosition position of region's left boundary + \param[in] rightRefID reference ID of region's right boundary + \param[in] rightPosition position of region's right boundary + + \returns \c true if ALL readers set the region successfully + \sa HasIndexes(), Jump(), BamReader::SetRegion() +*/ +bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, + const int& rightPosition) +{ + return d->SetRegion(BamRegion(leftRefID, leftPosition, rightRefID, rightPosition)); +} diff --git a/src/api/BamMultiReader.h b/src/api/BamMultiReader.h new file mode 100644 index 0000000..dca1b1d --- /dev/null +++ b/src/api/BamMultiReader.h @@ -0,0 +1,127 @@ +// *************************************************************************** +// BamMultiReader.h (c) 2010 Erik Garrison, Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 14 January 2013 (DB) +// --------------------------------------------------------------------------- +// Convenience class for reading multiple BAM files. 
+// *************************************************************************** + +#ifndef BAMMULTIREADER_H +#define BAMMULTIREADER_H + +#include <map> +#include <sstream> +#include <string> +#include <utility> +#include "api/BamReader.h" +#include "api/api_global.h" + +namespace BamTools { + +namespace Internal { +class BamMultiReaderPrivate; +} // namespace Internal + +class API_EXPORT BamMultiReader +{ + + // enums +public: + // possible merge order strategies + enum MergeOrder + { + RoundRobinMerge = 0, + MergeByCoordinate, + MergeByName + }; + + // constructor / destructor +public: + BamMultiReader(); + ~BamMultiReader(); + + // public interface +public: + // ---------------------- + // BAM file operations + // ---------------------- + + // closes all open BAM files + bool Close(); + // close only the requested BAM file + bool CloseFile(const std::string& filename); + // returns list of filenames for all open BAM files + const std::vector<std::string> Filenames() const; + // returns current merge order strategy + BamMultiReader::MergeOrder GetMergeOrder() const; + // returns true if multireader has any open BAM files + bool HasOpenReaders() const; + // performs random-access jump within current BAM files + bool Jump(int refID, int position = 0); + // opens BAM files + bool Open(const std::vector<std::string>& filenames); + // opens a single BAM file, adding to any other current BAM files + bool OpenFile(const std::string& filename); + // returns file pointers to beginning of alignments + bool Rewind(); + // sets an explicit merge order, regardless of the BAM files' SO header tag + bool SetExplicitMergeOrder(BamMultiReader::MergeOrder order); + // sets the target region of interest + bool SetRegion(const BamRegion& region); + // sets the target region of interest + bool SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, + const int& rightPosition); + + // ---------------------- + // access alignment data + // ---------------------- + 
+ // retrieves next available alignment + bool GetNextAlignment(BamAlignment& alignment); + // retrieves next available alignment (without populating the alignment's string data fields) + bool GetNextAlignmentCore(BamAlignment& alignment); + + // ---------------------- + // access auxiliary data + // ---------------------- + + // returns unified SAM header for all files + SamHeader GetHeader() const; + // returns unified SAM header text for all files + std::string GetHeaderText() const; + // returns number of reference sequences + int GetReferenceCount() const; + // returns all reference sequence entries. + const BamTools::RefVector GetReferenceData() const; + // returns the ID of the reference with this name. + int GetReferenceID(const std::string& refName) const; + + // ---------------------- + // BAM index operations + // ---------------------- + + // creates index files for current BAM files + bool CreateIndexes(const BamIndex::IndexType& type = BamIndex::STANDARD); + // returns true if all BAM files have index data available + bool HasIndexes() const; + // looks for index files that match current BAM files + bool LocateIndexes(const BamIndex::IndexType& preferredType = BamIndex::STANDARD); + // opens index files for current BAM files. 
+ bool OpenIndexes(const std::vector<std::string>& indexFilenames); + + // ---------------------- + // error handling + // ---------------------- + + // returns a human-readable description of the last error that occurred + std::string GetErrorString() const; + + // private implementation +private: + Internal::BamMultiReaderPrivate* d; +}; + +} // namespace BamTools + +#endif // BAMMULTIREADER_H diff --git a/src/api/BamReader.cpp b/src/api/BamReader.cpp new file mode 100644 index 0000000..56e6c39 --- /dev/null +++ b/src/api/BamReader.cpp @@ -0,0 +1,402 @@ +// *************************************************************************** +// BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 29 July 2013 (DB) +// --------------------------------------------------------------------------- +// Provides read access to BAM files. +// *************************************************************************** + +#include "api/BamReader.h" +#include "api/internal/bam/BamReader_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <algorithm> +#include <iostream> +#include <iterator> +#include <string> +#include <vector> + +/*! \class BamTools::BamReader + \brief Provides read access to BAM files. +*/ + +/*! \fn BamReader::BamReader() + \brief constructor +*/ +BamReader::BamReader() + : d(new BamReaderPrivate(this)) +{} + +/*! \fn BamReader::~BamReader() + \brief destructor +*/ +BamReader::~BamReader() +{ + delete d; + d = 0; +} + +/*! \fn bool BamReader::Close() + \brief Closes the current BAM file. + + Also clears out all header and reference data. + + \return \c true if file closed OK + \sa IsOpen(), Open() +*/ +bool BamReader::Close() +{ + return d->Close(); +} + +/*! \fn bool BamReader::CreateIndex(const BamIndex::IndexType& type) + \brief Creates an index file for current BAM file. 
+ + \param[in] type file format to create, see BamIndex::IndexType for available formats + \return \c true if index created OK + \sa LocateIndex(), OpenIndex() +*/ +bool BamReader::CreateIndex(const BamIndex::IndexType& type) +{ + return d->CreateIndex(type); +} + +/*! \fn const SamHeader& BamReader::GetConstSamHeader() const + \brief Returns const reference to SAM header data. + + Allows for read-only queries of SAM header data. + + If you do not need to modify the SAM header, use this method to avoid the + potentially expensive copy used by GetHeader(). + + \note + \returns const reference to header data object + \sa GetHeader(), GetHeaderText() +*/ +const SamHeader& BamReader::GetConstSamHeader() const +{ + return d->GetConstSamHeader(); +} + +/*! \fn std::string BamReader::GetErrorString() const + \brief Returns a human-readable description of the last error that occurred + + This method allows elimination of STDERR pollution. Developers of client code + may choose how the messages are displayed to the user, if at all. + + \return error description +*/ +std::string BamReader::GetErrorString() const +{ + return d->GetErrorString(); +} + +/*! \fn const std::string BamReader::GetFilename() const + \brief Returns name of current BAM file. + + Retrieved filename will contain whatever was passed via Open(). + If you need full directory paths here, be sure to include them + when you open the BAM file. + + \returns name of open BAM file. If no file is open, returns an empty string. + \sa IsOpen() +*/ +const std::string BamReader::GetFilename() const +{ + return d->Filename(); +} + +/*! \fn SamHeader BamReader::GetHeader() const + \brief Returns SAM header data. + + Header data is wrapped in a SamHeader object that can be conveniently queried and/or modified. + If you only need read access, consider using GetConstSamHeader() instead. + + \note Modifying the retrieved SamHeader object does NOT affect the + current BAM file. This file has been opened in a read-only mode. 
+ However, your modified SamHeader object can be used in conjunction with + BamWriter to generate a new BAM file with the appropriate header information. + + \returns header data object + \sa GetConstSamHeader(), GetHeaderText() +*/ +SamHeader BamReader::GetHeader() const +{ + return d->GetSamHeader(); +} + +/*! \fn std::string BamReader::GetHeaderText() const + \brief Returns SAM header data, as SAM-formatted text. + + \note Modifying the retrieved text does NOT affect the current + BAM file. This file has been opened in a read-only mode. However, + your modified header text can be used in conjunction with BamWriter + to generate a new BAM file with the appropriate header information. + + \returns SAM-formatted header text + \sa GetHeader() +*/ +std::string BamReader::GetHeaderText() const +{ + return d->GetHeaderText(); +} + +/*! \fn bool BamReader::GetNextAlignment(BamAlignment& alignment) + \brief Retrieves next available alignment. + + Attempts to read the next alignment record from BAM file, and checks to see + if it overlaps the current region. If no region is currently set, then the + next alignment available is always considered valid. + + If a region has been set, via Jump() or SetRegion(), an alignment is only + considered valid if it overlaps the region. If the actual 'next' alignment record + in the BAM file does not overlap this region, then this function will read sequentially + through the file until the next alignment that overlaps this region is found. + Once the region has been exhausted (i.e. the next alignment loaded is beyond the region), + the function aborts and returns \c false. In this case, there is no point to continue + reading, assuming properly sorted alignments. + + This function fully populates all of the alignment's available data fields, + including the string data fields (read name, bases, qualities, tags, filename). + If only positional data (refID, position, CIGAR ops, alignment flags, etc.) 
+ are required, consider using GetNextAlignmentCore() for a significant + performance boost. + + \param[out] alignment destination for alignment record data + \returns \c true if a valid alignment was found +*/ +bool BamReader::GetNextAlignment(BamAlignment& alignment) +{ + return d->GetNextAlignment(alignment); +} + +/*! \fn bool BamReader::GetNextAlignmentCore(BamAlignment& alignment) + \brief Retrieves next available alignment, without populating the alignment's string data fields. + + Equivalent to GetNextAlignment() with respect to what is a valid overlapping alignment. + + However, this method does NOT populate the alignment's string data fields + (read name, bases, qualities, tags, filename). This provides a boost in speed + when these fields are not required for every alignment. These fields, excluding filename, + can be populated 'lazily' (as needed) by calling BamAlignment::BuildCharData() later. + + \param[out] alignment destination for alignment record data + \returns \c true if a valid alignment was found + \sa SetRegion() +*/ +bool BamReader::GetNextAlignmentCore(BamAlignment& alignment) +{ + return d->GetNextAlignmentCore(alignment); +} + +/*! \fn int BamReader::GetReferenceCount() const + \brief Returns number of reference sequences. +*/ +int BamReader::GetReferenceCount() const +{ + return d->GetReferenceCount(); +} + +/*! \fn const RefVector& BamReader::GetReferenceData() const + \brief Returns all reference sequence entries. + \sa RefData +*/ +const RefVector& BamReader::GetReferenceData() const +{ + return d->GetReferenceData(); +} + +/*! \fn int BamReader::GetReferenceID(const std::string& refName) const + \brief Returns the ID of the reference with this name. + + If \a refName is not found, returns -1. + + \param[in] refName name of reference to look up +*/ +int BamReader::GetReferenceID(const std::string& refName) const +{ + return d->GetReferenceID(refName); +} + +/*! 
\fn bool BamReader::HasIndex() const + \brief Returns \c true if index data is available. +*/ +bool BamReader::HasIndex() const +{ + return d->HasIndex(); +} + +/*! \fn bool BamReader::IsOpen() const + \brief Returns \c true if a BAM file is open for reading. +*/ +bool BamReader::IsOpen() const +{ + return d->IsOpen(); +} + +/*! \fn bool BamReader::Jump(int refID, int position) + \brief Performs a random-access jump within BAM file. + + This is a convenience method, equivalent to calling SetRegion() + with only a left boundary specified. + + \param[in] refID left-bound reference ID + \param[in] position left-bound position + + \returns \c true if jump was successful + \sa HasIndex() +*/ +bool BamReader::Jump(int refID, int position) +{ + return d->SetRegion(BamRegion(refID, position)); +} + +/*! \fn bool BamReader::LocateIndex(const BamIndex::IndexType& preferredType) + \brief Looks in BAM file's directory for a matching index file. + + Use this function when you need an index file, and perhaps have a + preferred index format, but do not depend heavily on which format + actually gets loaded at runtime. + + This function will defer to your \a preferredType whenever possible. + However, if an index file of \a preferredType can not be found, then + it will look for any other index file that corresponds to this BAM file. + + If you want precise control over which index file is loaded, use OpenIndex() + with the desired index filename. If that function returns false, you can use + CreateIndex() to then build an index of the exact requested format. + + \param[in] preferredType desired index file format, see BamIndex::IndexType for available formats + + \returns \c true if (any) index file could be found +*/ +bool BamReader::LocateIndex(const BamIndex::IndexType& preferredType) +{ + return d->LocateIndex(preferredType); +} + +/*! \fn bool BamReader::Open(const std::string& filename) + \brief Opens a BAM file. 
+ + If BamReader is already opened on another file, this function closes + that file, then attempts to open requested \a filename. + + \param[in] filename name of BAM file to open + + \returns \c true if BAM file was opened successfully + \sa Close(), IsOpen(), OpenIndex() +*/ +bool BamReader::Open(const std::string& filename) +{ + return d->Open(filename); +} + +/*! \fn bool BamReader::OpenIndex(const std::string& indexFilename) + \brief Opens a BAM index file. + + \param[in] indexFilename name of BAM index file to open + + \returns \c true if BAM index file was opened & data loaded successfully + \sa LocateIndex(), Open(), SetIndex() +*/ +bool BamReader::OpenIndex(const std::string& indexFilename) +{ + return d->OpenIndex(indexFilename); +} + +/*! \fn bool BamReader::Rewind() + \brief Returns the internal file pointer to the first alignment record. + + Useful for performing multiple sequential passes through a BAM file. + Calling this function clears any prior region that may have been set. + + \note This function sets the file pointer to first alignment record + in the BAM file, NOT the beginning of the file. + + \returns \c true if rewind operation was successful + \sa Jump(), SetRegion() +*/ +bool BamReader::Rewind() +{ + return d->Rewind(); +} + +/*! \fn void BamReader::SetIndex(BamIndex* index) + \brief Sets a custom BamIndex on this reader. + + Only necessary for custom BamIndex subclasses. Most clients should + never have to use this function. + + Example: + \code + BamReader reader; + reader.SetIndex(new MyCustomBamIndex); + \endcode + + \note BamReader takes ownership of \a index - i.e. the BamReader will + take care of deleting it when the reader is destructed, when the current + BAM file is closed, or when a new index is requested. + + \param[in] index custom BamIndex subclass created by client + \sa CreateIndex(), LocateIndex(), OpenIndex() +*/ +void BamReader::SetIndex(BamIndex* index) +{ + d->SetIndex(index); +} + +/*! 
\fn bool BamReader::SetRegion(const BamRegion& region) + \brief Sets a target region of interest + + Requires that index data be available. Attempts a random-access + jump in the BAM file, near \a region left boundary position. + + Subsequent calls to GetNextAlignment() or GetNextAlignmentCore() + will only return \c true when alignments can be found that overlap + this \a region. + + A \a region with no right boundary is considered open-ended, meaning + that all alignments that lie downstream of the left boundary are + considered valid, continuing to the end of the BAM file. + + \warning BamRegion now represents a zero-based, HALF-OPEN interval. + In previous versions of BamTools (0.x & 1.x) all intervals were treated + as zero-based, CLOSED. + + \param[in] region desired region-of-interest to activate + + \returns \c true if reader was able to jump successfully to the region's left boundary + \sa HasIndex(), Jump() +*/ +bool BamReader::SetRegion(const BamRegion& region) +{ + return d->SetRegion(region); +} + +/*! \fn bool BamReader::SetRegion(const int& leftRefID, + const int& leftPosition, + const int& rightRefID, + const int& rightPosition) + \brief Sets a target region of interest. + + This is an overloaded function. + + \warning This function expects a zero-based, HALF-OPEN interval. + In previous versions of BamTools (0.x & 1.x) all intervals were treated + as zero-based, CLOSED. 
+ + \param[in] leftRefID referenceID of region's left boundary + \param[in] leftPosition position of region's left boundary + \param[in] rightRefID reference ID of region's right boundary + \param[in] rightPosition position of region's right boundary + + \returns \c true if reader was able to jump successfully to the region's left boundary + \sa HasIndex(), Jump() +*/ +bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, + const int& rightBound) +{ + return d->SetRegion(BamRegion(leftRefID, leftBound, rightRefID, rightBound)); +} diff --git a/src/api/BamReader.h b/src/api/BamReader.h new file mode 100644 index 0000000..1991a67 --- /dev/null +++ b/src/api/BamReader.h @@ -0,0 +1,117 @@ +// *************************************************************************** +// BamReader.h (c) 2009 Derek Barnett, Michael Strömberg +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 18 November 2012 (DB) +// --------------------------------------------------------------------------- +// Provides read access to BAM files. 
+// *************************************************************************** + +#ifndef BAMREADER_H +#define BAMREADER_H + +#include <string> +#include "api/BamAlignment.h" +#include "api/BamIndex.h" +#include "api/SamHeader.h" +#include "api/api_global.h" + +namespace BamTools { + +namespace Internal { +class BamReaderPrivate; +} // namespace Internal + +class API_EXPORT BamReader +{ + + // constructor / destructor +public: + BamReader(); + ~BamReader(); + + // public interface +public: + // ---------------------- + // BAM file operations + // ---------------------- + + // closes the current BAM file + bool Close(); + // returns filename of current BAM file + const std::string GetFilename() const; + // returns true if a BAM file is open for reading + bool IsOpen() const; + // performs random-access jump within BAM file + bool Jump(int refID, int position = 0); + // opens a BAM file + bool Open(const std::string& filename); + // returns internal file pointer to beginning of alignment data + bool Rewind(); + // sets the target region of interest + bool SetRegion(const BamRegion& region); + // sets the target region of interest + bool SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, + const int& rightPosition); + + // ---------------------- + // access alignment data + // ---------------------- + + // retrieves next available alignment + bool GetNextAlignment(BamAlignment& alignment); + // retrieves next available alignment (without populating the alignment's string data fields) + bool GetNextAlignmentCore(BamAlignment& alignment); + + // ---------------------- + // access header data + // ---------------------- + + // returns a read-only reference to SAM header data + const SamHeader& GetConstSamHeader() const; + // returns an editable copy of SAM header data + SamHeader GetHeader() const; + // returns SAM header data, as SAM-formatted text + std::string GetHeaderText() const; + + // ---------------------- + // access reference data 
+ // ---------------------- + + // returns the number of reference sequences + int GetReferenceCount() const; + // returns all reference sequence entries + const RefVector& GetReferenceData() const; + // returns the ID of the reference with this name + int GetReferenceID(const std::string& refName) const; + + // ---------------------- + // BAM index operations + // ---------------------- + + // creates an index file for current BAM file, using the requested index type + bool CreateIndex(const BamIndex::IndexType& type = BamIndex::STANDARD); + // returns true if index data is available + bool HasIndex() const; + // looks in BAM file's directory for a matching index file + bool LocateIndex(const BamIndex::IndexType& preferredType = BamIndex::STANDARD); + // opens a BAM index file + bool OpenIndex(const std::string& indexFilename); + // sets a custom BamIndex on this reader + void SetIndex(BamIndex* index); + + // ---------------------- + // error handling + // ---------------------- + + // returns a human-readable description of the last error that occurred + std::string GetErrorString() const; + + // private implementation +private: + Internal::BamReaderPrivate* d; +}; + +} // namespace BamTools + +#endif // BAMREADER_H diff --git a/src/api/BamWriter.cpp b/src/api/BamWriter.cpp new file mode 100644 index 0000000..6f349ff --- /dev/null +++ b/src/api/BamWriter.cpp @@ -0,0 +1,155 @@ +// *************************************************************************** +// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 25 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#include "api/BamWriter.h" +#include "api/BamAlignment.h" 
+#include "api/SamHeader.h" +#include "api/internal/bam/BamWriter_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +/*! \class BamTools::BamWriter + \brief Provides write access for generating BAM files. +*/ +/*! \enum BamTools::BamWriter::CompressionMode + \brief This enum describes the compression behaviors for output BAM files. +*/ +/*! \var BamWriter::CompressionMode BamWriter::Compressed + \brief Use normal BAM compression +*/ +/*! \var BamWriter::CompressionMode BamWriter::Uncompressed + \brief Disable BAM compression + + Useful in situations where the BAM data is streamed (e.g. piping). + It would be wasteful to compress, and then immediately decompress + the data. +*/ + +/*! \fn BamWriter::BamWriter() + \brief constructor +*/ +BamWriter::BamWriter() + : d(new BamWriterPrivate) +{} + +/*! \fn BamWriter::~BamWriter() + \brief destructor +*/ +BamWriter::~BamWriter() +{ + delete d; + d = 0; +} + +/*! \fn BamWriter::Close() + \brief Closes the current BAM file. + \sa Open() +*/ +void BamWriter::Close() +{ + d->Close(); +} + +/*! \fn std::string BamWriter::GetErrorString() const + \brief Returns a human-readable description of the last error that occurred + + This method allows elimination of STDERR pollution. Developers of client code + may choose how the messages are displayed to the user, if at all. + + \return error description +*/ +std::string BamWriter::GetErrorString() const +{ + return d->GetErrorString(); +} + +/*! \fn bool BamWriter::IsOpen() const + \brief Returns \c true if BAM file is open for writing. + \sa Open() +*/ +bool BamWriter::IsOpen() const +{ + return d->IsOpen(); +} + +/*! \fn bool BamWriter::Open(const std::string& filename, + const std::string& samHeaderText, + const RefVector& referenceSequences) + \brief Opens a BAM file for writing. + + Will overwrite the BAM file if it already exists. 
+ + \param[in] filename name of output BAM file + \param[in] samHeaderText header data, as SAM-formatted string + \param[in] referenceSequences list of reference entries + + \return \c true if opened successfully + \sa Close(), IsOpen(), BamReader::GetHeaderText(), BamReader::GetReferenceData() +*/ +bool BamWriter::Open(const std::string& filename, const std::string& samHeaderText, + const RefVector& referenceSequences) +{ + return d->Open(filename, samHeaderText, referenceSequences); +} + +/*! \fn bool BamWriter::Open(const std::string& filename, + const SamHeader& samHeader, + const RefVector& referenceSequences) + \brief Opens a BAM file for writing. + + This is an overloaded function. + + Will overwrite the BAM file if it already exists. + + \param[in] filename name of output BAM file + \param[in] samHeader header data, wrapped in SamHeader object + \param[in] referenceSequences list of reference entries + + \return \c true if opened successfully + \sa Close(), IsOpen(), BamReader::GetHeader(), BamReader::GetReferenceData() +*/ +bool BamWriter::Open(const std::string& filename, const SamHeader& samHeader, + const RefVector& referenceSequences) +{ + return d->Open(filename, samHeader.ToString(), referenceSequences); +} + +/*! \fn bool BamWriter::SaveAlignment(const BamAlignment& alignment) + \brief Saves an alignment to the BAM file. + + \param[in] alignment BamAlignment record to save + \return the value returned by the private implementation's SaveAlignment() (presumably \c true on success - confirm in BamWriter_p) + \sa BamReader::GetNextAlignment(), BamReader::GetNextAlignmentCore() +*/ +bool BamWriter::SaveAlignment(const BamAlignment& alignment) +{ + return d->SaveAlignment(alignment); +} + +/*! \fn void BamWriter::SetCompressionMode(const BamWriter::CompressionMode& compressionMode) + \brief Sets the output compression mode. + + Default mode is BamWriter::Compressed. + + \note Changing the compression mode is disabled on open files (i.e. the request will + be ignored). Be sure to call this function before opening the BAM file. 
+ + \code + BamWriter writer; + writer.SetCompressionMode(BamWriter::Uncompressed); + writer.Open( ... ); + // ... + \endcode + + \param[in] compressionMode desired output compression behavior + \sa IsOpen(), Open() +*/ +void BamWriter::SetCompressionMode(const BamWriter::CompressionMode& compressionMode) +{ + d->SetWriteCompressed(compressionMode == BamWriter::Compressed); +} diff --git a/src/api/BamWriter.h b/src/api/BamWriter.h new file mode 100644 index 0000000..b4c01b5 --- /dev/null +++ b/src/api/BamWriter.h @@ -0,0 +1,70 @@ +// *************************************************************************** +// BamWriter.h (c) 2009 Michael Str�mberg, Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#ifndef BAMWRITER_H +#define BAMWRITER_H + +#include <string> +#include "api/BamAux.h" +#include "api/api_global.h" + +namespace BamTools { + +class BamAlignment; +struct SamHeader; + +//! \cond +namespace Internal { +class BamWriterPrivate; +} // namespace Internal +//! 
\endcond + +class API_EXPORT BamWriter +{ + + // enums +public: + enum CompressionMode + { + Compressed = 0, + Uncompressed + }; + + // ctor & dtor +public: + BamWriter(); + ~BamWriter(); + + // public interface +public: + // closes the current BAM file + void Close(); + // returns a human-readable description of the last error that occurred + std::string GetErrorString() const; + // returns true if BAM file is open for writing + bool IsOpen() const; + // opens a BAM file for writing + bool Open(const std::string& filename, const std::string& samHeaderText, + const RefVector& referenceSequences); + // opens a BAM file for writing + bool Open(const std::string& filename, const SamHeader& samHeader, + const RefVector& referenceSequences); + // saves the alignment to the alignment archive + bool SaveAlignment(const BamAlignment& alignment); + // sets the output compression mode + void SetCompressionMode(const BamWriter::CompressionMode& compressionMode); + + // private implementation (allocated in the constructor, deleted in ~BamWriter) + // NOTE(review): no copy constructor or copy assignment is declared here, so a + // copied BamWriter would share d and delete it twice - treat this class as non-copyable +private: + Internal::BamWriterPrivate* d; +}; + +} // namespace BamTools + +#endif // BAMWRITER_H diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt new file mode 100644 index 0000000..1c2ab9a --- /dev/null +++ b/src/api/CMakeLists.txt @@ -0,0 +1,77 @@ +# ========================== +# BamTools CMakeLists.txt +# (c) 2010 Derek Barnett +# +# src/api/ +# ========================== + +# list include paths +include_directories( ${BamTools_SOURCE_DIR}/src ) + +# add compiler definitions +add_definitions( -DBAMTOOLS_API_LIBRARY ) # (for proper exporting of library symbols) + +# fetch all internal source files +add_subdirectory( internal ) + +# make list of all API source files +set( BamToolsAPISources + BamAlignment.cpp + BamMultiReader.cpp + BamReader.cpp + BamWriter.cpp + SamHeader.cpp + SamProgram.cpp + SamProgramChain.cpp + SamReadGroup.cpp + SamReadGroupDictionary.cpp + SamSequence.cpp + SamSequenceDictionary.cpp + ${InternalSources} +) + +# link libraries automatically with zlib 
(and Winsock2, if applicable) +find_package( ZLIB REQUIRED ) + +if( WIN32 ) + set( WIN32_LIBRARIES wsock32 ws2_32 ) +endif() + +# create main BamTools API library +add_library( BamTools ${BamToolsAPISources} ) +# The SONAME is bumped on every version increment +# as Bamtools does not yet guarantee a stable ABI +set_target_properties( BamTools PROPERTIES + SOVERSION "${BamTools_VERSION}" + OUTPUT_NAME "bamtools" ) +target_include_directories( BamTools PRIVATE "${ZLIB_INCLUDE_DIRS}" ) +target_link_libraries( BamTools PRIVATE "${ZLIB_LIBRARIES}" "${WIN32_LIBRARIES}" ) +install( TARGETS BamTools + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" ) + +# export API headers +include(../ExportHeader.cmake) +set(ApiIncludeDir "api") +ExportHeader(APIHeaders api_global.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamAlgorithms.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamAlignment.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamAux.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamConstants.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamIndex.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamMultiReader.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamReader.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamWriter.h ${ApiIncludeDir}) +ExportHeader(APIHeaders IBamIODevice.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamConstants.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamHeader.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamProgram.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamProgramChain.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamReadGroup.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamReadGroupDictionary.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamSequence.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamSequenceDictionary.h ${ApiIncludeDir}) + +set( AlgorithmsIncludeDir "api/algorithms" ) +ExportHeader( AlgorithmsHeaders algorithms/Sort.h ${AlgorithmsIncludeDir} ) 
diff --git a/src/api/IBamIODevice.h b/src/api/IBamIODevice.h new file mode 100644 index 0000000..6de8968 --- /dev/null +++ b/src/api/IBamIODevice.h @@ -0,0 +1,100 @@ +// *************************************************************************** +// IBamIODevice.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Base class for all BAM I/O devices (e.g. local file, pipe, HTTP, FTP, etc.) +// +// Derived classes should provide protocol-specific implementations for +// reading/writing plain bytes, as well as other I/O-related behaviors. +// +// Since IBamIODevices may be defined in client code, the internal +// BamExceptions are NOT allowed to be thrown from devices, including the +// built-in ones. This keeps a consistent interface at the BgzfStream for +// handling any device type. Use the error string for relaying error messages. 
+// *************************************************************************** + +#ifndef IBAMIODEVICE_H +#define IBAMIODEVICE_H + +#include <cstdio> +#include <string> +#include "api/api_global.h" + +namespace BamTools { + +class API_EXPORT IBamIODevice +{ + + // enums +public: + enum OpenMode + { + NotOpen = 0x0000, + ReadOnly = 0x0001, + WriteOnly = 0x0002, + ReadWrite = ReadOnly | WriteOnly + }; + + // ctor & dtor +public: + virtual ~IBamIODevice() {} + + // IBamIODevice interface +public: + // TODO: add seek(pos, *from*) + + // pure virtuals + virtual void Close() = 0; + virtual bool IsRandomAccess() const = 0; + virtual bool Open(const OpenMode mode) = 0; + virtual int64_t Read(char* data, const unsigned int numBytes) = 0; + virtual bool Seek(const int64_t& position, const int origin = SEEK_SET) = 0; + virtual int64_t Tell() const = 0; + virtual int64_t Write(const char* data, const unsigned int numBytes) = 0; + + // default implementation provided + virtual std::string GetErrorString(); + virtual bool IsOpen() const; + virtual OpenMode Mode() const; + + // internal methods +protected: + IBamIODevice(); // hidden ctor + void SetErrorString(const std::string& where, const std::string& what); + + // data members +protected: + OpenMode m_mode; + std::string m_errorString; +}; + +inline IBamIODevice::IBamIODevice() + : m_mode(IBamIODevice::NotOpen) +{} + +inline std::string IBamIODevice::GetErrorString() +{ + return m_errorString; +} + +inline bool IBamIODevice::IsOpen() const +{ + return (m_mode != IBamIODevice::NotOpen); +} + +inline IBamIODevice::OpenMode IBamIODevice::Mode() const +{ + return m_mode; +} + +inline void IBamIODevice::SetErrorString(const std::string& where, const std::string& what) +{ + static const std::string SEPARATOR = ": "; + m_errorString = where + SEPARATOR + what; +} + +} // namespace BamTools + +#endif // IBAMIODEVICE_H diff --git a/src/api/SamConstants.h b/src/api/SamConstants.h new file mode 100644 index 0000000..6a1a275 --- 
/dev/null +++ b/src/api/SamConstants.h @@ -0,0 +1,97 @@ +// *************************************************************************** +// SamConstants.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 27 March 2012 (DB) +// --------------------------------------------------------------------------- +// Provides constants for SAM header +// *************************************************************************** + +#ifndef SAM_CONSTANTS_H +#define SAM_CONSTANTS_H + +#include <string> +#include "api/api_global.h" + +namespace BamTools { +namespace Constants { + +// basic char constants used in SAM format +const char SAM_COLON = ':'; +const char SAM_EQUAL = '='; +const char SAM_PERIOD = '.'; +const char SAM_STAR = '*'; +const char SAM_TAB = '\t'; +const std::string SAM_DIGITS = "0123456789"; + +const std::string SAM_CURRENT_VERSION = "1.4"; + +// HD entries +const std::string SAM_HD_BEGIN_TOKEN = "@HD"; +const std::string SAM_HD_VERSION_TAG = "VN"; +const std::string SAM_HD_SORTORDER_TAG = "SO"; +const std::string SAM_HD_GROUPORDER_TAG = "GO"; + +// SQ entries +const std::string SAM_SQ_BEGIN_TOKEN = "@SQ"; +const std::string SAM_SQ_ASSEMBLYID_TAG = "AS"; +const std::string SAM_SQ_CHECKSUM_TAG = "M5"; +const std::string SAM_SQ_LENGTH_TAG = "LN"; +const std::string SAM_SQ_NAME_TAG = "SN"; +const std::string SAM_SQ_SPECIES_TAG = "SP"; +const std::string SAM_SQ_URI_TAG = "UR"; + +// RG entries +const std::string SAM_RG_BEGIN_TOKEN = "@RG"; +const std::string SAM_RG_DESCRIPTION_TAG = "DS"; +const std::string SAM_RG_FLOWORDER_TAG = "FO"; +const std::string SAM_RG_ID_TAG = "ID"; +const std::string SAM_RG_KEYSEQUENCE_TAG = "KS"; +const std::string SAM_RG_LIBRARY_TAG = "LB"; +const std::string SAM_RG_PLATFORMUNIT_TAG = "PU"; +const std::string SAM_RG_PREDICTEDINSERTSIZE_TAG = "PI"; +const std::string SAM_RG_PRODUCTIONDATE_TAG = "DT"; +const 
std::string SAM_RG_PROGRAM_TAG = "PG"; +const std::string SAM_RG_SAMPLE_TAG = "SM"; +const std::string SAM_RG_SEQCENTER_TAG = "CN"; +const std::string SAM_RG_SEQTECHNOLOGY_TAG = "PL"; + +// PG entries +const std::string SAM_PG_BEGIN_TOKEN = "@PG"; +const std::string SAM_PG_COMMANDLINE_TAG = "CL"; +const std::string SAM_PG_ID_TAG = "ID"; +const std::string SAM_PG_NAME_TAG = "PN"; +const std::string SAM_PG_PREVIOUSPROGRAM_TAG = "PP"; +const std::string SAM_PG_VERSION_TAG = "VN"; + +// CO entries +const std::string SAM_CO_BEGIN_TOKEN = "@CO"; + +// HD:SO values +const std::string SAM_HD_SORTORDER_COORDINATE = "coordinate"; +const std::string SAM_HD_SORTORDER_QUERYNAME = "queryname"; +const std::string SAM_HD_SORTORDER_UNKNOWN = "unknown"; +const std::string SAM_HD_SORTORDER_UNSORTED = "unsorted"; + +// HD:GO values +const std::string SAM_HD_GROUPORDER_NONE = "none"; +const std::string SAM_HD_GROUPORDER_QUERY = "query"; +const std::string SAM_HD_GROUPORDER_REFERENCE = "reference"; + +// SQ:LN values +const unsigned int SAM_SQ_LENGTH_MIN = 1; +const unsigned int SAM_SQ_LENGTH_MAX = 536870911; // 2^29 - 1 + +// RG:PL values +const std::string SAM_RG_SEQTECHNOLOGY_CAPILLARY = "CAPILLARY"; +const std::string SAM_RG_SEQTECHNOLOGY_HELICOS = "HELICOS"; +const std::string SAM_RG_SEQTECHNOLOGY_ILLUMINA = "ILLUMINA"; +const std::string SAM_RG_SEQTECHNOLOGY_IONTORRENT = "IONTORRENT"; +const std::string SAM_RG_SEQTECHNOLOGY_LS454 = "LS454"; +const std::string SAM_RG_SEQTECHNOLOGY_PACBIO = "PACBIO"; +const std::string SAM_RG_SEQTECHNOLOGY_SOLID = "SOLID"; + +} // namespace Constants +} // namespace BamTools + +#endif // SAM_CONSTANTS_H diff --git a/src/api/SamHeader.cpp b/src/api/SamHeader.cpp new file mode 100644 index 0000000..9429e81 --- /dev/null +++ b/src/api/SamHeader.cpp @@ -0,0 +1,246 @@ +// *************************************************************************** +// SamHeader.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// 
--------------------------------------------------------------------------- +// Last modified: 25 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides direct read/write access to the SAM header data fields. +// *************************************************************************** + +#include "api/SamHeader.h" +#include "api/SamConstants.h" +#include "api/internal/sam/SamFormatParser_p.h" +#include "api/internal/sam/SamFormatPrinter_p.h" +#include "api/internal/sam/SamHeaderValidator_p.h" +#include "api/internal/utils/BamException_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +/*! \struct BamTools::SamHeader + \brief Represents the SAM-formatted text header that is part of the BAM file header. + + Provides direct read/write access to the SAM header data fields. + + \sa \samSpecURL +*/ +/*! \var SamHeader::Version + \brief corresponds to \@HD VN:\<Version\> + + Required for valid SAM header, if \@HD record is present. +*/ +/*! \var SamHeader::SortOrder + \brief corresponds to \@HD SO:\<SortOrder\> +*/ +/*! \var SamHeader::GroupOrder + \brief corresponds to \@HD GO:\<GroupOrder\> +*/ +/*! \var SamHeader::Sequences + \brief corresponds to \@SQ entries + \sa SamSequence, SamSequenceDictionary +*/ +/*! \var SamHeader::ReadGroups + \brief corresponds to \@RG entries + \sa SamReadGroup, SamReadGroupDictionary +*/ +/*! \var SamHeader::Programs + \brief corresponds to \@PG entries + \sa SamProgram, SamProgramChain +*/ +/*! \var SamHeader::Comments + \brief corresponds to \@CO entries +*/ + +/*! \fn SamHeader::SamHeader(const std::string& headerText = std::string()) + \brief constructor +*/ +SamHeader::SamHeader(const std::string& headerText) + : SortOrder(Constants::SAM_HD_SORTORDER_UNKNOWN) +{ + SetHeaderText(headerText); +} + +/*! 
\fn SamHeader::SamHeader(const SamHeader& other) + \brief copy constructor +*/ +SamHeader::SamHeader(const SamHeader& other) + : Version(other.Version) + , SortOrder(other.SortOrder) + , GroupOrder(other.GroupOrder) + , CustomTags(other.CustomTags) + , Sequences(other.Sequences) + , ReadGroups(other.ReadGroups) + , Programs(other.Programs) + , Comments(other.Comments) + , m_errorString(other.GetErrorString()) +{} + +/*! \fn SamHeader::~SamHeader() + \brief destructor +*/ +SamHeader::~SamHeader() {} + +/*! \fn void SamHeader::Clear() + \brief Clears all header contents. +*/ +void SamHeader::Clear() +{ + + // clear SAM header components + Version.clear(); + SortOrder.clear(); + GroupOrder.clear(); + CustomTags.clear(); + Sequences.Clear(); + ReadGroups.Clear(); + Programs.Clear(); + Comments.clear(); + + // clear error string + m_errorString.clear(); +} + +/*! \fn std::string SamHeader::GetErrorString() const + \brief Returns a human-readable description of the last error that occurred + + This method allows elimination of STDERR pollution. Developers of client code + may choose how the messages are displayed to the user, if at all. + + \return error description +*/ +std::string SamHeader::GetErrorString() const +{ + return m_errorString; +} + +/*! \fn bool SamHeader::HasError() const + \brief Returns \c true if header encountered an error +*/ +bool SamHeader::HasError() const +{ + return (!m_errorString.empty()); +} + +/*! \fn bool SamHeader::HasVersion() const + \brief Returns \c true if header contains \@HD VN:\<Version\> +*/ +bool SamHeader::HasVersion() const +{ + return (!Version.empty()); +} + +/*! \fn bool SamHeader::HasSortOrder() const + \brief Returns \c true if header contains \@HD SO:\<SortOrder\> +*/ +bool SamHeader::HasSortOrder() const +{ + return (!SortOrder.empty()); +} + +/*! 
\fn bool SamHeader::HasGroupOrder() const + \brief Returns \c true if header contains \@HD GO:\<GroupOrder\> +*/ +bool SamHeader::HasGroupOrder() const +{ + return (!GroupOrder.empty()); +} + +/*! \fn bool SamHeader::HasSequences() const + \brief Returns \c true if header contains any \@SQ entries +*/ +bool SamHeader::HasSequences() const +{ + return (!Sequences.IsEmpty()); +} + +/*! \fn bool SamHeader::HasReadGroups() const + \brief Returns \c true if header contains any \@RG entries +*/ +bool SamHeader::HasReadGroups() const +{ + return (!ReadGroups.IsEmpty()); +} + +/*! \fn bool SamHeader::HasPrograms() const + \brief Returns \c true if header contains any \@PG entries +*/ +bool SamHeader::HasPrograms() const +{ + return (!Programs.IsEmpty()); +} + +/*! \fn bool SamHeader::HasComments() const + \brief Returns \c true if header contains any \@CO entries +*/ +bool SamHeader::HasComments() const +{ + return (!Comments.empty()); +} + +/*! \fn bool SamHeader::IsValid(bool verbose = false) const + \brief Checks header contents for required data and proper formatting. + + \param[in] verbose If set to true, validation errors & warnings will be printed to stderr. + Otherwise, messages are available through SamHeader::GetErrorString(). + \return \c true if SAM header is well-formed +*/ +bool SamHeader::IsValid(bool verbose) const +{ + + SamHeaderValidator validator(*this); + + // if SAM header is valid, return success + if (validator.Validate()) return true; + + // otherwise, report the validation messages and return failure + else { + + // print messages to stderr + if (verbose) validator.PrintMessages(std::cerr); + + // or catch in local error string + else { + std::stringstream errorStream; + validator.PrintMessages(errorStream); + m_errorString = errorStream.str(); + } + return false; + } +} + +/*! \fn void SamHeader::SetHeaderText(const std::string& headerText) + \brief Replaces header contents with \a headerText. 
+ + \param[in] headerText SAM formatted-text that will be parsed into data fields +*/ +void SamHeader::SetHeaderText(const std::string& headerText) +{ + + // clear prior data + Clear(); + + try { + SamFormatParser parser(*this); + parser.Parse(headerText); + } catch (BamException& e) { + + // clear anything parsed so far + // no telling what's valid and what's partially parsed + Clear(); + + // set error string + m_errorString = e.what(); + } +} + +/*! \fn std::string SamHeader::ToString() const + \brief Converts data fields to SAM-formatted text. + + Applies any local modifications made since creating this object or calling SetHeaderText(). + + \return SAM-formatted header text +*/ +std::string SamHeader::ToString() const +{ + SamFormatPrinter printer(*this); + return printer.ToString(); +} diff --git a/src/api/SamHeader.h b/src/api/SamHeader.h new file mode 100644 index 0000000..23534b1 --- /dev/null +++ b/src/api/SamHeader.h @@ -0,0 +1,78 @@ +// *************************************************************************** +// SamHeader.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides direct read/write access to the SAM header data fields. 
+// *************************************************************************** + +#ifndef SAM_HEADER_H +#define SAM_HEADER_H + +#include <string> +#include <vector> +#include "api/BamAux.h" +#include "api/SamProgramChain.h" +#include "api/SamReadGroupDictionary.h" +#include "api/SamSequenceDictionary.h" +#include "api/api_global.h" + +namespace BamTools { + +struct API_EXPORT SamHeader +{ + + // ctor & dtor + SamHeader(const std::string& headerText = std::string()); + SamHeader(const SamHeader& other); + ~SamHeader(); + + // query/modify entire SamHeader + void Clear(); // clears all header contents + std::string GetErrorString() const; + bool HasError() const; + bool IsValid(bool verbose = false) const; // returns true if SAM header is well-formed + void SetHeaderText( + const std::string& headerText); // replaces data fields with contents of SAM-formatted text + std::string ToString() const; // returns the printable, SAM-formatted header text + + // convenience query methods + bool HasVersion() const; // returns true if header contains format version entry + bool HasSortOrder() const; // returns true if header contains sort order entry + bool HasGroupOrder() const; // returns true if header contains group order entry + bool HasSequences() const; // returns true if header contains any sequence entries + bool HasReadGroups() const; // returns true if header contains any read group entries + bool HasPrograms() const; // returns true if header contains any program record entries + bool HasComments() const; // returns true if header contains comments + + // -------------- + // data members + // -------------- + + // header metadata (@HD line) + std::string Version; // VN:<Version> *Required, if @HD record is present* + std::string SortOrder; // SO:<SortOrder> + std::string GroupOrder; // GO:<GroupOrder> + std::vector<CustomHeaderTag> CustomTags; // optional custom tags on @HD line + + // header sequences (@SQ entries) + SamSequenceDictionary Sequences; + + // header 
read groups (@RG entries) + SamReadGroupDictionary ReadGroups; + + // header program data (@PG entries) + SamProgramChain Programs; + + // header comments (@CO entries) + std::vector<std::string> Comments; + + // internal data +private: + mutable std::string m_errorString; +}; + +} // namespace BamTools + +#endif // SAM_HEADER_H diff --git a/src/api/SamProgram.cpp b/src/api/SamProgram.cpp new file mode 100644 index 0000000..0c23f11 --- /dev/null +++ b/src/api/SamProgram.cpp @@ -0,0 +1,134 @@ +// *************************************************************************** +// SamProgram.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides direct read/write access to the SAM header program records. +// *************************************************************************** + +#include "api/SamProgram.h" +using namespace BamTools; + +/*! \struct BamTools::SamProgram + \brief Represents a SAM program record. + + Provides direct read/write access to the SAM header program records. + + \sa \samSpecURL +*/ +/*! \var SamProgram::CommandLine + \brief corresponds to \@PG CL:\<CommandLine\> +*/ +/*! \var SamProgram::ID + \brief corresponds to \@PG ID:\<ID\> + + Required for valid SAM header. +*/ +/*! \var SamProgram::Name + \brief corresponds to \@PG PN:\<Name\> +*/ +/*! \var SamProgram::PreviousProgramID + \brief corresponds to \@PG PP:\<PreviousProgramID\> +*/ +/*! \var SamProgram::Version + \brief corresponds to \@PG VN:\<Version\> +*/ +/*! \var SamProgram::NextProgramID + \internal + Holds ID of the "next" program record in a SamProgramChain +*/ + +/*! \fn SamProgram::SamProgram() + \brief default constructor +*/ +SamProgram::SamProgram() {} + +/*! 
\fn SamProgram::SamProgram(const std::string& id) + \brief constructs program record with \a id + + \param id desired program record ID +*/ +SamProgram::SamProgram(const std::string& id) + : ID(id) +{} + +/*! \fn SamProgram::SamProgram(const SamProgram& other) + \brief copy constructor +*/ +SamProgram::SamProgram(const SamProgram& other) + : CommandLine(other.CommandLine) + , ID(other.ID) + , Name(other.Name) + , PreviousProgramID(other.PreviousProgramID) + , Version(other.Version) + , CustomTags(other.CustomTags) + , NextProgramID(other.NextProgramID) +{} + +/*! \fn SamProgram::~SamProgram() + \brief destructor +*/ +SamProgram::~SamProgram() {} + +/*! \fn void SamProgram::Clear() + \brief Clears all data fields. + + Empties every standard tag value, the optional custom tags, and the + internal NextProgramID link. +*/ +void SamProgram::Clear() +{ + CommandLine.clear(); + ID.clear(); + Name.clear(); + PreviousProgramID.clear(); + Version.clear(); + // custom tags are copied by the copy constructor, so they must be reset here as well + CustomTags.clear(); + NextProgramID.clear(); +} + +/*! \fn bool SamProgram::HasCommandLine() const + \brief Returns \c true if program record contains \@PG: CL:\<CommandLine\> +*/ +bool SamProgram::HasCommandLine() const +{ + return (!CommandLine.empty()); +} + +/*! \fn bool SamProgram::HasID() const + \brief Returns \c true if program record contains \@PG: ID:\<ID\> +*/ +bool SamProgram::HasID() const +{ + return (!ID.empty()); +} + +/*! \fn bool SamProgram::HasName() const + \brief Returns \c true if program record contains \@PG: PN:\<Name\> +*/ +bool SamProgram::HasName() const +{ + return (!Name.empty()); +} + +/*! \fn bool SamProgram::HasNextProgramID() const + \internal + \return true if program has a "next" record in a SamProgramChain +*/ +bool SamProgram::HasNextProgramID() const +{ + return (!NextProgramID.empty()); +} + +/*! \fn bool SamProgram::HasPreviousProgramID() const + \brief Returns \c true if program record contains \@PG: PP:\<PreviousProgramID\> +*/ +bool SamProgram::HasPreviousProgramID() const +{ + return (!PreviousProgramID.empty()); +} + +/*! 
\fn bool SamProgram::HasVersion() const + \brief Returns \c true if program record contains \@PG: VN:\<Version\> +*/ +bool SamProgram::HasVersion() const +{ + return (!Version.empty()); +} diff --git a/src/api/SamProgram.h b/src/api/SamProgram.h new file mode 100644 index 0000000..b6e3017 --- /dev/null +++ b/src/api/SamProgram.h @@ -0,0 +1,66 @@ +// *************************************************************************** +// SamProgram.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides direct read/write access to the SAM header program records. +// *************************************************************************** + +#ifndef SAM_PROGRAM_H +#define SAM_PROGRAM_H + +#include <string> +#include "api/BamAux.h" +#include "api/api_global.h" + +namespace BamTools { + +class SamProgramChain; + +struct API_EXPORT SamProgram +{ + + // ctor & dtor + SamProgram(); + SamProgram(const std::string& id); + SamProgram(const SamProgram& other); + ~SamProgram(); + + // query/modify entire program record + void Clear(); // clears all data fields + + // convenience query methods + bool HasCommandLine() const; // returns true if program record has a command line entry + bool HasID() const; // returns true if program record has an ID + bool HasName() const; // returns true if program record has a name + bool HasPreviousProgramID() + const; // returns true if program record has a 'previous program ID' + bool HasVersion() const; // returns true if program record has a version + + // data members + std::string CommandLine; // CL:<CommandLine> + std::string ID; // ID:<ID> *Required for valid SAM header* + std::string Name; // PN:<Name> + std::string PreviousProgramID; // PP:<PreviousProgramID> + std::string Version; // VN:<Version> + 
std::vector<CustomHeaderTag> CustomTags; // optional custom tags + + // internal (non-standard) methods & fields +private: + bool HasNextProgramID() const; + std::string NextProgramID; + friend class BamTools::SamProgramChain; +}; + +/*! \fn bool operator==(const SamProgram& lhs, const SamProgram& rhs) + \brief tests equality by comparing program IDs +*/ +API_EXPORT inline bool operator==(const SamProgram& lhs, const SamProgram& rhs) +{ + return lhs.ID == rhs.ID; +} + +} // namespace BamTools + +#endif // SAM_PROGRAM_H diff --git a/src/api/SamProgramChain.cpp b/src/api/SamProgramChain.cpp new file mode 100644 index 0000000..d796def --- /dev/null +++ b/src/api/SamProgramChain.cpp @@ -0,0 +1,363 @@ +// *************************************************************************** +// SamProgramChain.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides methods for operating on a SamProgram record "chain" +// *************************************************************************** + +#include "api/SamProgramChain.h" +using namespace BamTools; + +#include <algorithm> +#include <cstdlib> +#include <iostream> + +/*! \class BamTools::SamProgramChain + \brief Sorted container "chain" of SamProgram records. + + Provides methods for operating on a collection of SamProgram records. + + \note Underlying container is *NOT* ordered by linkage, but by order of + appearance in SamHeader and subsequent Add() calls. Using the current + iterators will not allow you to step through the header's program history. + Instead use First()/Last() to access oldest/newest records, respectively. +*/ + +/*! \fn SamProgramChain::SamProgramChain() + \brief constructor +*/ +SamProgramChain::SamProgramChain() {} + +/*! 
\fn SamProgramChain::SamProgramChain(const SamProgramChain& other) + \brief copy constructor +*/ +SamProgramChain::SamProgramChain(const SamProgramChain& other) + : m_data(other.m_data) +{} + +/*! \fn SamProgramChain::~SamProgramChain() + \brief destructor +*/ +SamProgramChain::~SamProgramChain() {} + +/*! \fn void SamProgramChain::Add(SamProgram& program) + \brief Appends a program to program chain. + + Duplicate entries are silently discarded. + + \note Underlying container is *NOT* ordered by linkage, but by order of + appearance in SamHeader and subsequent Add() calls. Using the current + iterators will not allow you to step through the header's program history. + Instead use First()/Last() to access oldest/newest records, respectively. + + \param[in] program entry to be appended +*/ +void SamProgramChain::Add(SamProgram& program) +{ + + // ignore duplicated records + if (Contains(program)) return; + + // if other programs already in chain, try to find the "next" record + // tries to match another record's PPID with @program's ID + if (!IsEmpty()) program.NextProgramID = NextIdFor(program.ID); + + // store program record + m_data.push_back(program); +} + +/*! \fn void SamProgramChain::Add(std::vector<SamProgram>& programs) + \brief Appends a batch of programs to the end of the chain. + + This is an overloaded function. + + \param[in] programs batch of program records to append + \sa Add() +*/ +void SamProgramChain::Add(std::vector<SamProgram>& programs) +{ + std::vector<SamProgram>::iterator pgIter = programs.begin(); + std::vector<SamProgram>::iterator pgEnd = programs.end(); + for (; pgIter != pgEnd; ++pgIter) + Add(*pgIter); +} + +/*! \fn SamProgramIterator SamProgramChain::Begin() + \return an STL iterator pointing to the first (oldest) program record + \sa ConstBegin(), End(), First() +*/ +SamProgramIterator SamProgramChain::Begin() +{ + return m_data.begin(); +} + +/*! 
\fn SamProgramConstIterator SamProgramChain::Begin() const + \return an STL const_iterator pointing to the first (oldest) program record + + This is an overloaded function. + + \sa ConstBegin(), End(), First() +*/ +SamProgramConstIterator SamProgramChain::Begin() const +{ + return m_data.begin(); +} + +/*! \fn void SamProgramChain::Clear() + \brief Clears all program records. +*/ +void SamProgramChain::Clear() +{ + m_data.clear(); +} + +/*! \fn SamProgramConstIterator SamProgramChain::ConstBegin() const + \return an STL const_iterator pointing to the first (oldest) program record + \sa Begin(), ConstEnd(), First() +*/ +SamProgramConstIterator SamProgramChain::ConstBegin() const +{ + return m_data.begin(); +} + +/*! \fn SamProgramConstIterator SamProgramChain::ConstEnd() const + \return an STL const_iterator pointing to the imaginary entry after the last (newest) program record + \sa ConstBegin(), End(), Last() +*/ +SamProgramConstIterator SamProgramChain::ConstEnd() const +{ + return m_data.end(); +} + +/*! \fn bool SamProgramChain::Contains(const SamProgram& program) const + \brief Returns true if chains has this program record (matching on ID). + + This is an overloaded function. + + \param[in] program SamProgram to search for + \return \c true if chain contains program (matching on ID) +*/ +bool SamProgramChain::Contains(const SamProgram& program) const +{ + return Contains(program.ID); +} + +/*! \fn bool SamProgramChain::Contains(const std::string& programId) const + \brief Returns true if chains has a program record with this ID + + \param[in] programId search for program matching this ID + \return \c true if chain contains a program record with this ID +*/ +bool SamProgramChain::Contains(const std::string& programId) const +{ + return (IndexOf(programId) != (int)m_data.size()); +} + +/*! 
\fn SamProgramIterator SamProgramChain::End() + \return an STL iterator pointing to the imaginary entry after the last (newest) program record + \sa Begin(), ConstEnd(), Last() +*/ +SamProgramIterator SamProgramChain::End() +{ + return m_data.end(); +} + +/*! \fn SamProgramConstIterator SamProgramChain::End() const + \return an STL const_iterator pointing to the imaginary entry after the last (newest) program record + + This is an overloaded function. + + \sa Begin(), ConstEnd(), Last() +*/ +SamProgramConstIterator SamProgramChain::End() const +{ + return m_data.end(); +} + +/*! \fn SamProgram& SamProgramChain::First() + \brief Fetches first (oldest) record in the chain. + + \warning This function will fail if the chain is empty. If this is possible, + check the result of IsEmpty() before calling this function. + + \return a modifiable reference to the first (oldest) program entry + \sa Begin(), Last() +*/ +SamProgram& SamProgramChain::First() +{ + + // find first record in container that has no PreviousProgramID entry + SamProgramIterator iter = Begin(); + SamProgramIterator end = End(); + for (; iter != end; ++iter) { + SamProgram& current = (*iter); + if (!current.HasPreviousProgramID()) return current; + } + + // otherwise error + std::cerr << "SamProgramChain::First: could not find any record without a PP tag" << std::endl; + std::exit(EXIT_FAILURE); +} + +/*! \fn const SamProgram& SamProgramChain::First() const + \brief Fetches first (oldest) record in the chain. + + This is an overloaded function. + + \warning This function will fail if the chain is empty. If this is possible, + check the result of IsEmpty() before calling this function. 
+ + \return a read-only reference to the first (oldest) program entry + \sa Begin(), ConstBegin(), Last() +*/ +const SamProgram& SamProgramChain::First() const +{ + + // find first record in container that has no PreviousProgramID entry + SamProgramConstIterator iter = ConstBegin(); + SamProgramConstIterator end = ConstEnd(); + for (; iter != end; ++iter) { + const SamProgram& current = (*iter); + if (!current.HasPreviousProgramID()) return current; + } + + // otherwise error + std::cerr << "SamProgramChain::First: could not find any record without a PP tag" << std::endl; + std::exit(EXIT_FAILURE); +} + +/*! \fn int SamProgramChain::IndexOf(const std::string& programId) const + \internal + \return index of program record if found. + Otherwise, returns vector::size() (invalid index). +*/ +int SamProgramChain::IndexOf(const std::string& programId) const +{ + SamProgramConstIterator begin = ConstBegin(); + SamProgramConstIterator iter = begin; + SamProgramConstIterator end = ConstEnd(); + for (; iter != end; ++iter) { + const SamProgram& current = (*iter); + if (current.ID == programId) break; + } + return distance(begin, iter); +} + +/*! \fn bool SamProgramChain::IsEmpty() const + \brief Returns \c true if chain contains no records + \sa Size() +*/ +bool SamProgramChain::IsEmpty() const +{ + return m_data.empty(); +} + +/*! \fn SamProgram& SamProgramChain::Last() + \brief Fetches last (newest) record in the chain. + + \warning This function will fail if the chain is empty. If this is possible, + check the result of IsEmpty() before calling this function. 
+ + \return a modifiable reference to the last (newest) program entry + \sa End(), First() +*/ +SamProgram& SamProgramChain::Last() +{ + // find first record in container that has no NextProgramID entry + SamProgramIterator iter = Begin(); + SamProgramIterator end = End(); + for (; iter != end; ++iter) { + SamProgram& current = (*iter); + if (!current.HasNextProgramID()) return current; + } + + // otherwise error + std::cerr << "SamProgramChain::Last: could not determine last record" << std::endl; + std::exit(EXIT_FAILURE); +} + +/*! \fn const SamProgram& SamProgramChain::Last() const + \brief Fetches last (newest) record in the chain. + + This is an overloaded function. + + \warning This function will fail if the chain is empty. If this is possible, + check the result of IsEmpty() before calling this function. + + \return a read-only reference to the last (newest) program entry + \sa End(), ConstEnd(), First() +*/ +const SamProgram& SamProgramChain::Last() const +{ + // find first record in container that has no NextProgramID entry + SamProgramConstIterator iter = ConstBegin(); + SamProgramConstIterator end = ConstEnd(); + for (; iter != end; ++iter) { + const SamProgram& current = (*iter); + if (!current.HasNextProgramID()) return current; + } + + // otherwise error + std::cerr << "SamProgramChain::Last: could not determine last record" << std::endl; + std::exit(EXIT_FAILURE); +} + +/*! \fn const std::string SamProgramChain::NextIdFor(const std::string& programId) const + \internal + + \return ID of program record, whose PreviousProgramID matches \a programId. + Otherwise, returns empty string if none found. 
+*/ +const std::string SamProgramChain::NextIdFor(const std::string& programId) const +{ + + // find first record in container whose PreviousProgramID matches @programId + SamProgramConstIterator iter = ConstBegin(); + SamProgramConstIterator end = ConstEnd(); + for (; iter != end; ++iter) { + const SamProgram& current = (*iter); + if (!current.HasPreviousProgramID() && current.PreviousProgramID == programId) { + return current.ID; + } + } + + // none found + return std::string(); +} + +/*! \fn int SamProgramChain::Size() const + \brief Returns number of program records in the chain. + \sa IsEmpty() +*/ +int SamProgramChain::Size() const +{ + return m_data.size(); +} + +/*! \fn SamProgram& SamProgramChain::operator[](const std::string& programId) + \brief Retrieves the modifiable SamProgram record that matches \a programId. + + \warning If the chain contains no read group matching this ID, this function will + print an error and terminate. Check the return value of Contains() if this may be + possible. 
/*! \fn SamProgram& SamProgramChain::operator[](const std::string& programId)
    \brief Retrieves the modifiable SamProgram record that matches \a programId.

    \warning If the chain contains no program record matching this ID, this function will
    print an error and terminate. Check the return value of Contains() if this may be
    possible.

    \param[in] programId ID of program record to retrieve
    \return a modifiable reference to the SamProgram associated with the ID
*/
SamProgram& SamProgramChain::operator[](const std::string& programId)
{

    // look up program record matching this ID
    int index = IndexOf(programId);

    // if record not found (IndexOf() reports a miss as the container size)
    if (index == (int)m_data.size()) {
        std::cerr << "SamProgramChain::operator[] - unknown programId: " << programId << std::endl;
        std::exit(EXIT_FAILURE);
    }

    // otherwise return program record at index
    return m_data.at(index);
}

// diff --git a/src/api/SamProgramChain.h b/src/api/SamProgramChain.h (new file mode 100644, index 0000000..9e61857)

// ***************************************************************************
// SamProgramChain.h (c) 2011 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
// Last modified: 10 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides methods for operating on a SamProgram record "chain"
// ***************************************************************************

#ifndef SAM_PROGRAMCHAIN_H
#define SAM_PROGRAMCHAIN_H

#include <string>
#include <vector>
#include "api/SamProgram.h"
#include "api/api_global.h"

namespace BamTools {

// chain is *NOT* sorted in any order
// use First()/Last() to retrieve oldest/newest programs, respectively
typedef std::vector<SamProgram> SamProgramContainer;
typedef SamProgramContainer::iterator SamProgramIterator;
typedef SamProgramContainer::const_iterator SamProgramConstIterator;

// Collection of SamProgram records, stored in insertion order.
class API_EXPORT SamProgramChain
{

    // ctor & dtor
public:
    SamProgramChain();
    SamProgramChain(const SamProgramChain& other);
    ~SamProgramChain();

    // query/modify program data
public:
    // appends a program record to the chain
    // (takes a non-const reference: the implementation updates the record's
    //  internal linkage field before storing a copy)
    void Add(SamProgram& program);
    void Add(std::vector<SamProgram>& programs);

    // clears all program records
    void Clear();

    // returns true if chain contains this program record (matches on ID)
    bool Contains(const SamProgram& program) const;
    bool Contains(const std::string& programId) const;

    // returns the first (oldest) program in the chain
    SamProgram& First();
    const SamProgram& First() const;

    // returns true if chain is empty
    bool IsEmpty() const;

    // returns last (most recent) program in the chain
    SamProgram& Last();
    const SamProgram& Last() const;

    // returns number of program records in the chain
    int Size() const;

    // retrieves a modifiable reference to the SamProgram object associated with this ID
    SamProgram& operator[](const std::string& programId);

    // retrieve STL-compatible iterators
public:
    SamProgramIterator Begin();                  // returns iterator to begin()
    SamProgramConstIterator Begin() const;       // returns const_iterator to begin()
    SamProgramConstIterator ConstBegin() const;  // returns const_iterator to begin()
    SamProgramIterator End();                    // returns iterator to end()
    SamProgramConstIterator End() const;         // returns const_iterator to end()
    SamProgramConstIterator ConstEnd() const;    // returns const_iterator to end()

    // internal methods
private:
    // index of the record with this ID, or the container size if absent
    int IndexOf(const std::string& programId) const;
    // ID of the record whose PP tag names programId, or empty string if none
    const std::string NextIdFor(const std::string& programId) const;

    // data members
private:
    SamProgramContainer m_data;
};

}  // namespace BamTools

#endif  // SAM_PROGRAMCHAIN_H

// diff --git a/src/api/SamReadGroup.cpp b/src/api/SamReadGroup.cpp (new file mode 100644, index 0000000..259c6ba)

// ***************************************************************************
// SamReadGroup.cpp (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
// Last modified: 10
// October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides direct read/write access to the SAM read group data fields.
// ***************************************************************************

#include "api/SamReadGroup.h"
using namespace BamTools;

/*! \struct BamTools::SamReadGroup
    \brief Represents a SAM read group entry.

    Provides direct read/write access to the SAM read group data fields.
    Every field is a string; an empty string means the tag is absent.

    \sa \samSpecURL
*/
/*! \var SamReadGroup::Description
    \brief corresponds to \@RG DS:\<Description\>
*/
/*! \var SamReadGroup::FlowOrder
    \brief corresponds to \@RG FO:\<FlowOrder\>
*/
/*! \var SamReadGroup::ID
    \brief corresponds to \@RG ID:\<ID\>

    Required for valid SAM header.
*/
/*! \var SamReadGroup::KeySequence
    \brief corresponds to \@RG KS:\<KeySequence\>
*/
/*! \var SamReadGroup::Library
    \brief corresponds to \@RG LB:\<Library\>
*/
/*! \var SamReadGroup::PlatformUnit
    \brief corresponds to \@RG PU:\<PlatformUnit\>
*/
/*! \var SamReadGroup::PredictedInsertSize
    \brief corresponds to \@RG PI:\<PredictedInsertSize\>
*/
/*! \var SamReadGroup::ProductionDate
    \brief corresponds to \@RG DT:\<ProductionDate\>
*/
/*! \var SamReadGroup::Program
    \brief corresponds to \@RG PG:\<Program\>
*/
/*! \var SamReadGroup::Sample
    \brief corresponds to \@RG SM:\<Sample\>
*/
/*! \var SamReadGroup::SequencingCenter
    \brief corresponds to \@RG CN:\<SequencingCenter\>
*/
/*! \var SamReadGroup::SequencingTechnology
    \brief corresponds to \@RG PL:\<SequencingTechnology\>
*/

/*! \fn SamReadGroup::SamReadGroup()
    \brief default constructor; all fields start out empty
*/
SamReadGroup::SamReadGroup() {}

/*! \fn SamReadGroup::SamReadGroup(const std::string& id)
    \brief constructs read group with \a id; all other fields start out empty

    \param id desired read group ID
*/
SamReadGroup::SamReadGroup(const std::string& id)
    : ID(id)
{}
\fn SamReadGroup::SamReadGroup(const SamReadGroup& other) + \brief copy constructor +*/ +SamReadGroup::SamReadGroup(const SamReadGroup& other) + : Description(other.Description) + , FlowOrder(other.FlowOrder) + , ID(other.ID) + , KeySequence(other.KeySequence) + , Library(other.Library) + , PlatformUnit(other.PlatformUnit) + , PredictedInsertSize(other.PredictedInsertSize) + , ProductionDate(other.ProductionDate) + , Program(other.Program) + , Sample(other.Sample) + , SequencingCenter(other.SequencingCenter) + , SequencingTechnology(other.SequencingTechnology) + , CustomTags(other.CustomTags) +{} + +/*! \fn SamReadGroup::~SamReadGroup() + \brief destructor +*/ +SamReadGroup::~SamReadGroup() {} + +/*! \fn void SamReadGroup::Clear() + \brief Clears all data fields. +*/ +void SamReadGroup::Clear() +{ + Description.clear(); + FlowOrder.clear(); + ID.clear(); + KeySequence.clear(); + Library.clear(); + PlatformUnit.clear(); + PredictedInsertSize.clear(); + ProductionDate.clear(); + Program.clear(); + Sample.clear(); + SequencingCenter.clear(); + SequencingTechnology.clear(); + CustomTags.clear(); +} + +/*! \fn bool SamReadGroup::HasDescription() const + \brief Returns \c true if read group contains \@RG DS:\<Description\> +*/ +bool SamReadGroup::HasDescription() const +{ + return (!Description.empty()); +} + +/*! \fn bool SamReadGroup::HasFlowOrder() const + \brief Returns \c true if read group contains \@RG FO:\<FlowOrder\> +*/ +bool SamReadGroup::HasFlowOrder() const +{ + return (!FlowOrder.empty()); +} + +/*! \fn bool SamReadGroup::HasID() const + \brief Returns \c true if read group contains \@RG: ID:\<ID\> +*/ +bool SamReadGroup::HasID() const +{ + return (!ID.empty()); +} + +/*! \fn bool SamReadGroup::HasKeySequence() const + \brief Returns \c true if read group contains \@RG KS:\<KeySequence\> +*/ +bool SamReadGroup::HasKeySequence() const +{ + return (!KeySequence.empty()); +} + +/*! 
\fn bool SamReadGroup::HasLibrary() const + \brief Returns \c true if read group contains \@RG LB:\<Library\> +*/ +bool SamReadGroup::HasLibrary() const +{ + return (!Library.empty()); +} + +/*! \fn bool SamReadGroup::HasPlatformUnit() const + \brief Returns \c true if read group contains \@RG PU:\<PlatformUnit\> +*/ +bool SamReadGroup::HasPlatformUnit() const +{ + return (!PlatformUnit.empty()); +} + +/*! \fn bool SamReadGroup::HasPredictedInsertSize() const + \brief Returns \c true if read group contains \@RG PI:\<PredictedInsertSize\> +*/ +bool SamReadGroup::HasPredictedInsertSize() const +{ + return (!PredictedInsertSize.empty()); +} + +/*! \fn bool SamReadGroup::HasProductionDate() const + \brief Returns \c true if read group contains \@RG DT:\<ProductionDate\> +*/ +bool SamReadGroup::HasProductionDate() const +{ + return (!ProductionDate.empty()); +} + +/*! \fn bool SamReadGroup::HasProgram() const + \brief Returns \c true if read group contains \@RG PG:\<Program\> +*/ +bool SamReadGroup::HasProgram() const +{ + return (!Program.empty()); +} + +/*! \fn bool SamReadGroup::HasSample() const + \brief Returns \c true if read group contains \@RG SM:\<Sample\> +*/ +bool SamReadGroup::HasSample() const +{ + return (!Sample.empty()); +} + +/*! \fn bool SamReadGroup::HasSequencingCenter() const + \brief Returns \c true if read group contains \@RG CN:\<SequencingCenter\> +*/ +bool SamReadGroup::HasSequencingCenter() const +{ + return (!SequencingCenter.empty()); +} + +/*! 
/*! \fn bool SamReadGroup::HasSequencingTechnology() const
    \brief Returns \c true if read group contains \@RG PL:\<SequencingTechnology\>
*/
bool SamReadGroup::HasSequencingTechnology() const
{
    return (!SequencingTechnology.empty());
}

// diff --git a/src/api/SamReadGroup.h b/src/api/SamReadGroup.h (new file mode 100644, index 0000000..96896e5)

// ***************************************************************************
// SamReadGroup.h (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
// Last modified: 10 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides direct read/write access to the SAM read group data fields.
// ***************************************************************************

#ifndef SAM_READGROUP_H
#define SAM_READGROUP_H

#include <string>
#include <vector>
#include "api/BamAux.h"
#include "api/api_global.h"

namespace BamTools {

// One @RG (read group) record of a SAM header. All fields are plain strings;
// an empty string means "tag not present" (see the Has*() helpers).
struct API_EXPORT SamReadGroup
{

    // ctor & dtor
    SamReadGroup();
    SamReadGroup(const std::string& id);
    SamReadGroup(const SamReadGroup& other);
    ~SamReadGroup();

    // query/modify entire read group
    void Clear();  // clears all data fields

    // convenience query methods
    bool HasDescription() const;          // returns true if read group has a description
    bool HasFlowOrder() const;            // returns true if read group has a flow order entry
    bool HasID() const;                   // returns true if read group has a group ID
    bool HasKeySequence() const;          // returns true if read group has a key sequence
    bool HasLibrary() const;              // returns true if read group has a library name
    bool HasPlatformUnit() const;         // returns true if read group has a platform unit ID
    bool HasPredictedInsertSize() const;  // returns true if read group has a predicted insert size
    bool HasProductionDate() const;       // returns true if read group has a production date
    bool HasProgram() const;              // returns true if read group has a program entry
    bool HasSample() const;               // returns true if read group has a sample name
    bool HasSequencingCenter() const;     // returns true if read group has a sequencing center ID
    bool HasSequencingTechnology()
        const;  // returns true if read group has a sequencing technology ID

    // data fields
    std::string Description;                  // DS:<Description>
    std::string FlowOrder;                    // FO:<FlowOrder>
    std::string ID;                           // ID:<ID>          *Required for valid SAM header*
    std::string KeySequence;                  // KS:<KeySequence>
    std::string Library;                      // LB:<Library>
    std::string PlatformUnit;                 // PU:<PlatformUnit>
    std::string PredictedInsertSize;          // PI:<PredictedInsertSize>
    std::string ProductionDate;               // DT:<ProductionDate>
    std::string Program;                      // PG:<Program>
    std::string Sample;                       // SM:<Sample>
    std::string SequencingCenter;             // CN:<SequencingCenter>
    std::string SequencingTechnology;         // PL:<SequencingTechnology>
    std::vector<CustomHeaderTag> CustomTags;  // optional custom tags
};
/*! \fn bool operator==(const SamReadGroup& lhs, const SamReadGroup& rhs)
    \brief tests equality by comparing read group IDs (no other field is compared)
*/
API_EXPORT inline bool operator==(const SamReadGroup& lhs, const SamReadGroup& rhs)
{
    return lhs.ID == rhs.ID;
}

}  // namespace BamTools

#endif  // SAM_READGROUP_H

// diff --git a/src/api/SamReadGroupDictionary.cpp b/src/api/SamReadGroupDictionary.cpp (new file mode 100644, index 0000000..ec88031)

// ***************************************************************************
// SamReadGroupDictionary.cpp (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
// Last modified: 16 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides methods for operating on a collection of SamReadGroup entries.
// ***************************************************************************

#include "api/SamReadGroupDictionary.h"
using namespace BamTools;

#include <cstddef>
#include <iostream>

/*! \class BamTools::SamReadGroupDictionary
    \brief Container of SamReadGroup entries.

    Provides methods for operating on a collection of SamReadGroup entries.
    Entries are kept in insertion order in \c m_data; \c m_lookupData maps a
    read group ID to the entry's index in \c m_data.
*/

/*! \fn SamReadGroupDictionary::SamReadGroupDictionary()
    \brief constructor
*/
SamReadGroupDictionary::SamReadGroupDictionary() {}

/*! \fn SamReadGroupDictionary::SamReadGroupDictionary(const SamReadGroupDictionary& other)
    \brief copy constructor; copies both the entries and the ID lookup table
*/
SamReadGroupDictionary::SamReadGroupDictionary(const SamReadGroupDictionary& other)
    : m_data(other.m_data)
    , m_lookupData(other.m_lookupData)
{}

/*! \fn SamReadGroupDictionary::~SamReadGroupDictionary()
    \brief destructor
*/
SamReadGroupDictionary::~SamReadGroupDictionary() {}
\fn void SamReadGroupDictionary::Add(const SamReadGroup& readGroup) + \brief Appends a read group to the dictionary. + + Duplicate entries are silently discarded. + + \param[in] readGroup entry to be added +*/ +void SamReadGroupDictionary::Add(const SamReadGroup& readGroup) +{ + if (IsEmpty() || !Contains(readGroup)) { + m_data.push_back(readGroup); + m_lookupData[readGroup.ID] = m_data.size() - 1; + } +} + +/*! \fn void SamReadGroupDictionary::Add(const std::string& readGroupId) + \brief Appends a read group to the dictionary. + + This is an overloaded function. + + \param[in] readGroupId ID of read group to be added + \sa Add() +*/ +void SamReadGroupDictionary::Add(const std::string& readGroupId) +{ + Add(SamReadGroup(readGroupId)); +} + +/*! \fn void SamReadGroupDictionary::Add(const SamReadGroupDictionary& readGroups) + \brief Appends another read group dictionary to this one. + + This is an overloaded function. + + \param[in] readGroups entries to be added + \sa Add() +*/ +void SamReadGroupDictionary::Add(const SamReadGroupDictionary& readGroups) +{ + SamReadGroupConstIterator rgIter = readGroups.ConstBegin(); + SamReadGroupConstIterator rgEnd = readGroups.ConstEnd(); + for (; rgIter != rgEnd; ++rgIter) + Add(*rgIter); +} + +/*! \fn void SamReadGroupDictionary::Add(const std::vector<SamReadGroup>& readGroups) + \brief Appends multiple read groups to the dictionary. + + This is an overloaded function. + + \param[in] readGroups entries to be added + \sa Add() +*/ +void SamReadGroupDictionary::Add(const std::vector<SamReadGroup>& readGroups) +{ + std::vector<SamReadGroup>::const_iterator rgIter = readGroups.begin(); + std::vector<SamReadGroup>::const_iterator rgEnd = readGroups.end(); + for (; rgIter != rgEnd; ++rgIter) + Add(*rgIter); +} + +/*! \fn void SamReadGroupDictionary::Add(const std::vector<std::string>& readGroupIds) + \brief Appends multiple read groups to the dictionary. + + This is an overloaded function. 
+ + \param[in] readGroupIds IDs of read groups to be added + \sa Add() +*/ +void SamReadGroupDictionary::Add(const std::vector<std::string>& readGroupIds) +{ + std::vector<std::string>::const_iterator rgIter = readGroupIds.begin(); + std::vector<std::string>::const_iterator rgEnd = readGroupIds.end(); + for (; rgIter != rgEnd; ++rgIter) + Add(*rgIter); +} + +/*! \fn SamReadGroupIterator SamReadGroupDictionary::Begin() + \return an STL iterator pointing to the first read group + \sa ConstBegin(), End() +*/ +SamReadGroupIterator SamReadGroupDictionary::Begin() +{ + return m_data.begin(); +} + +/*! \fn SamReadGroupConstIterator SamReadGroupDictionary::Begin() const + \return an STL const_iterator pointing to the first read group + + This is an overloaded function. + + \sa ConstBegin(), End() +*/ +SamReadGroupConstIterator SamReadGroupDictionary::Begin() const +{ + return m_data.begin(); +} + +/*! \fn void SamReadGroupDictionary::Clear() + \brief Clears all read group entries. +*/ +void SamReadGroupDictionary::Clear() +{ + m_data.clear(); + m_lookupData.clear(); +} + +/*! \fn SamReadGroupConstIterator SamReadGroupDictionary::ConstBegin() const + \return an STL const_iterator pointing to the first read group + \sa Begin(), ConstEnd() +*/ +SamReadGroupConstIterator SamReadGroupDictionary::ConstBegin() const +{ + return m_data.begin(); +} + +/*! \fn SamReadGroupConstIterator SamReadGroupDictionary::ConstEnd() const + \return an STL const_iterator pointing to the imaginary entry after the last read group + \sa ConstBegin(), End() +*/ +SamReadGroupConstIterator SamReadGroupDictionary::ConstEnd() const +{ + return m_data.end(); +} + +/*! \fn bool SamReadGroupDictionary::Contains(const std::string& readGroupId) const + \brief Returns true if dictionary contains read group. 
+ + \param[in] readGroupId search for read group matching this ID + \return \c true if dictionary contains a read group with this ID +*/ +bool SamReadGroupDictionary::Contains(const std::string& readGroupId) const +{ + return (m_lookupData.find(readGroupId) != m_lookupData.end()); +} + +/*! \fn bool SamReadGroupDictionary::Contains(const SamReadGroup& readGroup) const + \brief Returns true if dictionary contains read group (matching on ID). + + This is an overloaded function. + + \param[in] readGroup search for this read group + \return \c true if dictionary contains read group (matching on ID). +*/ +bool SamReadGroupDictionary::Contains(const SamReadGroup& readGroup) const +{ + return Contains(readGroup.ID); +} + +/*! \fn SamReadGroupIterator SamReadGroupDictionary::End() + \return an STL iterator pointing to the imaginary entry after the last read group + \sa Begin(), ConstEnd() +*/ +SamReadGroupIterator SamReadGroupDictionary::End() +{ + return m_data.end(); +} + +/*! \fn SamReadGroupConstIterator SamReadGroupDictionary::End() const + \return an STL const_iterator pointing to the imaginary entry after the last read group + + This is an overloaded function. + + \sa Begin(), ConstEnd() +*/ +SamReadGroupConstIterator SamReadGroupDictionary::End() const +{ + return m_data.end(); +} + +/*! \fn bool SamReadGroupDictionary::IsEmpty() const + \brief Returns \c true if dictionary contains no read groups + \sa Size() +*/ +bool SamReadGroupDictionary::IsEmpty() const +{ + return m_data.empty(); +} + +/*! \fn void SamReadGroupDictionary::Remove(const SamReadGroup& readGroup) + \brief Removes read group from dictionary, if found (matching on ID). + + This is an overloaded function. + + \param[in] readGroup read group to remove (matches on ID) +*/ +void SamReadGroupDictionary::Remove(const SamReadGroup& readGroup) +{ + Remove(readGroup.ID); +} + +/*! \fn void SamReadGroupDictionary::Remove(const std::string& readGroupId) + \brief Removes read group from dictionary, if found. 
+ + \param[in] readGroupId ID of read group to remove + \sa Remove() +*/ +void SamReadGroupDictionary::Remove(const std::string& readGroupId) +{ + + // skip if empty dictionary or if ID unknown + if (IsEmpty() || !Contains(readGroupId)) return; + + // update 'lookup index' for every entry after @readGroupId + const std::size_t indexToRemove = m_lookupData[readGroupId]; + const std::size_t numEntries = m_data.size(); + for (std::size_t i = indexToRemove + 1; i < numEntries; ++i) { + const SamReadGroup& rg = m_data.at(i); + --m_lookupData[rg.ID]; + } + + // erase entry from containers + m_data.erase(Begin() + indexToRemove); + m_lookupData.erase(readGroupId); +} + +/*! \fn void SamReadGroupDictionary::Remove(const std::vector<SamReadGroup>& readGroups) + \brief Removes multiple read groups from dictionary (matching on ID). + + This is an overloaded function. + + \param[in] readGroups read groups to remove + \sa Remove() +*/ +void SamReadGroupDictionary::Remove(const std::vector<SamReadGroup>& readGroups) +{ + std::vector<SamReadGroup>::const_iterator rgIter = readGroups.begin(); + std::vector<SamReadGroup>::const_iterator rgEnd = readGroups.end(); + for (; rgIter != rgEnd; ++rgIter) + Remove(*rgIter); +} + +/*! \fn void SamReadGroupDictionary::Remove(const std::vector<std::string>& readGroupIds) + \brief Removes multiple read groups from dictionary. + + This is an overloaded function. + + \param[in] readGroupIds IDs of the read groups to remove + \sa Remove() +*/ +void SamReadGroupDictionary::Remove(const std::vector<std::string>& readGroupIds) +{ + std::vector<std::string>::const_iterator rgIter = readGroupIds.begin(); + std::vector<std::string>::const_iterator rgEnd = readGroupIds.end(); + for (; rgIter != rgEnd; ++rgIter) + Remove(*rgIter); +} + +/*! \fn int SamReadGroupDictionary::Size() const + \brief Returns number of read groups in dictionary. + \sa IsEmpty() +*/ +int SamReadGroupDictionary::Size() const +{ + return m_data.size(); +} + +/*! 
\fn SamReadGroup& SamReadGroupDictionary::operator[](const std::string& readGroupId) + \brief Retrieves the modifiable SamReadGroup that matches \a readGroupId. + + \note If the dictionary contains no read group matching this ID, this function inserts + a new one with this ID, and returns a reference to it. If you want to avoid this insertion + behavior, check the result of Contains() before using this operator. + + \param[in] readGroupId ID of read group to retrieve + \return a modifiable reference to the SamReadGroup associated with the ID +*/ +SamReadGroup& SamReadGroupDictionary::operator[](const std::string& readGroupId) +{ + + if (!Contains(readGroupId)) { + SamReadGroup rg(readGroupId); + m_data.push_back(rg); + m_lookupData[readGroupId] = m_data.size() - 1; + } + + const std::size_t index = m_lookupData[readGroupId]; + return m_data.at(index); +} diff --git a/src/api/SamReadGroupDictionary.h b/src/api/SamReadGroupDictionary.h new file mode 100644 index 0000000..79df6ca --- /dev/null +++ b/src/api/SamReadGroupDictionary.h @@ -0,0 +1,87 @@ +// *************************************************************************** +// SamReadGroupDictionary.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 16 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides methods for operating on a collection of SamReadGroup entries. 
+// *************************************************************************** + +#ifndef SAM_READGROUP_DICTIONARY_H +#define SAM_READGROUP_DICTIONARY_H + +#include <cstddef> +#include <map> +#include <string> +#include <vector> +#include "api/SamReadGroup.h" +#include "api/api_global.h" + +namespace BamTools { + +typedef std::vector<SamReadGroup> SamReadGroupContainer; +typedef SamReadGroupContainer::iterator SamReadGroupIterator; +typedef SamReadGroupContainer::const_iterator SamReadGroupConstIterator; + +class API_EXPORT SamReadGroupDictionary +{ + + // ctor & dtor +public: + SamReadGroupDictionary(); + SamReadGroupDictionary(const SamReadGroupDictionary& other); + ~SamReadGroupDictionary(); + + // query/modify read group data +public: + // adds a read group + void Add(const SamReadGroup& readGroup); + void Add(const std::string& readGroupId); + + // adds multiple read groups + void Add(const SamReadGroupDictionary& readGroups); + void Add(const std::vector<SamReadGroup>& readGroups); + void Add(const std::vector<std::string>& readGroupIds); + + // clears all read group entries + void Clear(); + + // returns true if dictionary contains this read group + bool Contains(const SamReadGroup& readGroup) const; + bool Contains(const std::string& readGroupId) const; + + // returns true if dictionary is empty + bool IsEmpty() const; + + // removes read group, if found + void Remove(const SamReadGroup& readGroup); + void Remove(const std::string& readGroupId); + + // removes multiple read groups + void Remove(const std::vector<SamReadGroup>& readGroups); + void Remove(const std::vector<std::string>& readGroupIds); + + // returns number of read groups in dictionary + int Size() const; + + // retrieves a modifiable reference to the SamReadGroup object associated with this ID + SamReadGroup& operator[](const std::string& readGroupId); + + // retrieve STL-compatible iterators +public: + SamReadGroupIterator Begin(); // returns iterator to begin() + SamReadGroupConstIterator 
Begin() const; // returns const_iterator to begin() + SamReadGroupConstIterator ConstBegin() const; // returns const_iterator to begin() + SamReadGroupIterator End(); // returns iterator to end() + SamReadGroupConstIterator End() const; // returns const_iterator to end() + SamReadGroupConstIterator ConstEnd() const; // returns const_iterator to end() + + // data members +private: + SamReadGroupContainer m_data; + std::map<std::string, std::size_t> m_lookupData; +}; + +} // namespace BamTools + +#endif // SAM_READGROUP_DICTIONARY_H diff --git a/src/api/SamSequence.cpp b/src/api/SamSequence.cpp new file mode 100644 index 0000000..8b4bcfa --- /dev/null +++ b/src/api/SamSequence.cpp @@ -0,0 +1,152 @@ +// *************************************************************************** +// SamSequence.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides direct read/write access to the SAM sequence data fields. +// *************************************************************************** + +#include "api/SamSequence.h" +#include <sstream> +using namespace BamTools; + +/*! \struct BamTools::SamSequence + \brief Represents a SAM sequence entry. + + Provides direct read/write access to the SAM sequence data fields. + + \sa \samSpecURL +*/ +/*! \var SamSequence::AssemblyID + \brief corresponds to \@SQ AS:\<AssemblyID\> +*/ +/*! \var SamSequence::Checksum + \brief corresponds to \@SQ M5:\<Checksum\> +*/ +/*! \var SamSequence::Length + \brief corresponds to \@SQ LN:\<Length\> + + Required for valid SAM header. +*/ +/*! \var SamSequence::Name + \brief corresponds to \@SQ SN:\<Name\> + + Required for valid SAM header. +*/ +/*! \var SamSequence::Species + \brief corresponds to \@SQ SP:\<Species\> +*/ +/*! 
\var SamSequence::URI + \brief corresponds to \@SQ UR:\<URI\> +*/ + +/*! \fn SamSequence::SamSequence() + \brief default constructor +*/ +SamSequence::SamSequence() {} + +/*! \fn SamSequence::SamSequence(const std::string& name, const int& length) + \brief constructs sequence with \a name and \a length + + \param name desired sequence name + \param length desired sequence length (numeric value) +*/ +SamSequence::SamSequence(const std::string& name, const int& length) + : Name(name) +{ + std::stringstream s; + s << length; + Length = s.str(); +} + +/*! \fn SamSequence::SamSequence(const std::string& name, const std::string& length) + \brief constructs sequence with \a name and \a length + + \param name desired sequence name + \param length desired sequence length (string value) +*/ +SamSequence::SamSequence(const std::string& name, const std::string& length) + : Length(length) + , Name(name) +{} + +/*! \fn SamSequence::SamSequence(const SamSequence& other) + \brief copy constructor +*/ +SamSequence::SamSequence(const SamSequence& other) + : AssemblyID(other.AssemblyID) + , Checksum(other.Checksum) + , Length(other.Length) + , Name(other.Name) + , Species(other.Species) + , URI(other.URI) + , CustomTags(other.CustomTags) +{} + +/*! \fn SamSequence::~SamSequence() + \brief destructor +*/ +SamSequence::~SamSequence() {} + +/*! \fn void SamSequence::Clear() + \brief Clears all data fields. +*/ +void SamSequence::Clear() +{ + AssemblyID.clear(); + Checksum.clear(); + Length.clear(); + Name.clear(); + Species.clear(); + URI.clear(); + CustomTags.clear(); +} + +/*! \fn bool SamSequence::HasAssemblyID() const + \brief Returns \c true if sequence contains \@SQ AS:\<AssemblyID\> +*/ +bool SamSequence::HasAssemblyID() const +{ + return (!AssemblyID.empty()); +} + +/*! \fn bool SamSequence::HasChecksum() const + \brief Returns \c true if sequence contains \@SQ M5:\<Checksum\> +*/ +bool SamSequence::HasChecksum() const +{ + return (!Checksum.empty()); +} + +/*! 
\fn bool SamSequence::HasLength() const + \brief Returns \c true if sequence contains \@SQ LN:\<Length\> +*/ +bool SamSequence::HasLength() const +{ + return (!Length.empty()); +} + +/*! \fn bool SamSequence::HasName() const + \brief Returns \c true if sequence contains \@SQ SN:\<Name\> +*/ +bool SamSequence::HasName() const +{ + return (!Name.empty()); +} + +/*! \fn bool SamSequence::HasSpecies() const + \brief Returns \c true if sequence contains \@SQ SP:\<Species\> +*/ +bool SamSequence::HasSpecies() const +{ + return (!Species.empty()); +} + +/*! \fn bool SamSequence::HasURI() const + \brief Returns \c true if sequence contains \@SQ UR:\<URI\> +*/ +bool SamSequence::HasURI() const +{ + return (!URI.empty()); +} diff --git a/src/api/SamSequence.h b/src/api/SamSequence.h new file mode 100644 index 0000000..c94a755 --- /dev/null +++ b/src/api/SamSequence.h @@ -0,0 +1,66 @@ +// *************************************************************************** +// SamSequence.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides direct read/write access to the SAM sequence data fields. 
+// *************************************************************************** + +#ifndef SAM_SEQUENCE_H +#define SAM_SEQUENCE_H + +#include <string> +#include <vector> +#include "api/BamAux.h" +#include "api/api_global.h" + +namespace BamTools { + +struct API_EXPORT SamSequence +{ + + // ctor & dtor + SamSequence(); + SamSequence(const std::string& name, const int& length); + SamSequence(const std::string& name, const std::string& length); + SamSequence(const SamSequence& other); + ~SamSequence(); + + // query/modify entire sequence + void Clear(); // clears all contents + + // convenience query methods + bool HasAssemblyID() const; // returns true if sequence has an assembly ID + bool HasChecksum() const; // returns true if sequence has an MD5 checksum + bool HasLength() const; // returns true if sequence has a length + bool HasName() const; // returns true if sequence has a name + bool HasSpecies() const; // returns true if sequence has a species ID + bool HasURI() const; // returns true if sequence has a URI + + // data members + std::string AssemblyID; // AS:<AssemblyID> + std::string Checksum; // M5:<Checksum> + std::string Length; // LN:<Length> *Required for valid SAM header* + std::string Name; // SN:<Name> *Required for valid SAM header* + std::string Species; // SP:<Species> + std::string URI; // UR:<URI> + std::vector<CustomHeaderTag> CustomTags; // optional custom tags +}; + +/*! 
\fn bool operator==(const SamSequence& lhs, const SamSequence& rhs) + \brief tests equality by comparing sequence names, lengths, & checksums (if available) +*/ +API_EXPORT inline bool operator==(const SamSequence& lhs, const SamSequence& rhs) +{ + if (lhs.Name != rhs.Name) return false; + if (lhs.Length != rhs.Length) return false; + if (lhs.HasChecksum() && rhs.HasChecksum()) + return (lhs.Checksum == rhs.Checksum); + else + return true; +} + +} // namespace BamTools + +#endif // SAM_SEQUENCE_H diff --git a/src/api/SamSequenceDictionary.cpp b/src/api/SamSequenceDictionary.cpp new file mode 100644 index 0000000..e38b7d3 --- /dev/null +++ b/src/api/SamSequenceDictionary.cpp @@ -0,0 +1,321 @@ +// *************************************************************************** +// SamSequenceDictionary.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 16 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides methods for operating on a collection of SamSequence entries. +// ************************************************************************* + +#include "api/SamSequenceDictionary.h" +using namespace BamTools; + +#include <cstddef> +#include <iostream> + +/*! \class BamTools::SamSequenceDictionary + \brief Container of SamSequence entries. + + Provides methods for operating on a collection of SamSequence entries. +*/ + +/*! \fn SamSequenceDictionary::SamSequenceDictionary() + \brief constructor +*/ +SamSequenceDictionary::SamSequenceDictionary() {} + +/*! \fn SamSequenceDictionary::SamSequenceDictionary(const SamSequenceDictionary& other) + \brief copy constructor +*/ +SamSequenceDictionary::SamSequenceDictionary(const SamSequenceDictionary& other) + : m_data(other.m_data) + , m_lookupData(other.m_lookupData) +{} + +/*! 
\fn SamSequenceDictionary::~SamSequenceDictionary() + \brief destructor +*/ +SamSequenceDictionary::~SamSequenceDictionary() {} + +/*! \fn void SamSequenceDictionary::Add(const SamSequence& sequence) + \brief Appends a sequence to the dictionary. + + Duplicate entries are silently discarded. + + \param[in] sequence entry to be added +*/ +void SamSequenceDictionary::Add(const SamSequence& sequence) +{ + if (IsEmpty() || !Contains(sequence)) { + m_data.push_back(sequence); + m_lookupData[sequence.Name] = m_data.size() - 1; + } +} + +/*! \fn void SamSequenceDictionary::Add(const std::string& name, const int& length) + \brief Appends a sequence to the dictionary. + + This is an overloaded function. + + \param[in] name name of sequence entry to be added + \param[in] length length of sequence entry to be added + \sa Add() +*/ +void SamSequenceDictionary::Add(const std::string& name, const int& length) +{ + Add(SamSequence(name, length)); +} + +/*! \fn void SamSequenceDictionary::Add(const SamSequenceDictionary& sequences) + \brief Appends another sequence dictionary to this one + + This is an overloaded function. + + \param[in] sequences sequence dictionary to be appended + \sa Add() +*/ +void SamSequenceDictionary::Add(const SamSequenceDictionary& sequences) +{ + SamSequenceConstIterator seqIter = sequences.ConstBegin(); + SamSequenceConstIterator seqEnd = sequences.ConstEnd(); + for (; seqIter != seqEnd; ++seqIter) + Add(*seqIter); +} + +/*! \fn void SamSequenceDictionary::Add(const std::vector<SamSequence>& sequences) + \brief Appends multiple sequences to the dictionary. + + This is an overloaded function. + + \param[in] sequences entries to be added + \sa Add() +*/ +void SamSequenceDictionary::Add(const std::vector<SamSequence>& sequences) +{ + std::vector<SamSequence>::const_iterator seqIter = sequences.begin(); + std::vector<SamSequence>::const_iterator seqEnd = sequences.end(); + for (; seqIter != seqEnd; ++seqIter) + Add(*seqIter); +} + +/*! 
\fn void SamSequenceDictionary::Add(const std::map<std::string, int>& sequenceMap) + \brief Appends multiple sequences to the dictionary. + + This is an overloaded function. + + \param[in] sequenceMap map of sequence entries (name => length) to be added + \sa Add() +*/ +void SamSequenceDictionary::Add(const std::map<std::string, int>& sequenceMap) +{ + std::map<std::string, int>::const_iterator seqIter = sequenceMap.begin(); + std::map<std::string, int>::const_iterator seqEnd = sequenceMap.end(); + for (; seqIter != seqEnd; ++seqIter) { + const std::string& name = (*seqIter).first; + const int& length = (*seqIter).second; + Add(SamSequence(name, length)); + } +} + +/*! \fn SamSequenceIterator SamSequenceDictionary::Begin() + \return an STL iterator pointing to the first sequence + \sa ConstBegin(), End() +*/ +SamSequenceIterator SamSequenceDictionary::Begin() +{ + return m_data.begin(); +} + +/*! \fn SamSequenceConstIterator SamSequenceDictionary::Begin() const + \return an STL const_iterator pointing to the first sequence + + This is an overloaded function. + + \sa ConstBegin(), End() +*/ +SamSequenceConstIterator SamSequenceDictionary::Begin() const +{ + return m_data.begin(); +} + +/*! \fn void SamSequenceDictionary::Clear() + \brief Clears all sequence entries. +*/ +void SamSequenceDictionary::Clear() +{ + m_data.clear(); + m_lookupData.clear(); +} + +/*! \fn SamSequenceConstIterator SamSequenceDictionary::ConstBegin() const + \return an STL const_iterator pointing to the first sequence + \sa Begin(), ConstEnd() +*/ +SamSequenceConstIterator SamSequenceDictionary::ConstBegin() const +{ + return m_data.begin(); +} + +/*! \fn SamSequenceConstIterator SamSequenceDictionary::ConstEnd() const + \return an STL const_iterator pointing to the imaginary entry after the last sequence + \sa End(), ConstBegin() +*/ +SamSequenceConstIterator SamSequenceDictionary::ConstEnd() const +{ + return m_data.end(); +} + +/*! 
\fn bool SamSequenceDictionary::Contains(const std::string& sequenceName) const + \brief Returns true if dictionary contains sequence. + + \param[in] sequenceName search for sequence matching this name + \return \c true if dictionary contains a sequence with this name +*/ +bool SamSequenceDictionary::Contains(const std::string& sequenceName) const +{ + return (m_lookupData.find(sequenceName) != m_lookupData.end()); +} + +/*! \fn bool SamSequenceDictionary::Contains(const SamSequence& sequence) const + \brief Returns true if dictionary contains sequence (matches on name). + + This is an overloaded function. + + \param[in] sequence search for this sequence + \return \c true if dictionary contains sequence (matching on name) +*/ +bool SamSequenceDictionary::Contains(const SamSequence& sequence) const +{ + return Contains(sequence.Name); +} + +/*! \fn SamSequenceIterator SamSequenceDictionary::End() + \return an STL iterator pointing to the imaginary entry after the last sequence + \sa Begin(), ConstEnd() +*/ +SamSequenceIterator SamSequenceDictionary::End() +{ + return m_data.end(); +} + +/*! \fn SamSequenceConstIterator SamSequenceDictionary::End() const + \return an STL const_iterator pointing to the imaginary entry after the last sequence + + This is an overloaded function. + + \sa Begin(), ConstEnd() +*/ +SamSequenceConstIterator SamSequenceDictionary::End() const +{ + return m_data.end(); +} + +/*! \fn bool SamSequenceDictionary::IsEmpty() const + \brief Returns \c true if dictionary contains no sequences + \sa Size() +*/ +bool SamSequenceDictionary::IsEmpty() const +{ + return m_data.empty(); +} + +/*! \fn void SamSequenceDictionary::Remove(const SamSequence& sequence) + \brief Removes sequence from dictionary, if found (matches on name). + + This is an overloaded function. + + \param[in] sequence SamSequence to remove (matching on name) +*/ +void SamSequenceDictionary::Remove(const SamSequence& sequence) +{ + Remove(sequence.Name); +} + +/*! 
\fn void SamSequenceDictionary::Remove(const std::string& sequenceName) + \brief Removes sequence from dictionary, if found. + + \param[in] sequenceName name of sequence to remove + \sa Remove() +*/ +void SamSequenceDictionary::Remove(const std::string& sequenceName) +{ + + // skip if empty dictionary or if name unknown + if (IsEmpty() || !Contains(sequenceName)) return; + + // update 'lookup index' for every entry after @sequenceName + const std::size_t indexToRemove = m_lookupData[sequenceName]; + const std::size_t numEntries = m_data.size(); + for (std::size_t i = indexToRemove + 1; i < numEntries; ++i) { + const SamSequence& sq = m_data.at(i); + --m_lookupData[sq.Name]; + } + + // erase entry from containers + m_data.erase(Begin() + indexToRemove); + m_lookupData.erase(sequenceName); +} + +/*! \fn void SamSequenceDictionary::Remove(const std::vector<SamSequence>& sequences) + \brief Removes multiple sequences from dictionary. + + This is an overloaded function. + + \param[in] sequences sequences to remove + \sa Remove() +*/ +void SamSequenceDictionary::Remove(const std::vector<SamSequence>& sequences) +{ + std::vector<SamSequence>::const_iterator rgIter = sequences.begin(); + std::vector<SamSequence>::const_iterator rgEnd = sequences.end(); + for (; rgIter != rgEnd; ++rgIter) + Remove(*rgIter); +} + +/*! \fn void SamSequenceDictionary::Remove(const std::vector<std::string>& sequenceNames) + \brief Removes multiple sequences from dictionary. + + This is an overloaded function. + + \param[in] sequenceNames names of the sequences to remove + \sa Remove() +*/ +void SamSequenceDictionary::Remove(const std::vector<std::string>& sequenceNames) +{ + std::vector<std::string>::const_iterator rgIter = sequenceNames.begin(); + std::vector<std::string>::const_iterator rgEnd = sequenceNames.end(); + for (; rgIter != rgEnd; ++rgIter) + Remove(*rgIter); +} + +/*! \fn int SamSequenceDictionary::Size() const + \brief Returns number of sequences in dictionary. 
+ \sa IsEmpty() +*/ +int SamSequenceDictionary::Size() const +{ + return m_data.size(); +} + +/*! \fn SamSequence& SamSequenceDictionary::operator[](const std::string& sequenceName) + \brief Retrieves the modifiable SamSequence that matches \a sequenceName. + + \note If the dictionary contains no sequence matching this name, this function inserts + a new one with this name (length:0), and returns a reference to it. If you want to avoid + this insertion behavior, check the result of Contains() before using this operator. + + \param[in] sequenceName name of sequence to retrieve + \return a modifiable reference to the SamSequence associated with the name +*/ +SamSequence& SamSequenceDictionary::operator[](const std::string& sequenceName) +{ + + if (!Contains(sequenceName)) { + SamSequence seq(sequenceName, 0); + m_data.push_back(seq); + m_lookupData[sequenceName] = m_data.size() - 1; + } + + const std::size_t index = m_lookupData[sequenceName]; + return m_data.at(index); +} diff --git a/src/api/SamSequenceDictionary.h b/src/api/SamSequenceDictionary.h new file mode 100644 index 0000000..12375e5 --- /dev/null +++ b/src/api/SamSequenceDictionary.h @@ -0,0 +1,87 @@ +// *************************************************************************** +// SamSequenceDictionary.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 16 October 2011 +// --------------------------------------------------------------------------- +// Provides methods for operating on a collection of SamSequence entries. 
+// *************************************************************************** + +#ifndef SAM_SEQUENCE_DICTIONARY_H +#define SAM_SEQUENCE_DICTIONARY_H + +#include <cstddef> +#include <map> +#include <string> +#include <vector> +#include "api/SamSequence.h" +#include "api/api_global.h" + +namespace BamTools { + +typedef std::vector<SamSequence> SamSequenceContainer; +typedef SamSequenceContainer::iterator SamSequenceIterator; +typedef SamSequenceContainer::const_iterator SamSequenceConstIterator; + +class API_EXPORT SamSequenceDictionary +{ + + // ctor & dtor +public: + SamSequenceDictionary(); + SamSequenceDictionary(const SamSequenceDictionary& other); + ~SamSequenceDictionary(); + + // query/modify sequence data +public: + // adds a sequence + void Add(const SamSequence& sequence); + void Add(const std::string& name, const int& length); + + // adds multiple sequences + void Add(const SamSequenceDictionary& sequences); + void Add(const std::vector<SamSequence>& sequences); + void Add(const std::map<std::string, int>& sequenceMap); + + // clears all sequence entries + void Clear(); + + // returns true if dictionary contains this sequence + bool Contains(const SamSequence& sequence) const; + bool Contains(const std::string& sequenceName) const; + + // returns true if dictionary is empty + bool IsEmpty() const; + + // removes sequence, if found + void Remove(const SamSequence& sequence); + void Remove(const std::string& sequenceName); + + // removes multiple sequences + void Remove(const std::vector<SamSequence>& sequences); + void Remove(const std::vector<std::string>& sequenceNames); + + // returns number of sequences in dictionary + int Size() const; + + // retrieves a modifiable reference to the SamSequence object associated with this name + SamSequence& operator[](const std::string& sequenceName); + + // retrieve STL-compatible iterators +public: + SamSequenceIterator Begin(); // returns iterator to begin() + SamSequenceConstIterator Begin() const; // returns 
const_iterator to begin() + SamSequenceConstIterator ConstBegin() const; // returns const_iterator to begin() + SamSequenceIterator End(); // returns iterator to end() + SamSequenceConstIterator End() const; // returns const_iterator to end() + SamSequenceConstIterator ConstEnd() const; // returns const_iterator to end() + + // data members +private: + SamSequenceContainer m_data; + std::map<std::string, std::size_t> m_lookupData; +}; + +} // namespace BamTools + +#endif // SAM_SEQUENCE_DICTIONARY_H diff --git a/src/api/algorithms/Sort.h b/src/api/algorithms/Sort.h new file mode 100644 index 0000000..e9017cb --- /dev/null +++ b/src/api/algorithms/Sort.h @@ -0,0 +1,364 @@ +// *************************************************************************** +// Sort.h (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 4 April 2012 (DB) +// --------------------------------------------------------------------------- +// Provides sorting functionality. +// *************************************************************************** + +#ifndef ALGORITHMS_SORT_H +#define ALGORITHMS_SORT_H + +#include <algorithm> +#include <cassert> +#include <functional> +#include <string> +#include <vector> +#include "api/BamAlignment.h" +#include "api/BamMultiReader.h" +#include "api/BamReader.h" +#include "api/api_global.h" + +namespace BamTools { +namespace Algorithms { + +/*! \struct BamTools::Algorithms::Sort + \brief Provides classes & methods related to sorting BamAlignments +*/ +struct API_EXPORT Sort +{ + + //! Provides explicit values for specifying desired sort ordering + enum Order + { + AscendingOrder = 0, + DescendingOrder + }; + + /*! 
\fn template<typename ElemType> static inline bool sort_helper(const Sort::Order& order, const ElemType& lhs, const ElemType& rhs) + \internal + + Determines necessary STL function object depending on requested Sort::Order + */ + template <typename ElemType> + static inline bool sort_helper(const Sort::Order& order, const ElemType& lhs, + const ElemType& rhs) + { + switch (order) { + case (Sort::AscendingOrder): { + std::less<ElemType> comp; + return comp(lhs, rhs); + } + case (Sort::DescendingOrder): { + std::greater<ElemType> comp; + return comp(lhs, rhs); + } + default: + BT_ASSERT_UNREACHABLE; + } + return false; // <-- unreachable + } + + //! Base class for our sorting function objects + typedef std::binary_function<BamAlignment, BamAlignment, bool> AlignmentSortBase; + + /*! \struct BamTools::Algorithms::Sort::ByName + \brief Function object for comparing alignments by name + + Default sort order is Sort::AscendingOrder. + + \code + std::vector<BamAlignment> a; + + // sort by name, in ascending order (the following two lines are equivalent): + std::sort( a.begin(), a.end(), Sort::ByName() ); + std::sort( a.begin(), a.end(), Sort::ByName(Sort::AscendingOrder) ); + + // OR sort in descending order + std::sort( a.begin(), a.end(), Sort::ByName(Sort::DescendingOrder) ); + \endcode + */ + struct ByName : public AlignmentSortBase + { + + // ctor + ByName(const Sort::Order& order = Sort::AscendingOrder) + : m_order(order) + {} + + // comparison function + bool operator()(const BamTools::BamAlignment& lhs, const BamTools::BamAlignment& rhs) + { + return sort_helper(m_order, lhs.Name, rhs.Name); + } + + // used by BamMultiReader internals + static inline bool UsesCharData() + { + return true; + } + + // data members + private: + const Sort::Order m_order; + }; + + /*! \struct BamTools::Algorithms::Sort::ByPosition + \brief Function object for comparing alignments by position + + Default sort order is Sort::AscendingOrder. 
+ + \code + std::vector<BamAlignment> a; + + // sort by position, in ascending order (the following two lines are equivalent): + std::sort( a.begin(), a.end(), Sort::ByPosition() ); + std::sort( a.begin(), a.end(), Sort::ByPosition(Sort::AscendingOrder) ); + + // OR sort in descending order + std::sort( a.begin(), a.end(), Sort::ByPosition(Sort::DescendingOrder) ); + \endcode + */ + struct ByPosition : public AlignmentSortBase + { + + // ctor + ByPosition(const Sort::Order& order = Sort::AscendingOrder) + : m_order(order) + {} + + // comparison function + bool operator()(const BamTools::BamAlignment& lhs, const BamTools::BamAlignment& rhs) + { + + // force unmapped aligmnents to end + if (lhs.RefID == -1) return false; + if (rhs.RefID == -1) return true; + + // if on same reference, sort on position + if (lhs.RefID == rhs.RefID) return sort_helper(m_order, lhs.Position, rhs.Position); + + // otherwise sort on reference ID + return sort_helper(m_order, lhs.RefID, rhs.RefID); + } + + // used by BamMultiReader internals + static inline bool UsesCharData() + { + return false; + } + + // data members + private: + const Sort::Order m_order; + }; + + /*! \struct BamTools::Algorithms::Sort::ByTag + \brief Function object for comparing alignments by tag value + + Default sort order is Sort::AscendingOrder. 
+ + \code + std::vector<BamAlignment> a; + + // sort by edit distance, in ascending order (the following two lines are equivalent): + std::sort( a.begin(), a.end(), Sort::ByTag<int>("NM") ); + std::sort( a.begin(), a.end(), Sort::ByTag<int>("NM", Sort::AscendingOrder) ); + + // OR sort in descending order + std::sort( a.begin(), a.end(), Sort::ByTag<int>("NM", Sort::DescendingOrder) ); + \endcode + */ + template <typename T> + struct ByTag : public AlignmentSortBase + { + + // ctor + ByTag(const std::string& tag, const Sort::Order& order = Sort::AscendingOrder) + : m_tag(tag) + , m_order(order) + {} + + // comparison function + bool operator()(const BamTools::BamAlignment& lhs, const BamTools::BamAlignment& rhs) + { + + // force alignments without tag to end + T lhsTagValue; + T rhsTagValue; + if (!lhs.GetTag(m_tag, lhsTagValue)) return false; + if (!rhs.GetTag(m_tag, rhsTagValue)) return true; + + // otherwise compare on tag values + return sort_helper(m_order, lhsTagValue, rhsTagValue); + } + + // used by BamMultiReader internals + static inline bool UsesCharData() + { + return true; + } + + // data members + private: + const std::string m_tag; + const Sort::Order m_order; + }; + + /*! \struct BamTools::Algorithms::Sort::Unsorted + \brief Placeholder function object + + This function object exists purely to allow for dropping a "do not care" ordering + into methods, containers, etc that are designed to work with the other sorting objects. 
+ + \code + std::set<BamAlignment, Sort::ByName>; // STL set, ordered on alignment name + std::set<BamAlignment, Sort::Unsorted>; // STL set, unsorted (but probably insertion order) + \endcode + */ + struct Unsorted : public AlignmentSortBase + { + + // comparison function + inline bool operator()(const BamTools::BamAlignment&, const BamTools::BamAlignment&) + { + return false; // returning false tends to retain insertion order + } + + // used by BamMultiReader internals + static inline bool UsesCharData() + { + return false; + } + }; + + /*! Sorts a std::vector of alignments (in-place), using the provided compare function. + + \code + std::vector<BamAlignemnt> a; + // populate data + + // sort our alignment list by edit distance + Sort::SortAlignments(a, Sort::ByTag<int>("NM")); + \endcode + + \param[in,out] data vector of alignments to be sorted + \param[in] comp comparison function object + */ + template <typename Compare> + static inline void SortAlignments(std::vector<BamAlignment>& data, + const Compare& comp = Compare()) + { + std::sort(data.begin(), data.end(), comp); + } + + /*! Returns a sorted copy of the input alignments, using the provided compare function. + + \code + std::vector<BamAlignemnt> a; + // populate data + + // get a copy of our original data, sorted by edit distance (descending order) + std::vector<BamAligment> sortedData; + sortedData = Sort::SortAlignments(a, Sort::ByTag<int>("NM", Sort::DescendingOrder)); + \endcode + + \param[in] input vector of alignments to be sorted + \param[in] comp comparison function object + \return sorted copy of the input data + */ + template <typename Compare> + static inline std::vector<BamAlignment> SortAlignments(const std::vector<BamAlignment>& input, + const Compare& comp = Compare()) + { + std::vector<BamAlignment> output(input); + SortAlignments(output, comp); + return output; + } + + /*! 
Reads a region of alignments from a position-sorted BAM file, + then sorts by the provided compare function + + \code + BamReader reader; + // open BAM file & index file + + BamRegion region; + // define a region of interest (i.e. a exon or some other feature) + + // get all alignments covering that region, sorted by read group name + std::vector<BamAlignments> a; + a = Sort::GetSortedRegion(reader, region, Sort::ByTag<std::string>("RG")); + \endcode + + \param[in] reader BamReader opened on desired BAM file + \param[in] region desired region-of-interest + \param[in] comp comparison function object + \return sorted vector of the region's alignments + */ + template <typename Compare> + static std::vector<BamAlignment> GetSortedRegion(BamReader& reader, const BamRegion& region, + const Compare& comp = Compare()) + { + // return empty container if unable to find region + if (!reader.IsOpen()) return std::vector<BamAlignment>(); + if (!reader.SetRegion(region)) return std::vector<BamAlignment>(); + + // iterate through region, grabbing alignments + BamAlignment al; + std::vector<BamAlignment> results; + while (reader.GetNextAlignmentCore(al)) + results.push_back(al); + + // sort & return alignments + SortAlignments(results, comp); + return results; + } + + /*! Reads a region of alignments from position-sorted BAM files, + then sorts by the provided compare function + + \code + BamMultiReader reader; + // open BAM files & index files + + BamRegion region; + // define a region of interest (i.e. 
a exon or some other feature) + + // get all alignments covering that region, sorted by read group name + std::vector<BamAlignments> a; + a = Sort::GetSortedRegion(reader, region, Sort::ByTag<std::string>("RG")); + \endcode + + \param[in] reader BamMultiReader opened on desired BAM files + \param[in] region desired region-of-interest + \param[in] comp comparison function object + \return sorted vector of the region's alignments + */ + template <typename Compare> + static std::vector<BamAlignment> GetSortedRegion(BamMultiReader& reader, + const BamRegion& region, + const Compare& comp = Compare()) + { + // return empty container if unable to find region + if (!reader.HasOpenReaders()) return std::vector<BamAlignment>(); + if (!reader.SetRegion(region)) return std::vector<BamAlignment>(); + + // iterate through region, grabbing alignments + BamAlignment al; + std::vector<BamAlignment> results; + while (reader.GetNextAlignmentCore(al)) + results.push_back(al); + + // sort & return alignments + SortAlignments(results, comp); + return results; + } +}; + +} // namespace Algorithms +} // namespace BamTools + +#endif // ALGORITHMS_SORT_H diff --git a/src/api/api_global.h b/src/api/api_global.h new file mode 100644 index 0000000..889f050 --- /dev/null +++ b/src/api/api_global.h @@ -0,0 +1,21 @@ +// *************************************************************************** +// api_global.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides macros for exporting & importing BamTools API library symbols +// *************************************************************************** + +#ifndef API_GLOBAL_H +#define API_GLOBAL_H + +#include "shared/bamtools_global.h" + +#ifdef BAMTOOLS_API_LIBRARY +#define API_EXPORT BAMTOOLS_LIBRARY_EXPORT 
+#else +#define API_EXPORT BAMTOOLS_LIBRARY_IMPORT +#endif + +#endif // API_GLOBAL_H diff --git a/src/api/internal/CMakeLists.txt b/src/api/internal/CMakeLists.txt new file mode 100644 index 0000000..a96cd6f --- /dev/null +++ b/src/api/internal/CMakeLists.txt @@ -0,0 +1,25 @@ +# ========================== +# BamTools CMakeLists.txt +# (c) 2011 Derek Barnett +# +# src/api/internal +# ========================== + +set( InternalDir "internal" ) + +add_subdirectory( bam ) +add_subdirectory( index ) +add_subdirectory( io ) +add_subdirectory( sam ) +add_subdirectory( utils ) + +set( InternalSources + ${InternalBamSources} + ${InternalIndexSources} + ${InternalIOSources} + ${InternalSamSources} + ${InternalUtilsSources} + + PARENT_SCOPE # <-- leave this last + ) + diff --git a/src/api/internal/bam/BamHeader_p.cpp b/src/api/internal/bam/BamHeader_p.cpp new file mode 100644 index 0000000..b97e565 --- /dev/null +++ b/src/api/internal/bam/BamHeader_p.cpp @@ -0,0 +1,132 @@ +// *************************************************************************** +// BamHeader_p.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 18 November 2012 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for handling BAM headers. 
+// *************************************************************************** + +#include "api/internal/bam/BamHeader_p.h" +#include "api/BamAux.h" +#include "api/BamConstants.h" +#include "api/internal/io/BgzfStream_p.h" +#include "api/internal/utils/BamException_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cstddef> +#include <cstdlib> +#include <cstring> + +// ------------------------ +// static utility methods +// ------------------------ + +static inline bool isValidMagicNumber(const char* buffer) +{ + return (strncmp(buffer, Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH) == 0); +} + +// -------------------------- +// BamHeader implementation +// -------------------------- + +// ctor +BamHeader::BamHeader() {} + +// dtor +BamHeader::~BamHeader() {} + +// reads magic number from BGZF stream, returns true if valid +void BamHeader::CheckMagicNumber(BgzfStream* stream) +{ + + // try to read magic number + char buffer[Constants::BAM_HEADER_MAGIC_LENGTH]; + const std::size_t numBytesRead = stream->Read(buffer, Constants::BAM_HEADER_MAGIC_LENGTH); + if (numBytesRead != (int)Constants::BAM_HEADER_MAGIC_LENGTH) + throw BamException("BamHeader::CheckMagicNumber", "could not read magic number"); + + // validate magic number + if (!isValidMagicNumber(buffer)) + throw BamException("BamHeader::CheckMagicNumber", "invalid magic number"); +} + +// clear SamHeader data +void BamHeader::Clear() +{ + m_header.Clear(); +} + +// return true if SamHeader data is valid +bool BamHeader::IsValid() const +{ + return m_header.IsValid(); +} + +// load BAM header ('magic number' and SAM header text) from BGZF stream +void BamHeader::Load(BgzfStream* stream) +{ + + // read & check magic number + CheckMagicNumber(stream); + + // read header (length, then actual text) + uint32_t length(0); + ReadHeaderLength(stream, length); + ReadHeaderText(stream, length); +} + +// reads SAM header text length from BGZF stream, stores it in @length 
+void BamHeader::ReadHeaderLength(BgzfStream* stream, uint32_t& length) +{ + + // read BAM header text length + char buffer[sizeof(uint32_t)]; + const std::size_t numBytesRead = stream->Read(buffer, sizeof(uint32_t)); + if (numBytesRead != sizeof(uint32_t)) + throw BamException("BamHeader::ReadHeaderLength", "could not read header length"); + + // convert char buffer to length + length = BamTools::UnpackUnsignedInt(buffer); + if (BamTools::SystemIsBigEndian()) BamTools::SwapEndian_32(length); +} + +// reads SAM header text from BGZF stream, stores in SamHeader object +void BamHeader::ReadHeaderText(BgzfStream* stream, const uint32_t& length) +{ + + // read header text + char* headerText = (char*)calloc(length + 1, 1); + const std::size_t bytesRead = stream->Read(headerText, length); + + // if error reading, clean up buffer & throw + if (bytesRead != length) { + free(headerText); + throw BamException("BamHeader::ReadHeaderText", "could not read header text"); + } + + // otherwise, text was read OK + // store & cleanup + m_header.SetHeaderText(static_cast<std::string>((const char*)headerText)); + free(headerText); +} + +// returns const-reference to SamHeader data object +const SamHeader& BamHeader::ToConstSamHeader() const +{ + return m_header; +} + +// returns *copy* of SamHeader data object +SamHeader BamHeader::ToSamHeader() const +{ + return m_header; +} + +// returns SAM-formatted string of header data +std::string BamHeader::ToString() const +{ + return m_header.ToString(); +} diff --git a/src/api/internal/bam/BamHeader_p.h b/src/api/internal/bam/BamHeader_p.h new file mode 100644 index 0000000..eed576e --- /dev/null +++ b/src/api/internal/bam/BamHeader_p.h @@ -0,0 +1,72 @@ +// *************************************************************************** +// BamHeader_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 18 November 2012 
(DB)
// ---------------------------------------------------------------------------
// Provides the basic functionality for handling BAM headers.
// ***************************************************************************

#ifndef BAMHEADER_P_H
#define BAMHEADER_P_H

// -------------
// W A R N I N G
// -------------
//
// This file is not part of the BamTools API. It exists purely as an
// implementation detail. This header file may change from version to version
// without notice, or even be removed.
//
// We mean it.

#include <string>
#include "api/SamHeader.h"

namespace BamTools {
namespace Internal {

class BgzfStream;

// Wraps a SamHeader and knows how to populate it from the header section
// of a BGZF-compressed BAM stream.
class BamHeader
{

    // ctor & dtor
public:
    BamHeader();
    ~BamHeader();

    // BamHeader interface
public:
    // clear SamHeader data
    void Clear();
    // return true if SamHeader data is valid
    bool IsValid() const;
    // load BAM header ('magic number' and SAM header text) from BGZF stream
    // (no return value - the declaration is void)
    void Load(BgzfStream* stream);
    // returns (read-only) reference to SamHeader data object
    const SamHeader& ToConstSamHeader() const;
    // returns (editable) copy of SamHeader data object
    SamHeader ToSamHeader() const;
    // returns SAM-formatted string of header data
    std::string ToString() const;

    // internal methods
private:
    // reads magic number from BGZF stream
    void CheckMagicNumber(BgzfStream* stream);
    // reads SAM header length from BGZF stream, stores it in @length
    void ReadHeaderLength(BgzfStream* stream, uint32_t& length);
    // reads SAM header text from BGZF stream, stores in SamHeader object
    void ReadHeaderText(BgzfStream* stream, const uint32_t& length);

    // data members
private:
    SamHeader m_header;
};

}  // namespace Internal
}  // namespace BamTools

#endif  // BAMHEADER_P_H
diff --git a/src/api/internal/bam/BamMultiMerger_p.h b/src/api/internal/bam/BamMultiMerger_p.h new file mode 100644 index 0000000..9835559 --- /dev/null +++
b/src/api/internal/bam/BamMultiMerger_p.h @@ -0,0 +1,278 @@ +// *************************************************************************** +// BamMultiMerger_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides merging functionality for BamMultiReader. At this point, supports +// sorting results by (refId, position) or by read name. +// *************************************************************************** + +#ifndef BAMMULTIMERGER_P_H +#define BAMMULTIMERGER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include <deque> +#include <functional> +#include <set> +#include <string> +#include "api/BamAlignment.h" +#include "api/BamReader.h" +#include "api/algorithms/Sort.h" + +namespace BamTools { +namespace Internal { + +struct MergeItem +{ + + // data members + BamReader* Reader; + BamAlignment* Alignment; + + // ctors & dtor + MergeItem(BamReader* reader = 0, BamAlignment* alignment = 0) + : Reader(reader) + , Alignment(alignment) + {} + + MergeItem(const MergeItem& other) + : Reader(other.Reader) + , Alignment(other.Alignment) + {} + + ~MergeItem() {} +}; + +template <typename Compare> +struct MergeItemSorter : public std::binary_function<MergeItem, MergeItem, bool> +{ + +public: + MergeItemSorter(const Compare& comp = Compare()) + : m_comp(comp) + {} + + bool operator()(const MergeItem& lhs, const MergeItem& rhs) + { + const BamAlignment& l = *lhs.Alignment; + const BamAlignment& r = *rhs.Alignment; + return m_comp(l, r); + } + +private: + Compare m_comp; +}; + +// pure ABC so we can just work 
polymorphically with any specific merger implementation +class IMultiMerger +{ + +public: + IMultiMerger() {} + virtual ~IMultiMerger() {} + +public: + virtual void Add(MergeItem item) = 0; + virtual void Clear() = 0; + virtual const MergeItem& First() const = 0; + virtual bool IsEmpty() const = 0; + virtual void Remove(BamReader* reader) = 0; + virtual int Size() const = 0; + virtual MergeItem TakeFirst() = 0; +}; + +// general merger +template <typename Compare> +class MultiMerger : public IMultiMerger +{ + +public: + typedef Compare CompareType; + typedef MergeItemSorter<CompareType> MergeType; + +public: + explicit MultiMerger(const Compare& comp = Compare()) + : IMultiMerger() + , m_data(MergeType(comp)) + {} + ~MultiMerger() {} + +public: + void Add(MergeItem item); + void Clear(); + const MergeItem& First() const; + bool IsEmpty() const; + void Remove(BamReader* reader); + int Size() const; + MergeItem TakeFirst(); + +private: + typedef MergeItem ValueType; + typedef std::multiset<ValueType, MergeType> ContainerType; + typedef typename ContainerType::iterator DataIterator; + typedef typename ContainerType::const_iterator DataConstIterator; + ContainerType m_data; +}; + +template <typename Compare> +inline void MultiMerger<Compare>::Add(MergeItem item) +{ + + // N.B. 
- any future custom Compare types must define this method + // see algorithms/Sort.h + + if (CompareType::UsesCharData()) item.Alignment->BuildCharData(); + m_data.insert(item); +} + +template <typename Compare> +inline void MultiMerger<Compare>::Clear() +{ + m_data.clear(); +} + +template <typename Compare> +inline const MergeItem& MultiMerger<Compare>::First() const +{ + const ValueType& entry = (*m_data.begin()); + return entry; +} + +template <typename Compare> +inline bool MultiMerger<Compare>::IsEmpty() const +{ + return m_data.empty(); +} +template <typename Compare> +inline void MultiMerger<Compare>::Remove(BamReader* reader) +{ + + if (reader == 0) return; + const std::string& filenameToRemove = reader->GetFilename(); + + // iterate over readers in cache + DataIterator dataIter = m_data.begin(); + DataIterator dataEnd = m_data.end(); + for (; dataIter != dataEnd; ++dataIter) { + const MergeItem& item = (*dataIter); + const BamReader* itemReader = item.Reader; + if (itemReader == 0) continue; + + // remove iterator on match + if (itemReader->GetFilename() == filenameToRemove) { + m_data.erase(dataIter); + return; + } + } +} +template <typename Compare> +inline int MultiMerger<Compare>::Size() const +{ + return m_data.size(); +} + +template <typename Compare> +inline MergeItem MultiMerger<Compare>::TakeFirst() +{ + DataIterator firstIter = m_data.begin(); + MergeItem firstItem = (*firstIter); + m_data.erase(firstIter); + return firstItem; +} + +// unsorted "merger" +template <> +class MultiMerger<Algorithms::Sort::Unsorted> : public IMultiMerger +{ + +public: + explicit MultiMerger(const Algorithms::Sort::Unsorted& comp = Algorithms::Sort::Unsorted()) + : IMultiMerger() + {} + ~MultiMerger() {} + +public: + void Add(MergeItem item); + void Clear(); + const MergeItem& First() const; + bool IsEmpty() const; + void Remove(BamReader* reader); + int Size() const; + MergeItem TakeFirst(); + +private: + typedef MergeItem ValueType; + typedef std::deque<ValueType> 
ContainerType; + typedef ContainerType::iterator DataIterator; + typedef ContainerType::const_iterator DataConstIterator; + ContainerType m_data; +}; + +inline void MultiMerger<Algorithms::Sort::Unsorted>::Add(MergeItem item) +{ + m_data.push_back(item); +} + +inline void MultiMerger<Algorithms::Sort::Unsorted>::Clear() +{ + m_data.clear(); +} + +inline const MergeItem& MultiMerger<Algorithms::Sort::Unsorted>::First() const +{ + return m_data.front(); +} + +inline bool MultiMerger<Algorithms::Sort::Unsorted>::IsEmpty() const +{ + return m_data.empty(); +} + +inline void MultiMerger<Algorithms::Sort::Unsorted>::Remove(BamReader* reader) +{ + + if (reader == 0) return; + const std::string filenameToRemove = reader->GetFilename(); + + // iterate over readers in cache + DataIterator dataIter = m_data.begin(); + DataIterator dataEnd = m_data.end(); + for (; dataIter != dataEnd; ++dataIter) { + const MergeItem& item = (*dataIter); + const BamReader* itemReader = item.Reader; + if (itemReader == 0) continue; + + // remove iterator on match + if (itemReader->GetFilename() == filenameToRemove) { + m_data.erase(dataIter); + return; + } + } +} + +inline int MultiMerger<Algorithms::Sort::Unsorted>::Size() const +{ + return m_data.size(); +} + +inline MergeItem MultiMerger<Algorithms::Sort::Unsorted>::TakeFirst() +{ + MergeItem firstItem = m_data.front(); + m_data.pop_front(); + return firstItem; +} + +} // namespace Internal +} // namespace BamTools + +#endif // BAMMULTIMERGER_P_H diff --git a/src/api/internal/bam/BamMultiReader_p.cpp b/src/api/internal/bam/BamMultiReader_p.cpp new file mode 100644 index 0000000..a99fac1 --- /dev/null +++ b/src/api/internal/bam/BamMultiReader_p.cpp @@ -0,0 +1,905 @@ +// *************************************************************************** +// BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// 
Last modified: 24 July 2013 (DB) +// --------------------------------------------------------------------------- +// Functionality for simultaneously reading multiple BAM files +// ************************************************************************* + +#include "api/internal/bam/BamMultiReader_p.h" +#include "api/BamAlignment.h" +#include "api/BamMultiReader.h" +#include "api/SamConstants.h" +#include "api/algorithms/Sort.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <algorithm> +#include <cstddef> +#include <fstream> +#include <iostream> +#include <iterator> +#include <sstream> + +// ctor +BamMultiReaderPrivate::BamMultiReaderPrivate() + : m_alignmentCache(0) + , m_hasUserMergeOrder(false) + , m_mergeOrder(BamMultiReader::RoundRobinMerge) +{} + +// dtor +BamMultiReaderPrivate::~BamMultiReaderPrivate() +{ + Close(); +} + +// close all BAM files +bool BamMultiReaderPrivate::Close() +{ + + m_errorString.clear(); + + if (CloseFiles(Filenames())) + return true; + else { + const std::string currentError = m_errorString; + const std::string message = + std::string("error encountered while closing all files: \n\t") + currentError; + SetErrorString("BamMultiReader::Close", message); + return false; + } +} + +// close requested BAM file +bool BamMultiReaderPrivate::CloseFile(const std::string& filename) +{ + + m_errorString.clear(); + + std::vector<std::string> filenames(1, filename); + if (CloseFiles(filenames)) + return true; + else { + const std::string currentError = m_errorString; + const std::string message = + std::string("error while closing file: ") + filename + '\n' + currentError; + SetErrorString("BamMultiReader::CloseFile", message); + return false; + } +} + +// close requested BAM files +bool BamMultiReaderPrivate::CloseFiles(const std::vector<std::string>& filenames) +{ + + bool errorsEncountered = false; + m_errorString.clear(); + + // iterate over filenames + std::vector<std::string>::const_iterator filesIter = 
filenames.begin(); + std::vector<std::string>::const_iterator filesEnd = filenames.end(); + for (; filesIter != filesEnd; ++filesIter) { + const std::string& filename = (*filesIter); + if (filename.empty()) continue; + + // iterate over readers + std::vector<MergeItem>::iterator readerIter = m_readers.begin(); + std::vector<MergeItem>::iterator readerEnd = m_readers.end(); + for (; readerIter != readerEnd; ++readerIter) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + if (reader == 0) continue; + + // if reader matches requested filename + if (reader->GetFilename() == filename) { + + // remove reader's entry from alignment cache + m_alignmentCache->Remove(reader); + + // clean up reader & its alignment + if (!reader->Close()) { + m_errorString.append(1, '\t'); + m_errorString.append(reader->GetErrorString()); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } + delete reader; + reader = 0; + + // delete reader's alignment entry + BamAlignment* alignment = item.Alignment; + delete alignment; + alignment = 0; + + // remove reader from reader list + m_readers.erase(readerIter); + + // on match, just go on to next filename + // (no need to keep looking and item iterator is invalid now anyway) + break; + } + } + } + + // make sure we clean up properly if all readers were closed + if (m_readers.empty()) { + + // clean up merger + if (m_alignmentCache) { + m_alignmentCache->Clear(); + delete m_alignmentCache; + m_alignmentCache = 0; + } + + // reset merge flags + m_hasUserMergeOrder = false; + m_mergeOrder = BamMultiReader::RoundRobinMerge; + } + + // return whether all readers closed OK + return !errorsEncountered; +} + +// creates index files for BAM files that don't have them +bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) +{ + + bool errorsEncountered = false; + m_errorString.clear(); + + // iterate over readers + std::vector<MergeItem>::iterator itemIter = m_readers.begin(); + 
std::vector<MergeItem>::iterator itemEnd = m_readers.end(); + for (; itemIter != itemEnd; ++itemIter) { + MergeItem& item = (*itemIter); + BamReader* reader = item.Reader; + if (reader == 0) continue; + + // if reader doesn't have an index, create one + if (!reader->HasIndex()) { + if (!reader->CreateIndex(type)) { + m_errorString.append(1, '\t'); + m_errorString.append(reader->GetErrorString()); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } + } + } + + // check for errors encountered before returning success/fail + if (errorsEncountered) { + const std::string currentError = m_errorString; + const std::string message = + std::string("error while creating index files: \n") + currentError; + SetErrorString("BamMultiReader::CreateIndexes", message); + return false; + } else + return true; +} + +IMultiMerger* BamMultiReaderPrivate::CreateAlignmentCache() +{ + + // if no merge order set explicitly, use SAM header to lookup proper order + if (!m_hasUserMergeOrder) { + + // fetch SamHeader from BAM files + SamHeader header = GetHeader(); + + // if BAM files are sorted by position + if (header.SortOrder == Constants::SAM_HD_SORTORDER_COORDINATE) + m_mergeOrder = BamMultiReader::MergeByCoordinate; + + // if BAM files are sorted by read name + else if (header.SortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME) + m_mergeOrder = BamMultiReader::MergeByName; + + // otherwise, sorting is either "unknown" or marked as "unsorted" + else + m_mergeOrder = BamMultiReader::RoundRobinMerge; + } + + // use current merge order to create proper 'multi-merger' + switch (m_mergeOrder) { + + // merge BAM files by position + case BamMultiReader::MergeByCoordinate: + return new MultiMerger<Algorithms::Sort::ByPosition>(); + + // merge BAM files by read name + case BamMultiReader::MergeByName: + return new MultiMerger<Algorithms::Sort::ByName>(); + + // sorting is "unknown", "unsorted" or "ignored"... 
so use unsorted merger + case BamMultiReader::RoundRobinMerge: + return new MultiMerger<Algorithms::Sort::Unsorted>(); + + // unknown merge order, can't create merger + default: + return 0; + } +} + +const std::vector<std::string> BamMultiReaderPrivate::Filenames() const +{ + + // init filename container + std::vector<std::string> filenames; + filenames.reserve(m_readers.size()); + + // iterate over readers + std::vector<MergeItem>::const_iterator itemIter = m_readers.begin(); + std::vector<MergeItem>::const_iterator itemEnd = m_readers.end(); + for (; itemIter != itemEnd; ++itemIter) { + const MergeItem& item = (*itemIter); + const BamReader* reader = item.Reader; + if (reader == 0) continue; + + // store filename if not empty + const std::string& filename = reader->GetFilename(); + if (!filename.empty()) filenames.push_back(filename); + } + + // return result + return filenames; +} + +std::string BamMultiReaderPrivate::GetErrorString() const +{ + return m_errorString; +} + +SamHeader BamMultiReaderPrivate::GetHeader() const +{ + const std::string& text = GetHeaderText(); + return SamHeader(text); +} + +// makes a virtual, unified header for all the bam files in the multireader +std::string BamMultiReaderPrivate::GetHeaderText() const +{ + + // N.B. 
- right now, simply copies all header data from first BAM, + // and then appends RG's from other BAM files + // TODO: make this more intelligent wrt other header lines/fields + + // if no readers open + const std::size_t numReaders = m_readers.size(); + if (numReaders == 0) return std::string(); + + // retrieve first reader's header + const MergeItem& firstItem = m_readers.front(); + const BamReader* reader = firstItem.Reader; + if (reader == 0) return std::string(); + SamHeader mergedHeader = reader->GetHeader(); + + // iterate over any remaining readers (skipping the first) + for (std::size_t i = 1; i < numReaders; ++i) { + const MergeItem& item = m_readers.at(i); + const BamReader* reader = item.Reader; + if (reader == 0) continue; + + // retrieve current reader's header + const SamHeader currentHeader = reader->GetHeader(); + + // append current reader's RG entries to merged header + // N.B. - SamReadGroupDictionary handles duplicate-checking + mergedHeader.ReadGroups.Add(currentHeader.ReadGroups); + + // TODO: merge anything else?? + } + + // return stringified header + return mergedHeader.ToString(); +} + +BamMultiReader::MergeOrder BamMultiReaderPrivate::GetMergeOrder() const +{ + return m_mergeOrder; +} + +// get next alignment among all files +bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) +{ + return PopNextCachedAlignment(al, true); +} + +// get next alignment among all files without parsing character data from alignments +bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) +{ + return PopNextCachedAlignment(al, false); +} + +// --------------------------------------------------------------------------------------- +// +// NB: The following GetReferenceX() functions assume that we have identical +// references for all BAM files. 
We enforce this by invoking the +// ValidateReaders() method to verify that our reference data is the same +// across all files on Open - so we will not encounter a situation in which +// there is a mismatch and we are still live. +// +// --------------------------------------------------------------------------------------- + +// returns the number of reference sequences +int BamMultiReaderPrivate::GetReferenceCount() const +{ + + // handle empty multireader + if (m_readers.empty()) return 0; + + // return reference count from first reader + const MergeItem& item = m_readers.front(); + const BamReader* reader = item.Reader; + if (reader == 0) + return 0; + else + return reader->GetReferenceCount(); +} + +// returns vector of reference objects +const RefVector BamMultiReaderPrivate::GetReferenceData() const +{ + + // handle empty multireader + if (m_readers.empty()) return RefVector(); + + // return reference data from first BamReader + const MergeItem& item = m_readers.front(); + const BamReader* reader = item.Reader; + if (reader == 0) + return RefVector(); + else + return reader->GetReferenceData(); +} + +// returns refID from reference name +int BamMultiReaderPrivate::GetReferenceID(const std::string& refName) const +{ + + // handle empty multireader + if (m_readers.empty()) return -1; + + // return reference ID from first BamReader + const MergeItem& item = m_readers.front(); + const BamReader* reader = item.Reader; + if (reader == 0) + return -1; + else + return reader->GetReferenceID(refName); +} +// --------------------------------------------------------------------------------------- + +// returns true if all readers have index data available +// this is useful to indicate whether Jump() or SetRegion() are possible +bool BamMultiReaderPrivate::HasIndexes() const +{ + + // handle empty multireader + if (m_readers.empty()) return false; + + bool result = true; + + // iterate over readers + std::vector<MergeItem>::const_iterator readerIter = 
m_readers.begin(); + std::vector<MergeItem>::const_iterator readerEnd = m_readers.end(); + for (; readerIter != readerEnd; ++readerIter) { + const MergeItem& item = (*readerIter); + const BamReader* reader = item.Reader; + if (reader == 0) continue; + + // see if current reader has index data + result &= reader->HasIndex(); + } + + return result; +} + +// returns true if multireader has open readers +bool BamMultiReaderPrivate::HasOpenReaders() +{ + + // iterate over readers + std::vector<MergeItem>::const_iterator readerIter = m_readers.begin(); + std::vector<MergeItem>::const_iterator readerEnd = m_readers.end(); + for (; readerIter != readerEnd; ++readerIter) { + const MergeItem& item = (*readerIter); + const BamReader* reader = item.Reader; + if (reader == 0) continue; + + // return true whenever an open reader is found + if (reader->IsOpen()) return true; + } + + // no readers open + return false; +} + +// performs random-access jump using (refID, position) as a left-bound +bool BamMultiReaderPrivate::Jump(int refID, int position) +{ + + // NB: While it may make sense to track readers in which we can + // successfully Jump, in practice a failure of Jump means "no + // alignments here." It makes sense to simply accept the failure, + // UpdateAlignments(), and continue. 
+ + // iterate over readers + std::vector<MergeItem>::iterator readerIter = m_readers.begin(); + std::vector<MergeItem>::iterator readerEnd = m_readers.end(); + for (; readerIter != readerEnd; ++readerIter) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + if (reader == 0) continue; + + // jump in each BamReader to position of interest + reader->Jump(refID, position); + } + + // returns status of cache update + return UpdateAlignmentCache(); +} + +// locate (& load) index files for BAM readers that don't already have one loaded +bool BamMultiReaderPrivate::LocateIndexes(const BamIndex::IndexType& preferredType) +{ + + bool errorsEncountered = false; + m_errorString.clear(); + + // iterate over readers + std::vector<MergeItem>::iterator readerIter = m_readers.begin(); + std::vector<MergeItem>::iterator readerEnd = m_readers.end(); + for (; readerIter != readerEnd; ++readerIter) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + if (reader == 0) continue; + + // if reader has no index, try to locate one + if (!reader->HasIndex()) { + if (!reader->LocateIndex(preferredType)) { + m_errorString.append(1, '\t'); + m_errorString.append(reader->GetErrorString()); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } + } + } + + // check for errors encountered before returning success/fail + if (errorsEncountered) { + const std::string currentError = m_errorString; + const std::string message = + std::string("error while locating index files: \n") + currentError; + SetErrorString("BamMultiReader::LocatingIndexes", message); + return false; + } else + return true; +} + +// opens BAM files +bool BamMultiReaderPrivate::Open(const std::vector<std::string>& filenames) +{ + + m_errorString.clear(); + + // put all current readers back at beginning (refreshes alignment cache) + if (!Rewind()) { + const std::string currentError = m_errorString; + const std::string message = + std::string("unable to rewind existing readers: 
\n\t") + currentError; + SetErrorString("BamMultiReader::Open", message); + return false; + } + + // iterate over filenames + bool errorsEncountered = false; + std::vector<std::string>::const_iterator filenameIter = filenames.begin(); + std::vector<std::string>::const_iterator filenameEnd = filenames.end(); + for (; filenameIter != filenameEnd; ++filenameIter) { + const std::string& filename = (*filenameIter); + if (filename.empty()) continue; + + // attempt to open BamReader + BamReader* reader = new BamReader; + const bool readerOpened = reader->Open(filename); + + // if opened OK, store it + if (readerOpened) m_readers.push_back(MergeItem(reader, new BamAlignment)); + + // otherwise store error & clean up invalid reader + else { + m_errorString.append(1, '\t'); + m_errorString += std::string("unable to open file: ") + filename; + m_errorString.append(1, '\n'); + errorsEncountered = true; + + delete reader; + reader = 0; + } + } + + // check for errors while opening + if (errorsEncountered) { + const std::string currentError = m_errorString; + const std::string message = std::string("unable to open all files: \t\n") + currentError; + SetErrorString("BamMultiReader::Open", message); + return false; + } + + // check for BAM file consistency + if (!ValidateReaders()) { + const std::string currentError = m_errorString; + const std::string message = + std::string("unable to open inconsistent files: \t\n") + currentError; + SetErrorString("BamMultiReader::Open", message); + return false; + } + + // update alignment cache + return UpdateAlignmentCache(); +} + +bool BamMultiReaderPrivate::OpenFile(const std::string& filename) +{ + std::vector<std::string> filenames(1, filename); + if (Open(filenames)) + return true; + else { + const std::string currentError = m_errorString; + const std::string message = + std::string("could not open file: ") + filename + "\n\t" + currentError; + SetErrorString("BamMultiReader::OpenFile", message); + return false; + } +} + +bool 
BamMultiReaderPrivate::OpenIndexes(const std::vector<std::string>& indexFilenames) +{ + + // TODO: This needs to be cleaner - should not assume same order. + // And either way, shouldn't start at first reader. Should start at + // first reader without an index? + + // make sure same number of index filenames as readers + if (m_readers.size() != indexFilenames.size()) { + const std::string message("size of index file list does not match current BAM file count"); + SetErrorString("BamMultiReader::OpenIndexes", message); + return false; + } + + bool errorsEncountered = false; + m_errorString.clear(); + + // iterate over BamReaders + std::vector<std::string>::const_iterator indexFilenameIter = indexFilenames.begin(); + std::vector<std::string>::const_iterator indexFilenameEnd = indexFilenames.end(); + std::vector<MergeItem>::iterator readerIter = m_readers.begin(); + std::vector<MergeItem>::iterator readerEnd = m_readers.end(); + for (; readerIter != readerEnd; ++readerIter) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + + // open index filename on reader + if (reader) { + const std::string& indexFilename = (*indexFilenameIter); + if (!reader->OpenIndex(indexFilename)) { + m_errorString.append(1, '\t'); + m_errorString += reader->GetErrorString(); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } + } + + // increment filename iterator, skip if no more index files to open + if (++indexFilenameIter == indexFilenameEnd) break; + } + + // return success/fail + if (errorsEncountered) { + const std::string currentError = m_errorString; + const std::string message = + std::string("could not open all index files: \n\t") + currentError; + SetErrorString("BamMultiReader::OpenIndexes", message); + return false; + } else + return true; +} + +bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool needCharData) +{ + + // skip if no alignments available + if (m_alignmentCache == 0 || m_alignmentCache->IsEmpty()) 
return false; + + // pop next merge item entry from cache + MergeItem item = m_alignmentCache->TakeFirst(); + BamReader* reader = item.Reader; + BamAlignment* alignment = item.Alignment; + if (reader == 0 || alignment == 0) return false; + + // set char data if requested + if (needCharData) { + alignment->BuildCharData(); + alignment->Filename = reader->GetFilename(); + } + + // store cached alignment into destination parameter (by copy) + al = *alignment; + + // load next alignment from reader & store in cache + SaveNextAlignment(reader, alignment); + return true; +} + +// returns BAM file pointers to beginning of alignment data & resets alignment cache +bool BamMultiReaderPrivate::Rewind() +{ + + // skip if no readers open + if (m_readers.empty()) return true; + + // attempt to rewind files + if (!RewindReaders()) { + const std::string currentError = m_errorString; + const std::string message = std::string("could not rewind readers: \n\t") + currentError; + SetErrorString("BamMultiReader::Rewind", message); + return false; + } + + // return status of cache update + return UpdateAlignmentCache(); +} + +// returns BAM file pointers to beginning of alignment data +bool BamMultiReaderPrivate::RewindReaders() +{ + + m_errorString.clear(); + bool errorsEncountered = false; + + // iterate over readers + std::vector<MergeItem>::iterator readerIter = m_readers.begin(); + std::vector<MergeItem>::iterator readerEnd = m_readers.end(); + for (; readerIter != readerEnd; ++readerIter) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + if (reader == 0) continue; + + // attempt rewind on BamReader + if (!reader->Rewind()) { + m_errorString.append(1, '\t'); + m_errorString.append(reader->GetErrorString()); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } + } + + return !errorsEncountered; +} + +void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment) +{ + + // if can read alignment from reader, store in cache 
+ // + // N.B. - lazy building of alignment's char data - populated only: + // automatically by alignment cache to maintain its sorting OR + // on demand from client call to future call to GetNextAlignment() + + if (reader->GetNextAlignmentCore(*alignment)) + m_alignmentCache->Add(MergeItem(reader, alignment)); +} + +bool BamMultiReaderPrivate::SetExplicitMergeOrder(BamMultiReader::MergeOrder order) +{ + + // set new merge flags + m_hasUserMergeOrder = true; + m_mergeOrder = order; + + // remove any existing merger (storing any existing data sitting in the cache) + std::vector<MergeItem> currentCacheData; + if (m_alignmentCache) { + while (!m_alignmentCache->IsEmpty()) + currentCacheData.push_back(m_alignmentCache->TakeFirst()); + delete m_alignmentCache; + m_alignmentCache = 0; + } + + // create new cache using the new merge flags + m_alignmentCache = CreateAlignmentCache(); + if (m_alignmentCache == 0) { + SetErrorString("BamMultiReader::SetExplicitMergeOrder", "requested order is unrecognized"); + return false; + } + + // push current data onto new cache + std::vector<MergeItem>::const_iterator readerIter = currentCacheData.begin(); + std::vector<MergeItem>::const_iterator readerEnd = currentCacheData.end(); + for (; readerIter != readerEnd; ++readerIter) { + const MergeItem& item = (*readerIter); + m_alignmentCache->Add(item); + } + + // return success + return true; +} + +void BamMultiReaderPrivate::SetErrorString(const std::string& where, const std::string& what) const +{ + static const std::string SEPARATOR(": "); + m_errorString = where + SEPARATOR + what; +} + +bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) +{ + + // NB: While it may make sense to track readers in which we can + // successfully SetRegion, In practice a failure of SetRegion means "no + // alignments here." It makes sense to simply accept the failure, + // UpdateAlignments(), and continue. 
+ + // iterate over alignments + std::vector<MergeItem>::iterator readerIter = m_readers.begin(); + std::vector<MergeItem>::iterator readerEnd = m_readers.end(); + for (; readerIter != readerEnd; ++readerIter) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + if (reader == 0) continue; + + // set region of interest + reader->SetRegion(region); + } + + // return status of cache update + return UpdateAlignmentCache(); +} + +// updates our alignment cache +bool BamMultiReaderPrivate::UpdateAlignmentCache() +{ + + // create alignment cache if not created yet + if (m_alignmentCache == 0) { + m_alignmentCache = CreateAlignmentCache(); + if (m_alignmentCache == 0) { + SetErrorString("BamMultiReader::UpdateAlignmentCache", + "unable to create new alignment cache"); + return false; + } + } + + // clear any prior cache data + m_alignmentCache->Clear(); + + // iterate over readers + std::vector<MergeItem>::iterator readerIter = m_readers.begin(); + std::vector<MergeItem>::iterator readerEnd = m_readers.end(); + for (; readerIter != readerEnd; ++readerIter) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + BamAlignment* alignment = item.Alignment; + if (reader == 0 || alignment == 0) continue; + + // save next alignment from each reader in cache + SaveNextAlignment(reader, alignment); + } + + // if we get here, ok + return true; +} + +// ValidateReaders checks that all the readers point to BAM files representing +// alignments against the same set of reference sequences, and that the +// sequences are identically ordered. If these checks fail the operation of +// the multireader is undefined, so we force program exit. 
+bool BamMultiReaderPrivate::ValidateReaders() const +{ + + m_errorString.clear(); + + // skip if 0 or 1 readers opened + if (m_readers.empty() || (m_readers.size() == 1)) return true; + + // retrieve first reader + const MergeItem& firstItem = m_readers.front(); + const BamReader* firstReader = firstItem.Reader; + if (firstReader == 0) return false; + + // retrieve first reader's header data + const SamHeader& firstReaderHeader = firstReader->GetHeader(); + const std::string& firstReaderSortOrder = firstReaderHeader.SortOrder; + + // retrieve first reader's reference data + const RefVector& firstReaderRefData = firstReader->GetReferenceData(); + const int firstReaderRefCount = firstReader->GetReferenceCount(); + const int firstReaderRefSize = firstReaderRefData.size(); + + // iterate over all readers + std::vector<MergeItem>::const_iterator readerIter = m_readers.begin(); + std::vector<MergeItem>::const_iterator readerEnd = m_readers.end(); + for (; readerIter != readerEnd; ++readerIter) { + const MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + if (reader == 0) continue; + + // get current reader's header data + const SamHeader& currentReaderHeader = reader->GetHeader(); + const std::string& currentReaderSortOrder = currentReaderHeader.SortOrder; + + // check compatible sort order + if (currentReaderSortOrder != firstReaderSortOrder) { + const std::string message = + std::string("mismatched sort order in ") + reader->GetFilename() + ", expected " + + firstReaderSortOrder + ", but found " + currentReaderSortOrder; + SetErrorString("BamMultiReader::ValidateReaders", message); + return false; + } + + // get current reader's reference data + const RefVector currentReaderRefData = reader->GetReferenceData(); + const int currentReaderRefCount = reader->GetReferenceCount(); + const int currentReaderRefSize = currentReaderRefData.size(); + + // init reference data iterators + RefVector::const_iterator firstRefIter = firstReaderRefData.begin(); + 
RefVector::const_iterator firstRefEnd = firstReaderRefData.end(); + RefVector::const_iterator currentRefIter = currentReaderRefData.begin(); + + // compare reference counts from BamReader ( & container size, in case of BR error) + if ((currentReaderRefCount != firstReaderRefCount) || + (firstReaderRefSize != currentReaderRefSize)) { + std::stringstream s; + s << "mismatched reference count in " << reader->GetFilename() << ", expected " + << firstReaderRefCount << ", but found " << currentReaderRefCount; + SetErrorString("BamMultiReader::ValidateReaders", s.str()); + return false; + } + + // this will be ok; we just checked above that we have identically-sized sets of references + // here we simply check if they are all, in fact, equal in content + while (firstRefIter != firstRefEnd) { + const RefData& firstRef = (*firstRefIter); + const RefData& currentRef = (*currentRefIter); + + // compare reference name & length + if ((firstRef.RefName != currentRef.RefName) || + (firstRef.RefLength != currentRef.RefLength)) { + std::stringstream s; + s << "mismatched references found in" << reader->GetFilename() + << "expected: " << std::endl; + + // print first reader's reference data + RefVector::const_iterator refIter = firstReaderRefData.begin(); + RefVector::const_iterator refEnd = firstReaderRefData.end(); + for (; refIter != refEnd; ++refIter) { + const RefData& entry = (*refIter); + std::stringstream s; + s << entry.RefName << ' ' << std::endl; + } + + s << "but found: " << std::endl; + + // print current reader's reference data + refIter = currentReaderRefData.begin(); + refEnd = currentReaderRefData.end(); + for (; refIter != refEnd; ++refIter) { + const RefData& entry = (*refIter); + s << entry.RefName << ' ' << entry.RefLength << std::endl; + } + + SetErrorString("BamMultiReader::ValidateReaders", s.str()); + return false; + } + + // update iterators + ++firstRefIter; + ++currentRefIter; + } + } + + // if we get here, everything checks out + return true; +} diff 
--git a/src/api/internal/bam/BamMultiReader_p.h b/src/api/internal/bam/BamMultiReader_p.h new file mode 100644 index 0000000..aa661cd --- /dev/null +++ b/src/api/internal/bam/BamMultiReader_p.h @@ -0,0 +1,104 @@ +// *************************************************************************** +// BamMultiReader_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 14 January 2013 (DB) +// --------------------------------------------------------------------------- +// Functionality for simultaneously reading multiple BAM files +// ************************************************************************* + +#ifndef BAMMULTIREADER_P_H +#define BAMMULTIREADER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include <string> +#include <vector> +#include "api/BamMultiReader.h" +#include "api/SamHeader.h" +#include "api/internal/bam/BamMultiMerger_p.h" + +namespace BamTools { +namespace Internal { + +class BamMultiReaderPrivate +{ + + // typedefs +public: + typedef std::pair<BamReader*, BamAlignment*> ReaderAlignment; + + // constructor / destructor +public: + BamMultiReaderPrivate(); + ~BamMultiReaderPrivate(); + + // public interface +public: + // file operations + bool Close(); + bool CloseFile(const std::string& filename); + const std::vector<std::string> Filenames() const; + bool Jump(int refID, int position = 0); + bool Open(const std::vector<std::string>& filenames); + bool OpenFile(const std::string& filename); + bool Rewind(); + bool SetRegion(const BamRegion& region); + + // access alignment data + BamMultiReader::MergeOrder GetMergeOrder() const; + bool GetNextAlignment(BamAlignment& al); + bool GetNextAlignmentCore(BamAlignment& al); + bool HasOpenReaders(); + bool SetExplicitMergeOrder(BamMultiReader::MergeOrder order); + + // access auxiliary data + SamHeader GetHeader() const; + std::string GetHeaderText() const; + int GetReferenceCount() const; + const BamTools::RefVector GetReferenceData() const; + int GetReferenceID(const std::string& refName) const; + + // BAM index operations + bool CreateIndexes(const BamIndex::IndexType& type = BamIndex::STANDARD); + bool HasIndexes() const; + bool LocateIndexes(const BamIndex::IndexType& preferredType = BamIndex::STANDARD); + bool OpenIndexes(const std::vector<std::string>& indexFilenames); + + // error handling + std::string GetErrorString() const; + + // 'internal' methods +public: + bool CloseFiles(const std::vector<std::string>& filenames); + IMultiMerger* CreateAlignmentCache(); + bool PopNextCachedAlignment(BamAlignment& al, const bool needCharData); + bool RewindReaders(); + void SaveNextAlignment(BamReader* reader, BamAlignment* alignment); + void SetErrorString(const std::string& where, const 
std::string& what) const; // + bool UpdateAlignmentCache(); + bool ValidateReaders() const; + + // data members +public: + std::vector<MergeItem> m_readers; + IMultiMerger* m_alignmentCache; + + bool m_hasUserMergeOrder; + BamMultiReader::MergeOrder m_mergeOrder; + + mutable std::string m_errorString; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMMULTIREADER_P_H diff --git a/src/api/internal/bam/BamRandomAccessController_p.cpp b/src/api/internal/bam/BamRandomAccessController_p.cpp new file mode 100644 index 0000000..5b5bc58 --- /dev/null +++ b/src/api/internal/bam/BamRandomAccessController_p.cpp @@ -0,0 +1,302 @@ +// *************************************************************************** +// BamRandomAccessController_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 25 October 2011(DB) +// --------------------------------------------------------------------------- +// Manages random access operations in a BAM file +// ************************************************************************** + +#include "api/internal/bam/BamRandomAccessController_p.h" +#include "api/BamIndex.h" +#include "api/internal/bam/BamReader_p.h" +#include "api/internal/index/BamIndexFactory_p.h" +#include "api/internal/utils/BamException_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cassert> +#include <sstream> + +BamRandomAccessController::BamRandomAccessController() + : m_index(0) + , m_hasAlignmentsInRegion(true) +{} + +BamRandomAccessController::~BamRandomAccessController() +{ + Close(); +} + +void BamRandomAccessController::AdjustRegion(const int& referenceCount) +{ + + // skip if no index available + if (m_index == 0) return; + + // see if any references in region have alignments + m_hasAlignmentsInRegion = false; + int currentId = m_region.LeftRefID; + const int rightBoundRefId = + 
(m_region.isRightBoundSpecified() ? m_region.RightRefID : referenceCount - 1); + while (currentId <= rightBoundRefId) { + m_hasAlignmentsInRegion = m_index->HasAlignments(currentId); + if (m_hasAlignmentsInRegion) break; + ++currentId; + } + + // if no data found on any reference in region + if (!m_hasAlignmentsInRegion) return; + + // if left bound of desired region had no data, use first reference that had data + // otherwise, leave requested region as-is + if (currentId != m_region.LeftRefID) { + m_region.LeftRefID = currentId; + m_region.LeftPosition = 0; + } +} + +// returns alignments' "RegionState": { Before|Overlaps|After } current region +BamRandomAccessController::RegionState BamRandomAccessController::AlignmentState( + const BamAlignment& alignment) const +{ + + // if region has no left bound at all + if (!m_region.isLeftBoundSpecified()) return OverlapsRegion; + + // handle unmapped reads - return AFTER region to halt processing + if (alignment.RefID == -1) return AfterRegion; + + // if alignment is on any reference before left bound reference + if (alignment.RefID < m_region.LeftRefID) return BeforeRegion; + + // if alignment is on left bound reference + else if (alignment.RefID == m_region.LeftRefID) { + + // if alignment starts at or after left bound position + if (alignment.Position >= m_region.LeftPosition) { + + if (m_region.isRightBoundSpecified() && // right bound is specified AND + m_region.LeftRefID == + m_region.RightRefID && // left & right bounds on same reference AND + alignment.Position >= + m_region.RightPosition) // alignment starts on or after right bound position + return AfterRegion; + + // otherwise, alignment overlaps region + else + return OverlapsRegion; + } + + // alignment starts before left bound position + else { + + // if alignment overlaps left bound position + if (alignment.GetEndPosition() > m_region.LeftPosition) + return OverlapsRegion; + else + return BeforeRegion; + } + } + + // otherwise alignment is on a reference 
after left bound reference + else { + + // if region has a right bound + if (m_region.isRightBoundSpecified()) { + + // alignment is on any reference between boundaries + if (alignment.RefID < m_region.RightRefID) return OverlapsRegion; + + // alignment is on any reference after right boundary + else if (alignment.RefID > m_region.RightRefID) + return AfterRegion; + + // alignment is on right bound reference + else { + + // if alignment starts before right bound position + if (alignment.Position < m_region.RightPosition) + return OverlapsRegion; + else + return AfterRegion; + } + } + + // otherwise, alignment starts after left bound and there is no right bound given + else + return OverlapsRegion; + } +} + +void BamRandomAccessController::Close() +{ + ClearIndex(); + ClearRegion(); +} + +void BamRandomAccessController::ClearIndex() +{ + if (m_index) { + delete m_index; + m_index = 0; + } +} + +void BamRandomAccessController::ClearRegion() +{ + m_region.clear(); + m_hasAlignmentsInRegion = true; +} + +bool BamRandomAccessController::CreateIndex(BamReaderPrivate* reader, + const BamIndex::IndexType& type) +{ + // skip if reader is invalid + assert(reader); + if (!reader->IsOpen()) { + SetErrorString("BamRandomAccessController::CreateIndex", + "cannot create index for unopened reader"); + return false; + } + + // create new index of requested type + BamIndex* newIndex = BamIndexFactory::CreateIndexOfType(type, reader); + if (newIndex == 0) { + std::stringstream s; + s << "could not create index of type: " << type; + SetErrorString("BamRandomAccessController::CreateIndex", s.str()); + return false; + } + + // attempt to build index from current BamReader file + if (!newIndex->Create()) { + const std::string indexError = newIndex->GetErrorString(); + const std::string message = "could not create index: \n\t" + indexError; + SetErrorString("BamRandomAccessController::CreateIndex", message); + return false; + } + + // save new index & return success + SetIndex(newIndex); 
+ return true; +} + +std::string BamRandomAccessController::GetErrorString() const +{ + return m_errorString; +} + +bool BamRandomAccessController::HasIndex() const +{ + return (m_index != 0); +} + +bool BamRandomAccessController::HasRegion() const +{ + return (!m_region.isNull()); +} + +bool BamRandomAccessController::IndexHasAlignmentsForReference(const int& refId) +{ + return m_index->HasAlignments(refId); +} + +bool BamRandomAccessController::LocateIndex(BamReaderPrivate* reader, + const BamIndex::IndexType& preferredType) +{ + // look up index filename, deferring to preferredType if possible + assert(reader); + const std::string& indexFilename = + BamIndexFactory::FindIndexFilename(reader->Filename(), preferredType); + + // if no index file found (of any type) + if (indexFilename.empty()) { + const std::string message = + std::string("could not find index file for:") + reader->Filename(); + SetErrorString("BamRandomAccessController::LocateIndex", message); + return false; + } + + // otherwise open & use index file that was found + return OpenIndex(indexFilename, reader); +} + +bool BamRandomAccessController::OpenIndex(const std::string& indexFilename, + BamReaderPrivate* reader) +{ + + // attempt create new index of type based on filename + BamIndex* index = BamIndexFactory::CreateIndexFromFilename(indexFilename, reader); + if (index == 0) { + const std::string message = std::string("could not open index file: ") + indexFilename; + SetErrorString("BamRandomAccessController::OpenIndex", message); + return false; + } + + // attempt to load data from index file + if (!index->Load(indexFilename)) { + const std::string indexError = index->GetErrorString(); + const std::string message = std::string("could not load index data from file: ") + + indexFilename + "\n\t" + indexError; + SetErrorString("BamRandomAccessController::OpenIndex", message); + return false; + } + + // save new index & return success + SetIndex(index); + return true; +} + +bool 
BamRandomAccessController::RegionHasAlignments() const +{ + return m_hasAlignmentsInRegion; +} + +void BamRandomAccessController::SetErrorString(const std::string& where, const std::string& what) +{ + m_errorString = where + ": " + what; +} + +void BamRandomAccessController::SetIndex(BamIndex* index) +{ + if (m_index) ClearIndex(); + m_index = index; +} + +bool BamRandomAccessController::SetRegion(const BamRegion& region, const int& referenceCount) +{ + + // store region + m_region = region; + + // cannot jump when no index is available + if (!HasIndex()) { + SetErrorString("BamRandomAccessController", "cannot jump if no index data available"); + return false; + } + + // adjust region as necessary to reflect where data actually begins + AdjustRegion(referenceCount); + + // if no data present, return true + // * Not an error, but future attempts to access alignments in this region will not return data + // Returning true is useful in a BamMultiReader setting where some BAM files may + // lack alignments in regions where other files still have data available. + if (!m_hasAlignmentsInRegion) return true; + + // return success/failure of jump to specified region, + // + // * Index::Jump() is allowed to modify the m_hasAlignmentsInRegion flag + // This covers 'corner case' where a region is requested that lies beyond the last + // alignment on a reference. If this occurs, any subsequent calls to GetNextAlignment[Core] + // will not return data. BamMultiReader will still be able to successfully pull alignments + // from a region from other files even if this one has no data. 
+ if (!m_index->Jump(m_region, &m_hasAlignmentsInRegion)) { + const std::string indexError = m_index->GetErrorString(); + const std::string message = std::string("could not set region\n\t") + indexError; + SetErrorString("BamRandomAccessController::OpenIndex", message); + return false; + } else + return true; +} diff --git a/src/api/internal/bam/BamRandomAccessController_p.h b/src/api/internal/bam/BamRandomAccessController_p.h new file mode 100644 index 0000000..e569581 --- /dev/null +++ b/src/api/internal/bam/BamRandomAccessController_p.h @@ -0,0 +1,96 @@ +// *************************************************************************** +// BamRandomAccessController_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011(DB) +// --------------------------------------------------------------------------- +// Manages random access operations in a BAM file +// *************************************************************************** + +#ifndef BAMRACONTROLLER_P_H +#define BAMRACONTROLLER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include "api/BamAux.h" +#include "api/BamIndex.h" + +namespace BamTools { + +class BamAlignment; + +namespace Internal { + +class BamReaderPrivate; + +class BamRandomAccessController +{ + + // enums +public: + enum RegionState + { + BeforeRegion = 0, + OverlapsRegion, + AfterRegion + }; + + // ctor & dtor +public: + BamRandomAccessController(); + ~BamRandomAccessController(); + + // BamRandomAccessController interface +public: + // index methods + void ClearIndex(); + bool CreateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& type); + bool HasIndex() const; + bool IndexHasAlignmentsForReference(const int& refId); + bool LocateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& preferredType); + bool OpenIndex(const std::string& indexFilename, BamReaderPrivate* reader); + void SetIndex(BamIndex* index); + + // region methods + void ClearRegion(); + bool HasRegion() const; + RegionState AlignmentState(const BamAlignment& alignment) const; + bool RegionHasAlignments() const; + bool SetRegion(const BamRegion& region, const int& referenceCount); + + // general methods + void Close(); + std::string GetErrorString() const; + + // internal methods +private: + // adjusts requested region if necessary (depending on where data actually begins) + void AdjustRegion(const int& referenceCount); + // error-string handling + void SetErrorString(const std::string& where, const std::string& what); + + // data members +private: + // index data + BamIndex* m_index; // owns the index, not a copy - responsible for deleting + + // region data + BamRegion m_region; + bool m_hasAlignmentsInRegion; + + // general data + std::string m_errorString; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMRACONTROLLER_P_H diff --git a/src/api/internal/bam/BamReader_p.cpp b/src/api/internal/bam/BamReader_p.cpp new file mode 100644 index 0000000..76faa63 --- /dev/null +++ b/src/api/internal/bam/BamReader_p.cpp @@ -0,0 +1,591 @@ +// 
*************************************************************************** +// BamReader_p.cpp (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 18 November 2012 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading BAM files +// *************************************************************************** + +#include "api/internal/bam/BamReader_p.h" +#include "api/BamConstants.h" +#include "api/BamReader.h" +#include "api/IBamIODevice.h" +#include "api/internal/bam/BamHeader_p.h" +#include "api/internal/bam/BamRandomAccessController_p.h" +#include "api/internal/index/BamStandardIndex_p.h" +#include "api/internal/index/BamToolsIndex_p.h" +#include "api/internal/io/BamDeviceFactory_p.h" +#include "api/internal/utils/BamException_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <algorithm> +#include <cassert> +#include <iostream> +#include <iterator> +#include <vector> + +// constructor +BamReaderPrivate::BamReaderPrivate(BamReader* parent) + : m_alignmentsBeginOffset(0) + , m_parent(parent) +{ + m_isBigEndian = BamTools::SystemIsBigEndian(); +} + +// destructor +BamReaderPrivate::~BamReaderPrivate() +{ + Close(); +} + +// closes the BAM file +bool BamReaderPrivate::Close() +{ + + // clear BAM metadata + m_references.clear(); + m_header.Clear(); + + // clear filename + m_filename.clear(); + + // close random access controller + m_randomAccessController.Close(); + + // if stream is open, attempt close + if (IsOpen()) { + try { + m_stream.Close(); + } catch (BamException& e) { + const std::string streamError = e.what(); + const std::string message = + std::string("encountered error closing BAM file: \n\t") + streamError; + SetErrorString("BamReader::Close", message); + return false; + } + } + + // return success + return true; +} + +// 
creates an index file of requested type on current BAM file +bool BamReaderPrivate::CreateIndex(const BamIndex::IndexType& type) +{ + + // skip if BAM file not open + if (!IsOpen()) { + SetErrorString("BamReader::CreateIndex", "cannot create index on unopened BAM file"); + return false; + } + + // attempt to create index + if (m_randomAccessController.CreateIndex(this, type)) + return true; + else { + const std::string bracError = m_randomAccessController.GetErrorString(); + const std::string message = std::string("could not create index: \n\t") + bracError; + SetErrorString("BamReader::CreateIndex", message); + return false; + } +} + +// return path & filename of current BAM file +const std::string BamReaderPrivate::Filename() const +{ + return m_filename; +} + +const SamHeader& BamReaderPrivate::GetConstSamHeader() const +{ + return m_header.ToConstSamHeader(); +} + +std::string BamReaderPrivate::GetErrorString() const +{ + return m_errorString; +} + +// return header data as string +std::string BamReaderPrivate::GetHeaderText() const +{ + return m_header.ToString(); +} + +// return header data as SamHeader object +SamHeader BamReaderPrivate::GetSamHeader() const +{ + return m_header.ToSamHeader(); +} + +// get next alignment (with character data fully parsed) +bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) +{ + + // if valid alignment found + if (GetNextAlignmentCore(alignment)) { + + // store alignment's "source" filename + alignment.Filename = m_filename; + + // return success/failure of parsing char data + if (alignment.BuildCharData()) + return true; + else { + const std::string alError = alignment.GetErrorString(); + const std::string message = + std::string("could not populate alignment data: \n\t") + alError; + SetErrorString("BamReader::GetNextAlignment", message); + return false; + } + } + + // no valid alignment found + return false; +} + +// retrieves next available alignment core data (returns success/fail) +// ** DOES NOT populate 
any character data fields (read name, bases, qualities, tag data, filename) +// these can be accessed, if necessary, from the supportData +// useful for operations requiring ONLY positional or other alignment-related information +bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& alignment) +{ + + // skip if stream not opened + if (!m_stream.IsOpen()) return false; + + try { + + // skip if region is set but has no alignments + if (m_randomAccessController.HasRegion() && + !m_randomAccessController.RegionHasAlignments()) { + return false; + } + + // if can't read next alignment + if (!LoadNextAlignment(alignment)) return false; + + // check alignment's region-overlap state + BamRandomAccessController::RegionState state = + m_randomAccessController.AlignmentState(alignment); + + // if alignment starts after region, no need to keep reading + if (state == BamRandomAccessController::AfterRegion) return false; + + // read until overlap is found + while (state != BamRandomAccessController::OverlapsRegion) { + + // if can't read next alignment + if (!LoadNextAlignment(alignment)) return false; + + // check alignment's region-overlap state + state = m_randomAccessController.AlignmentState(alignment); + + // if alignment starts after region, no need to keep reading + if (state == BamRandomAccessController::AfterRegion) return false; + } + + // if we get here, we found the next 'valid' alignment + // (e.g. 
overlaps current region if one was set, simply the next alignment if not) + alignment.SupportData.HasCoreOnly = true; + return true; + + } catch (BamException& e) { + const std::string streamError = e.what(); + const std::string message = + std::string("encountered error reading BAM alignment: \n\t") + streamError; + SetErrorString("BamReader::GetNextAlignmentCore", message); + return false; + } +} + +int BamReaderPrivate::GetReferenceCount() const +{ + return m_references.size(); +} + +const RefVector& BamReaderPrivate::GetReferenceData() const +{ + return m_references; +} + +// returns RefID for given RefName (returns References.size() if not found) +int BamReaderPrivate::GetReferenceID(const std::string& refName) const +{ + + // retrieve names from reference data + std::vector<std::string> refNames; + RefVector::const_iterator refIter = m_references.begin(); + RefVector::const_iterator refEnd = m_references.end(); + for (; refIter != refEnd; ++refIter) + refNames.push_back((*refIter).RefName); + + // return 'index-of' refName (or -1 if not found) + int index = distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName)); + if (index == (int)m_references.size()) + return -1; + else + return index; +} + +bool BamReaderPrivate::HasIndex() const +{ + return m_randomAccessController.HasIndex(); +} + +bool BamReaderPrivate::IsOpen() const +{ + return m_stream.IsOpen(); +} + +// load BAM header data +void BamReaderPrivate::LoadHeaderData() +{ + m_header.Load(&m_stream); +} + +static inline int bam_aux_type2size(int x) +{ + if (x == 'C' || x == 'c' || x == 'A') + return 1; + else if (x == 'S' || x == 's') + return 2; + else if (x == 'I' || x == 'i' || x == 'f') + return 4; + else + return 0; +} + +static unsigned char* bam_aux_get(int aux_data_len, const unsigned char* aux_start, const char* tag) +{ + const unsigned char* p = aux_start; + while (p < aux_start + aux_data_len) { + if (p[0] == tag[0] && p[1] == tag[1]) return (unsigned char*)(p + 2); + p += 
2; // skip tag + int type = *p++; // read type + if (type == 'B') { + int size = bam_aux_type2size(*p++); // read array type + unsigned len = + (unsigned)p[0] | (unsigned)p[1] << 8 | (unsigned)p[2] << 16 | (unsigned)p[3] << 24; + p += 4; // skip the size field + p += len * size; // skip array + } else if (type == 'Z' || type == 'H') { + while (*p++ != 0) { + } // skip NULL terminated string + } else { + p += bam_aux_type2size(type); // skip value + } + } + return NULL; +} + +static inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls) +{ + int l, s = min_shift, t = ((1 << ((n_lvls << 1) + n_lvls)) - 1) / 7; + for (--end, l = n_lvls; l > 0; --l, s += 3, t -= 1 << ((l << 1) + l)) + if (beg >> s == end >> s) return t + (beg >> s); + return 0; +} + +bool BamReaderPrivate::Tag2Cigar(BamAlignment& a, RaiiBuffer& buf) +{ + if (a.RefID < 0 || a.Position < 0 || a.SupportData.NumCigarOperations == 0) return false; + + const unsigned char* data = (const unsigned char*)buf.Buffer; + const unsigned data_len = a.SupportData.BlockLength - Constants::BAM_CORE_SIZE; + const unsigned char* p = data + a.SupportData.QueryNameLength; // the original CIGAR + unsigned cigar1 = + (unsigned)p[0] | (unsigned)p[1] << 8 | (unsigned)p[2] << 16 | (unsigned)p[3] << 24; + if ((cigar1 & 0xf) != 4 || cigar1 >> 4 != a.SupportData.QuerySequenceLength) return false; + + const int seq_offset = a.SupportData.QueryNameLength + a.SupportData.NumCigarOperations * 4; + const int aux_offset = seq_offset + (a.SupportData.QuerySequenceLength + 1) / 2 + + a.SupportData.QuerySequenceLength; + unsigned char* CG = bam_aux_get(data_len - aux_offset, data + aux_offset, "CG"); + if (CG == NULL || CG[0] != 'B' || CG[1] != 'I') return false; + + const unsigned tag_cigar_len = + (unsigned)CG[2] | (unsigned)CG[3] << 8 | (unsigned)CG[4] << 16 | (unsigned)CG[5] << 24; + if (tag_cigar_len == 0) return false; + + // recalculate bin, as it may be incorrect if it was calculated by a tool unaware of the 
real CIGAR in tag + const unsigned tag_cigar_offset = CG - data + 6; + unsigned alignment_end = a.Position; + p = data + tag_cigar_offset; + for (unsigned i = 0; i < tag_cigar_len * 4; i += 4, p += 4) { + unsigned cigar1 = + (unsigned)p[0] | (unsigned)p[1] << 8 | (unsigned)p[2] << 16 | (unsigned)p[3] << 24; + int op = cigar1 & 0xf; + if (op == 0 || op == 2 || op == 3 || op == 7 || op == 8) alignment_end += cigar1 >> 4; + } + a.Bin = hts_reg2bin(a.Position, alignment_end, 14, 5); + + // populate new AllCharData + int fake_bytes = a.SupportData.NumCigarOperations * 4; + std::string new_data; + new_data.reserve(data_len - 8 - fake_bytes + 1); + new_data.append((char*)data, a.SupportData.QueryNameLength); // query name + new_data.append((char*)data + tag_cigar_offset, tag_cigar_len * 4); // real CIGAR + new_data.append((char*)data + seq_offset, + tag_cigar_offset - 8 - seq_offset); // seq, qual and tags before CG + const unsigned tag_cigar_end_offset = tag_cigar_offset + tag_cigar_len * 4; + if (tag_cigar_end_offset < data_len) // tags after CG, if there is any + new_data.append((char*)data + tag_cigar_end_offset, data_len - tag_cigar_end_offset); + + // update member variables + a.SupportData.NumCigarOperations = tag_cigar_len; + a.SupportData.BlockLength -= 8 + fake_bytes; + memcpy(buf.Buffer, new_data.c_str(), buf.NumBytes - 8 - fake_bytes); + return true; +} + +// populates BamAlignment with alignment data under file pointer, returns success/fail +bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) +{ + + // read in the 'block length' value, make sure it's not zero + char buffer[sizeof(uint32_t)]; + std::fill_n(buffer, sizeof(uint32_t), 0); + m_stream.Read(buffer, sizeof(uint32_t)); + alignment.SupportData.BlockLength = BamTools::UnpackUnsignedInt(buffer); + if (m_isBigEndian) BamTools::SwapEndian_32(alignment.SupportData.BlockLength); + if (alignment.SupportData.BlockLength == 0) return false; + + // read in core alignment data, make sure the right 
size of data was read + char x[Constants::BAM_CORE_SIZE]; + if (m_stream.Read(x, Constants::BAM_CORE_SIZE) != Constants::BAM_CORE_SIZE) return false; + + // swap core endian-ness if necessary + if (m_isBigEndian) { + for (unsigned int i = 0; i < Constants::BAM_CORE_SIZE; i += sizeof(uint32_t)) + BamTools::SwapEndian_32p(&x[i]); + } + + // set BamAlignment 'core' and 'support' data + alignment.RefID = BamTools::UnpackSignedInt(&x[0]); + alignment.Position = BamTools::UnpackSignedInt(&x[4]); + + unsigned int tempValue = BamTools::UnpackUnsignedInt(&x[8]); + alignment.Bin = tempValue >> 16; + alignment.MapQuality = tempValue >> 8 & 0xff; + alignment.SupportData.QueryNameLength = tempValue & 0xff; + + tempValue = BamTools::UnpackUnsignedInt(&x[12]); + alignment.AlignmentFlag = tempValue >> 16; + alignment.SupportData.NumCigarOperations = tempValue & 0xffff; + + alignment.SupportData.QuerySequenceLength = BamTools::UnpackUnsignedInt(&x[16]); + alignment.MateRefID = BamTools::UnpackSignedInt(&x[20]); + alignment.MatePosition = BamTools::UnpackSignedInt(&x[24]); + alignment.InsertSize = BamTools::UnpackSignedInt(&x[28]); + + // set BamAlignment length + alignment.Length = alignment.SupportData.QuerySequenceLength; + + // read in character data - make sure proper data size was read + bool readCharDataOK = false; + unsigned int dataLength = alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE; + RaiiBuffer allCharData(dataLength); + + if (m_stream.Read(allCharData.Buffer, dataLength) == dataLength) { + + int OldNumCigarOperations = alignment.SupportData.NumCigarOperations; + if (Tag2Cigar(alignment, allCharData)) dataLength -= 8 + OldNumCigarOperations * 4; + + // store 'allCharData' in supportData structure + alignment.SupportData.AllCharData.assign((const char*)allCharData.Buffer, dataLength); + + // set success flag + readCharDataOK = true; + + // save CIGAR ops + // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly, + // even 
when GetNextAlignmentCore() is called + const unsigned int cigarDataOffset = alignment.SupportData.QueryNameLength; + uint32_t* cigarData = (uint32_t*)(allCharData.Buffer + cigarDataOffset); + CigarOp op; + alignment.CigarData.clear(); + alignment.CigarData.reserve(alignment.SupportData.NumCigarOperations); + for (unsigned int i = 0; i < alignment.SupportData.NumCigarOperations; ++i) { + + // swap endian-ness if necessary + if (m_isBigEndian) BamTools::SwapEndian_32(cigarData[i]); + + // build CigarOp structure + op.Length = (cigarData[i] >> Constants::BAM_CIGAR_SHIFT); + op.Type = Constants::BAM_CIGAR_LOOKUP[(cigarData[i] & Constants::BAM_CIGAR_MASK)]; + + // save CigarOp + alignment.CigarData.push_back(op); + } + } + + // return success/failure + return readCharDataOK; +} + +// loads reference data from BAM file +bool BamReaderPrivate::LoadReferenceData() +{ + + // get number of reference sequences + char buffer[sizeof(uint32_t)]; + m_stream.Read(buffer, sizeof(uint32_t)); + uint32_t numberRefSeqs = BamTools::UnpackUnsignedInt(buffer); + if (m_isBigEndian) BamTools::SwapEndian_32(numberRefSeqs); + m_references.reserve((int)numberRefSeqs); + + // iterate over all references in header + for (unsigned int i = 0; i != numberRefSeqs; ++i) { + + // get length of reference name + m_stream.Read(buffer, sizeof(uint32_t)); + uint32_t refNameLength = BamTools::UnpackUnsignedInt(buffer); + if (m_isBigEndian) BamTools::SwapEndian_32(refNameLength); + RaiiBuffer refName(refNameLength); + + // get reference name and reference sequence length + m_stream.Read(refName.Buffer, refNameLength); + m_stream.Read(buffer, sizeof(int32_t)); + int32_t refLength = BamTools::UnpackSignedInt(buffer); + if (m_isBigEndian) BamTools::SwapEndian_32(refLength); + + // store data for reference + RefData aReference; + aReference.RefName = static_cast<std::string>((const char*)refName.Buffer); + aReference.RefLength = refLength; + m_references.push_back(aReference); + } + + // return success + return 
true; +} + +bool BamReaderPrivate::LocateIndex(const BamIndex::IndexType& preferredType) +{ + + if (m_randomAccessController.LocateIndex(this, preferredType)) + return true; + else { + const std::string bracError = m_randomAccessController.GetErrorString(); + const std::string message = std::string("could not locate index: \n\t") + bracError; + SetErrorString("BamReader::LocateIndex", message); + return false; + } +} + +// opens BAM file (and index) +bool BamReaderPrivate::Open(const std::string& filename) +{ + + try { + + // make sure we're starting with fresh state + Close(); + + // open BgzfStream + m_stream.Open(filename, IBamIODevice::ReadOnly); + + // load BAM metadata + LoadHeaderData(); + LoadReferenceData(); + + // store filename & offset of first alignment + m_filename = filename; + m_alignmentsBeginOffset = m_stream.Tell(); + + // return success + return true; + + } catch (BamException& e) { + const std::string error = e.what(); + const std::string message = + std::string("could not open file: ") + filename + "\n\t" + error; + SetErrorString("BamReader::Open", message); + return false; + } +} + +bool BamReaderPrivate::OpenIndex(const std::string& indexFilename) +{ + + if (m_randomAccessController.OpenIndex(indexFilename, this)) + return true; + else { + const std::string bracError = m_randomAccessController.GetErrorString(); + const std::string message = std::string("could not open index: \n\t") + bracError; + SetErrorString("BamReader::OpenIndex", message); + return false; + } +} + +// returns BAM file pointer to beginning of alignment data +bool BamReaderPrivate::Rewind() +{ + + // reset region + m_randomAccessController.ClearRegion(); + + // return status of seeking back to first alignment + if (Seek(m_alignmentsBeginOffset)) + return true; + else { + const std::string currentError = m_errorString; + const std::string message = std::string("could not rewind: \n\t") + currentError; + SetErrorString("BamReader::Rewind", message); + return false; + } +} 
+ +bool BamReaderPrivate::Seek(const int64_t& position) +{ + + // skip if BAM file not open + if (!IsOpen()) { + SetErrorString("BamReader::Seek", "cannot seek on unopened BAM file"); + return false; + } + + try { + m_stream.Seek(position); + return true; + } catch (BamException& e) { + const std::string streamError = e.what(); + const std::string message = std::string("could not seek in BAM file: \n\t") + streamError; + SetErrorString("BamReader::Seek", message); + return false; + } +} + +void BamReaderPrivate::SetErrorString(const std::string& where, const std::string& what) +{ + static const std::string SEPARATOR(": "); + m_errorString = where + SEPARATOR + what; +} + +void BamReaderPrivate::SetIndex(BamIndex* index) +{ + m_randomAccessController.SetIndex(index); +} + +// sets current region & attempts to jump to it +// returns success/failure +bool BamReaderPrivate::SetRegion(const BamRegion& region) +{ + + if (m_randomAccessController.SetRegion(region, m_references.size())) + return true; + else { + const std::string bracError = m_randomAccessController.GetErrorString(); + const std::string message = std::string("could not set region: \n\t") + bracError; + SetErrorString("BamReader::SetRegion", message); + return false; + } +} + +int64_t BamReaderPrivate::Tell() const +{ + return m_stream.Tell(); +} diff --git a/src/api/internal/bam/BamReader_p.h b/src/api/internal/bam/BamReader_p.h new file mode 100644 index 0000000..48dea89 --- /dev/null +++ b/src/api/internal/bam/BamReader_p.h @@ -0,0 +1,119 @@ +// *************************************************************************** +// BamReader_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 18 November 2012 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading BAM files +// 
*************************************************************************** + +#ifndef BAMREADER_P_H +#define BAMREADER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include <string> +#include "api/BamAlignment.h" +#include "api/BamIndex.h" +#include "api/BamReader.h" +#include "api/SamHeader.h" +#include "api/internal/bam/BamHeader_p.h" +#include "api/internal/bam/BamRandomAccessController_p.h" +#include "api/internal/io/BgzfStream_p.h" + +namespace BamTools { +namespace Internal { + +class BamReaderPrivate +{ + + // ctor & dtor +public: + BamReaderPrivate(BamReader* parent); + ~BamReaderPrivate(); + + // BamReader interface +public: + // file operations + bool Close(); + const std::string Filename() const; + bool IsOpen() const; + bool Open(const std::string& filename); + bool Rewind(); + bool SetRegion(const BamRegion& region); + + // access alignment data + bool GetNextAlignment(BamAlignment& alignment); + bool GetNextAlignmentCore(BamAlignment& alignment); + bool Tag2Cigar(BamAlignment& alignment, RaiiBuffer& buf); + + // access auxiliary data + std::string GetHeaderText() const; + const SamHeader& GetConstSamHeader() const; + SamHeader GetSamHeader() const; + int GetReferenceCount() const; + const RefVector& GetReferenceData() const; + int GetReferenceID(const std::string& refName) const; + + // index operations + bool CreateIndex(const BamIndex::IndexType& type); + bool HasIndex() const; + bool LocateIndex(const BamIndex::IndexType& preferredType); + bool OpenIndex(const std::string& indexFilename); + void SetIndex(BamIndex* index); + + // error handling + std::string GetErrorString() const; + void SetErrorString(const std::string& where, const std::string& what); + + // internal methods, but available as a BamReaderPrivate 
'interface' + // + // these methods should only be used by BamTools::Internal classes + // (currently only used by the BamIndex subclasses) +public: + // retrieves header text from BAM file + void LoadHeaderData(); + // retrieves BAM alignment under file pointer + // (does no overlap checking or character data parsing) + bool LoadNextAlignment(BamAlignment& alignment); + // builds reference data structure from BAM file + bool LoadReferenceData(); + // seek reader to file position + bool Seek(const int64_t& position); + // return reader's file position + int64_t Tell() const; + + // data members +public: + // general BAM file data + int64_t m_alignmentsBeginOffset; + std::string m_filename; + RefVector m_references; + + // system data + bool m_isBigEndian; + + // parent BamReader + BamReader* m_parent; + + // BamReaderPrivate components + BamHeader m_header; + BamRandomAccessController m_randomAccessController; + BgzfStream m_stream; + + // error handling + std::string m_errorString; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMREADER_P_H diff --git a/src/api/internal/bam/BamWriter_p.cpp b/src/api/internal/bam/BamWriter_p.cpp new file mode 100644 index 0000000..9509777 --- /dev/null +++ b/src/api/internal/bam/BamWriter_p.cpp @@ -0,0 +1,599 @@ +// *************************************************************************** +// BamWriter_p.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 18 November 2012 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#include "api/internal/bam/BamWriter_p.h" +#include "api/BamAlignment.h" +#include "api/BamConstants.h" +#include "api/IBamIODevice.h" +#include "api/internal/utils/BamException_p.h" +using 
namespace BamTools; +using namespace BamTools::Internal; + +#include <cstddef> +#include <cstdlib> +#include <cstring> + +// ctor +BamWriterPrivate::BamWriterPrivate() + : m_isBigEndian(BamTools::SystemIsBigEndian()) +{} + +// dtor +BamWriterPrivate::~BamWriterPrivate() +{ + Close(); +} + +// calculates minimum bin for a BAM alignment interval [begin, end) +uint32_t BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const +{ + --end; + if ((begin >> 14) == (end >> 14)) return 4681 + (begin >> 14); + if ((begin >> 17) == (end >> 17)) return 585 + (begin >> 17); + if ((begin >> 20) == (end >> 20)) return 73 + (begin >> 20); + if ((begin >> 23) == (end >> 23)) return 9 + (begin >> 23); + if ((begin >> 26) == (end >> 26)) return 1 + (begin >> 26); + return 0; +} + +// closes the alignment archive +void BamWriterPrivate::Close() +{ + + // skip if file not open + if (!IsOpen()) return; + + // close output stream + try { + m_stream.Close(); + } catch (BamException& e) { + m_errorString = e.what(); + } +} + +// creates a cigar string from the supplied alignment +void BamWriterPrivate::CreatePackedCigar(const std::vector<CigarOp>& cigarOperations, + std::string& packedCigar) +{ + + // initialize + const std::size_t numCigarOperations = cigarOperations.size(); + packedCigar.resize(numCigarOperations * Constants::BAM_SIZEOF_INT); + + // pack the cigar data into the string + unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); + + // iterate over cigar operations + std::vector<CigarOp>::const_iterator coIter = cigarOperations.begin(); + std::vector<CigarOp>::const_iterator coEnd = cigarOperations.end(); + for (; coIter != coEnd; ++coIter) { + + // store op in packedCigar + uint8_t cigarOp; + switch (coIter->Type) { + case (Constants::BAM_CIGAR_MATCH_CHAR): + cigarOp = Constants::BAM_CIGAR_MATCH; + break; + case (Constants::BAM_CIGAR_INS_CHAR): + cigarOp = Constants::BAM_CIGAR_INS; + break; + case (Constants::BAM_CIGAR_DEL_CHAR): + cigarOp = 
Constants::BAM_CIGAR_DEL; + break; + case (Constants::BAM_CIGAR_REFSKIP_CHAR): + cigarOp = Constants::BAM_CIGAR_REFSKIP; + break; + case (Constants::BAM_CIGAR_SOFTCLIP_CHAR): + cigarOp = Constants::BAM_CIGAR_SOFTCLIP; + break; + case (Constants::BAM_CIGAR_HARDCLIP_CHAR): + cigarOp = Constants::BAM_CIGAR_HARDCLIP; + break; + case (Constants::BAM_CIGAR_PAD_CHAR): + cigarOp = Constants::BAM_CIGAR_PAD; + break; + case (Constants::BAM_CIGAR_SEQMATCH_CHAR): + cigarOp = Constants::BAM_CIGAR_SEQMATCH; + break; + case (Constants::BAM_CIGAR_MISMATCH_CHAR): + cigarOp = Constants::BAM_CIGAR_MISMATCH; + break; + default: + const std::string message = + std::string("invalid CIGAR operation type") + coIter->Type; + throw BamException("BamWriter::CreatePackedCigar", message); + } + + *pPackedCigar = coIter->Length << Constants::BAM_CIGAR_SHIFT | cigarOp; + pPackedCigar++; + } +} + +// encodes the supplied query sequence into 4-bit notation +void BamWriterPrivate::EncodeQuerySequence(const std::string& query, std::string& encodedQuery) +{ + + // prepare the encoded query string + const std::size_t queryLength = query.size(); + const std::size_t encodedQueryLength = static_cast<std::size_t>((queryLength + 1) / 2); + encodedQuery.resize(encodedQueryLength); + char* pEncodedQuery = (char*)encodedQuery.data(); + const char* pQuery = (const char*)query.data(); + + // walk through original query sequence, encoding its bases + unsigned char nucleotideCode; + bool useHighWord = true; + while (*pQuery) { + switch (*pQuery) { + case (Constants::BAM_DNA_EQUAL): + nucleotideCode = Constants::BAM_BASECODE_EQUAL; + break; + case (Constants::BAM_DNA_A): + nucleotideCode = Constants::BAM_BASECODE_A; + break; + case (Constants::BAM_DNA_C): + nucleotideCode = Constants::BAM_BASECODE_C; + break; + case (Constants::BAM_DNA_M): + nucleotideCode = Constants::BAM_BASECODE_M; + break; + case (Constants::BAM_DNA_G): + nucleotideCode = Constants::BAM_BASECODE_G; + break; + case (Constants::BAM_DNA_R): + 
nucleotideCode = Constants::BAM_BASECODE_R; + break; + case (Constants::BAM_DNA_S): + nucleotideCode = Constants::BAM_BASECODE_S; + break; + case (Constants::BAM_DNA_V): + nucleotideCode = Constants::BAM_BASECODE_V; + break; + case (Constants::BAM_DNA_T): + nucleotideCode = Constants::BAM_BASECODE_T; + break; + case (Constants::BAM_DNA_W): + nucleotideCode = Constants::BAM_BASECODE_W; + break; + case (Constants::BAM_DNA_Y): + nucleotideCode = Constants::BAM_BASECODE_Y; + break; + case (Constants::BAM_DNA_H): + nucleotideCode = Constants::BAM_BASECODE_H; + break; + case (Constants::BAM_DNA_K): + nucleotideCode = Constants::BAM_BASECODE_K; + break; + case (Constants::BAM_DNA_D): + nucleotideCode = Constants::BAM_BASECODE_D; + break; + case (Constants::BAM_DNA_B): + nucleotideCode = Constants::BAM_BASECODE_B; + break; + case (Constants::BAM_DNA_N): + nucleotideCode = Constants::BAM_BASECODE_N; + break; + default: + const std::string message = std::string("invalid base: ") + *pQuery; + throw BamException("BamWriter::EncodeQuerySequence", message); + } + + // pack the nucleotide code + if (useHighWord) { + *pEncodedQuery = nucleotideCode << 4; + useHighWord = false; + } else { + *pEncodedQuery |= nucleotideCode; + ++pEncodedQuery; + useHighWord = true; + } + + // increment the query position + ++pQuery; + } +} + +// returns a description of the last error that occurred +std::string BamWriterPrivate::GetErrorString() const +{ + return m_errorString; +} + +// returns whether BAM file is open for writing or not +bool BamWriterPrivate::IsOpen() const +{ + return m_stream.IsOpen(); +} + +// opens the alignment archive +bool BamWriterPrivate::Open(const std::string& filename, const std::string& samHeaderText, + const RefVector& referenceSequences) +{ + try { + + // open the BGZF file for writing + m_stream.Open(filename, IBamIODevice::WriteOnly); + + // write BAM file 'metadata' components + WriteMagicNumber(); + WriteSamHeaderText(samHeaderText); + 
WriteReferences(referenceSequences); + + // return success + return true; + + } catch (BamException& e) { + m_errorString = e.what(); + return false; + } +} + +// saves the alignment to the alignment archive +bool BamWriterPrivate::SaveAlignment(const BamAlignment& al) +{ + + try { + + // if BamAlignment contains only the core data and a raw char data buffer + // (as a result of BamReader::GetNextAlignmentCore()) + if (al.SupportData.HasCoreOnly) WriteCoreAlignment(al); + + // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc + // (resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code) + else + WriteAlignment(al); + + // if we get here, everything OK + return true; + + } catch (BamException& e) { + m_errorString = e.what(); + return false; + } +} + +void BamWriterPrivate::SetWriteCompressed(bool ok) +{ + // modifying compression is not allowed if BAM file is open + if (!IsOpen()) m_stream.SetWriteCompressed(ok); +} + +void BamWriterPrivate::WriteAlignment(const BamAlignment& al) +{ + + // calculate char lengths + const unsigned int nameLength = al.Name.size() + 1; + const unsigned int numCigarOperations = al.CigarData.size(); + const unsigned int queryLength = ((al.QueryBases == "*") ? 
0 : al.QueryBases.size()); + const unsigned int tagDataLength = al.TagData.size(); + + // no way to tell if alignment's bin is already defined (there is no default, invalid value) + // so we'll go ahead calculate its bin ID before storing + const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition()); + + // create our packed cigar string + std::string packedCigar; + CreatePackedCigar(al.CigarData, packedCigar); + const unsigned int packedCigarLength = packedCigar.size(); + + // encode the query + unsigned int encodedQueryLength = 0; + std::string encodedQuery; + if (queryLength > 0) { + EncodeQuerySequence(al.QueryBases, encodedQuery); + encodedQueryLength = encodedQuery.size(); + } + + // write the block size + const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + + queryLength + // here referring to quality length + tagDataLength; + unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize; + if (numCigarOperations >= 65536) blockSize += 16; + if (m_isBigEndian) BamTools::SwapEndian_32(blockSize); + m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); + + // assign the BAM core data + uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; + buffer[3] = (al.AlignmentFlag << 16) | (numCigarOperations < 65536 ? 
numCigarOperations : 2); + buffer[4] = queryLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if (m_isBigEndian) { + for (int i = 0; i < 8; ++i) + BamTools::SwapEndian_32(buffer[i]); + } + + // write the BAM core + m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); + + // write the query name + m_stream.Write(al.Name.c_str(), nameLength); + + // write the packed cigar + if (numCigarOperations < 65536) { + if (m_isBigEndian) { + char* cigarData = new char[packedCigarLength](); + memcpy(cigarData, packedCigar.data(), packedCigarLength); + if (m_isBigEndian) { + for (size_t i = 0; i < packedCigarLength; ++i) + BamTools::SwapEndian_32p(&cigarData[i]); + } + m_stream.Write(cigarData, packedCigarLength); + delete[] cigarData; // TODO: cleanup on Write exception thrown? + } else + m_stream.Write(packedCigar.data(), packedCigarLength); + } else { + unsigned int cigar[2]; + cigar[0] = queryLength << 4 | 4; + cigar[1] = (al.GetEndPosition() - al.Position) << 4 | 3; + if (m_isBigEndian) { + BamTools::SwapEndian_32(cigar[0]); + BamTools::SwapEndian_32(cigar[1]); + } + m_stream.Write((char*)cigar, 8); + } + + if (queryLength > 0) { + + // write the encoded query sequence + m_stream.Write(encodedQuery.data(), encodedQueryLength); + + // write the base qualities + char* pBaseQualities = new char[queryLength](); + if (al.Qualities.empty() || (al.Qualities.size() == 1 && al.Qualities[0] == '*') || + al.Qualities[0] == (char)0xFF) + memset(pBaseQualities, 0xFF, queryLength); // if missing or '*', fill with invalid qual + else { + for (std::size_t i = 0; i < queryLength; ++i) + pBaseQualities[i] = + al.Qualities.at(i) - 33; // FASTQ ASCII -> phred score conversion + } + m_stream.Write(pBaseQualities, queryLength); + delete[] pBaseQualities; + } + + // write the tag data + if (m_isBigEndian) { + + char* tagData = new char[tagDataLength](); + memcpy(tagData, al.TagData.data(), 
tagDataLength); + + std::size_t i = 0; + while (i < tagDataLength) { + + i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.) + const char type = tagData[i]; // get tag type at position i + ++i; + + switch (type) { + + case (Constants::BAM_TAG_TYPE_ASCII): + case (Constants::BAM_TAG_TYPE_INT8): + case (Constants::BAM_TAG_TYPE_UINT8): + ++i; + break; + + case (Constants::BAM_TAG_TYPE_INT16): + case (Constants::BAM_TAG_TYPE_UINT16): + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + + case (Constants::BAM_TAG_TYPE_FLOAT): + case (Constants::BAM_TAG_TYPE_INT32): + case (Constants::BAM_TAG_TYPE_UINT32): + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + + case (Constants::BAM_TAG_TYPE_HEX): + case (Constants::BAM_TAG_TYPE_STRING): + // no endian swapping necessary for hex-string/string data + while (tagData[i]) + ++i; + // increment one more for null terminator + ++i; + break; + + case (Constants::BAM_TAG_TYPE_ARRAY): + + { + // read array type + const char arrayType = tagData[i]; + ++i; + + // swap endian-ness of number of elements in place, then retrieve for loop + BamTools::SwapEndian_32p(&tagData[i]); + int32_t numElements; + memcpy(&numElements, &tagData[i], sizeof(uint32_t)); + i += sizeof(uint32_t); + + // swap endian-ness of array elements + for (int j = 0; j < numElements; ++j) { + switch (arrayType) { + case (Constants::BAM_TAG_TYPE_INT8): + case (Constants::BAM_TAG_TYPE_UINT8): + // no endian-swapping necessary + ++i; + break; + case (Constants::BAM_TAG_TYPE_INT16): + case (Constants::BAM_TAG_TYPE_UINT16): + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + case (Constants::BAM_TAG_TYPE_FLOAT): + case (Constants::BAM_TAG_TYPE_INT32): + case (Constants::BAM_TAG_TYPE_UINT32): + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + default: + delete[] tagData; + const std::string message = + std::string("invalid binary array type: ") + 
arrayType; + throw BamException("BamWriter::SaveAlignment", message); + } + } + + break; + } + + default: + delete[] tagData; + const std::string message = std::string("invalid tag type: ") + type; + throw BamException("BamWriter::SaveAlignment", message); + } + } + + m_stream.Write(tagData, tagDataLength); + delete[] tagData; // TODO: cleanup on Write exception thrown? + } else + m_stream.Write(al.TagData.data(), tagDataLength); + + if (numCigarOperations >= 65536) { + m_stream.Write("CGBI", 4); + if (m_isBigEndian) { + unsigned int cigar_len_buf = numCigarOperations; + BamTools::SwapEndian_32(cigar_len_buf); + m_stream.Write((char*)&cigar_len_buf, 4); + + char* cigarData = new char[packedCigarLength](); + memcpy(cigarData, packedCigar.data(), packedCigarLength); + if (m_isBigEndian) { + for (size_t i = 0; i < packedCigarLength; + ++i) // FIXME: similarly, this should be "i += 4", not "++i" + BamTools::SwapEndian_32p(&cigarData[i]); + } + m_stream.Write(cigarData, packedCigarLength); + delete[] cigarData; // TODO: cleanup on Write exception thrown? 
+ } else { + m_stream.Write((char*)&numCigarOperations, 4); + m_stream.Write(packedCigar.data(), packedCigarLength); + } + } +} + +void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al) +{ + + // write the block size + unsigned int blockSize = al.SupportData.BlockLength; + if (al.SupportData.NumCigarOperations >= 65536) blockSize += 16; + if (m_isBigEndian) BamTools::SwapEndian_32(blockSize); + m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); + + // re-calculate bin (in case BamAlignment's position has been previously modified) + const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition()); + + // assign the BAM core data + uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; + buffer[3] = (al.AlignmentFlag << 16) | + (al.SupportData.NumCigarOperations < 65536 ? al.SupportData.NumCigarOperations : 2); + buffer[4] = al.SupportData.QuerySequenceLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if (m_isBigEndian) { + for (int i = 0; i < 8; ++i) + BamTools::SwapEndian_32(buffer[i]); + } + + // write the BAM core + m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); + + // write the raw char data + if (al.SupportData.NumCigarOperations < 65536) { + m_stream.Write((char*)al.SupportData.AllCharData.data(), + al.SupportData.BlockLength - Constants::BAM_CORE_SIZE); + } else { + const char* data = al.SupportData.AllCharData.c_str(); + const unsigned data_len = al.SupportData.BlockLength - Constants::BAM_CORE_SIZE; + const unsigned cigar_offset = al.SupportData.QueryNameLength; + const unsigned seq_offset = cigar_offset + al.SupportData.NumCigarOperations * 4; + unsigned fake_cigar[2]; + fake_cigar[0] = al.SupportData.QuerySequenceLength << 4 | 4; + fake_cigar[1] = (al.GetEndPosition() - 
al.Position) << 4 | 3; + m_stream.Write(data, al.SupportData.QueryNameLength); + if (m_isBigEndian) { + BamTools::SwapEndian_32(fake_cigar[0]); + BamTools::SwapEndian_32(fake_cigar[1]); + } + m_stream.Write((char*)&fake_cigar, 8); + m_stream.Write(data + seq_offset, data_len - seq_offset); + m_stream.Write("CGBI", 4); + if (m_isBigEndian) { + unsigned cigar_len_buf = al.SupportData.NumCigarOperations; + BamTools::SwapEndian_32(cigar_len_buf); + m_stream.Write((char*)&cigar_len_buf, 4); + } else { + m_stream.Write((char*)&al.SupportData.NumCigarOperations, 4); + } + m_stream.Write(data + cigar_offset, al.SupportData.NumCigarOperations * 4); + } +} + +void BamWriterPrivate::WriteMagicNumber() +{ + // write BAM file 'magic number' + m_stream.Write(Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH); +} + +void BamWriterPrivate::WriteReferences(const BamTools::RefVector& referenceSequences) +{ + + // write the number of reference sequences + uint32_t numReferenceSequences = referenceSequences.size(); + if (m_isBigEndian) BamTools::SwapEndian_32(numReferenceSequences); + m_stream.Write((char*)&numReferenceSequences, Constants::BAM_SIZEOF_INT); + + // foreach reference sequence + RefVector::const_iterator rsIter = referenceSequences.begin(); + RefVector::const_iterator rsEnd = referenceSequences.end(); + for (; rsIter != rsEnd; ++rsIter) { + + // write the reference sequence name length (+1 for terminator) + const uint32_t actualNameLen = rsIter->RefName.size() + 1; + uint32_t maybeSwappedNameLen = actualNameLen; + if (m_isBigEndian) BamTools::SwapEndian_32(maybeSwappedNameLen); + m_stream.Write((char*)&maybeSwappedNameLen, Constants::BAM_SIZEOF_INT); + + // write the reference sequence name + m_stream.Write(rsIter->RefName.c_str(), actualNameLen); + + // write the reference sequence length + int32_t referenceLength = rsIter->RefLength; + if (m_isBigEndian) BamTools::SwapEndian_32(referenceLength); + m_stream.Write((char*)&referenceLength, 
Constants::BAM_SIZEOF_INT); + } +} + +void BamWriterPrivate::WriteSamHeaderText(const std::string& samHeaderText) +{ + + // write the SAM header text length + const uint32_t actualHeaderLen = samHeaderText.size(); + uint32_t maybeSwappedHeaderLen = samHeaderText.size(); + if (m_isBigEndian) BamTools::SwapEndian_32(maybeSwappedHeaderLen); + m_stream.Write((char*)&maybeSwappedHeaderLen, Constants::BAM_SIZEOF_INT); + + // write the SAM header text + if (actualHeaderLen > 0) m_stream.Write(samHeaderText.data(), actualHeaderLen); +} diff --git a/src/api/internal/bam/BamWriter_p.h b/src/api/internal/bam/BamWriter_p.h new file mode 100644 index 0000000..550d7fb --- /dev/null +++ b/src/api/internal/bam/BamWriter_p.h @@ -0,0 +1,74 @@ +// *************************************************************************** +// BamWriter_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 25 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#ifndef BAMWRITER_P_H +#define BAMWRITER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. 
+ +#include <string> +#include <vector> +#include "api/BamAux.h" +#include "api/internal/io/BgzfStream_p.h" + +namespace BamTools { + +class BamAlignment; + +namespace Internal { + +class BamWriterPrivate +{ + + // ctor & dtor +public: + BamWriterPrivate(); + ~BamWriterPrivate(); + + // interface methods +public: + void Close(); + std::string GetErrorString() const; + bool IsOpen() const; + bool Open(const std::string& filename, const std::string& samHeaderText, + const BamTools::RefVector& referenceSequences); + bool SaveAlignment(const BamAlignment& al); + void SetWriteCompressed(bool ok); + + // 'internal' methods +public: + uint32_t CalculateMinimumBin(const int begin, int end) const; + void CreatePackedCigar(const std::vector<BamTools::CigarOp>& cigarOperations, + std::string& packedCigar); + void EncodeQuerySequence(const std::string& query, std::string& encodedQuery); + void WriteAlignment(const BamAlignment& al); + void WriteCoreAlignment(const BamAlignment& al); + void WriteMagicNumber(); + void WriteReferences(const BamTools::RefVector& referenceSequences); + void WriteSamHeaderText(const std::string& samHeaderText); + + // data members +private: + BgzfStream m_stream; + bool m_isBigEndian; + std::string m_errorString; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMWRITER_P_H diff --git a/src/api/internal/bam/CMakeLists.txt b/src/api/internal/bam/CMakeLists.txt new file mode 100644 index 0000000..1bd2569 --- /dev/null +++ b/src/api/internal/bam/CMakeLists.txt @@ -0,0 +1,19 @@ +# ========================== +# BamTools CMakeLists.txt +# (c) 2011 Derek Barnett +# +# src/api/internal/bam +# ========================== + +set( InternalBamDir "${InternalDir}/bam" ) + +set( InternalBamSources + ${InternalBamDir}/BamHeader_p.cpp + ${InternalBamDir}/BamMultiReader_p.cpp + ${InternalBamDir}/BamRandomAccessController_p.cpp + ${InternalBamDir}/BamReader_p.cpp + ${InternalBamDir}/BamWriter_p.cpp + + PARENT_SCOPE # <-- leave this last + ) + diff 
--git a/src/api/internal/index/BamIndexFactory_p.cpp b/src/api/internal/index/BamIndexFactory_p.cpp new file mode 100644 index 0000000..a719243 --- /dev/null +++ b/src/api/internal/index/BamIndexFactory_p.cpp @@ -0,0 +1,111 @@ +// *************************************************************************** +// BamIndexFactory_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides interface for generating BamIndex implementations +// *************************************************************************** + +#include "api/internal/index/BamIndexFactory_p.h" +#include "api/internal/index/BamStandardIndex_p.h" +#include "api/internal/index/BamToolsIndex_p.h" + +#include <cstddef> + +using namespace BamTools; +using namespace BamTools::Internal; + +// generates index filename from BAM filename (depending on requested type) +// if type is unknown, returns empty string +const std::string BamIndexFactory::CreateIndexFilename(const std::string& bamFilename, + const BamIndex::IndexType& type) +{ + switch (type) { + case (BamIndex::STANDARD): + return (bamFilename + BamStandardIndex::Extension()); + case (BamIndex::BAMTOOLS): + return (bamFilename + BamToolsIndex::Extension()); + default: + return std::string(); + } +} + +// creates a new BamIndex object, depending on extension of @indexFilename +BamIndex* BamIndexFactory::CreateIndexFromFilename(const std::string& indexFilename, + BamReaderPrivate* reader) +{ + + // get file extension from index filename, including dot (".EXT") + // if can't get file extension, return null index + const std::string extension = FileExtension(indexFilename); + if (extension.empty()) return 0; + + // create index based on extension + if (extension == BamStandardIndex::Extension()) + return new 
BamStandardIndex(reader); + else if (extension == BamToolsIndex::Extension()) + return new BamToolsIndex(reader); + else + return 0; +} + +// creates a new BamIndex, object of requested @type +BamIndex* BamIndexFactory::CreateIndexOfType(const BamIndex::IndexType& type, + BamReaderPrivate* reader) +{ + switch (type) { + case (BamIndex::STANDARD): + return new BamStandardIndex(reader); + case (BamIndex::BAMTOOLS): + return new BamToolsIndex(reader); + default: + return 0; + } +} + +// retrieves file extension (including '.') +const std::string BamIndexFactory::FileExtension(const std::string& filename) +{ + + // if filename cannot contain valid path + extension, return empty string + if (filename.empty() || filename.length() <= 4) return std::string(); + + // look for last dot in filename + const std::size_t lastDotPosition = filename.find_last_of('.'); + + // if none found, return empty string + if (lastDotPosition == std::string::npos) return std::string(); + + // return substring from last dot position + return filename.substr(lastDotPosition); +} + +// returns name of existing index file that corresponds to @bamFilename +// will defer to @preferredType if possible, if not will attempt to load any supported type +// returns empty string if not found +const std::string BamIndexFactory::FindIndexFilename(const std::string& bamFilename, + const BamIndex::IndexType& preferredType) +{ + // skip if BAM filename provided is empty + if (bamFilename.empty()) return std::string(); + + // try to find index of preferred type first + // return index filename if found + std::string indexFilename = CreateIndexFilename(bamFilename, preferredType); + if (!indexFilename.empty()) return indexFilename; + + // couldn't find preferred type, try the other supported types + // return index filename if found + if (preferredType != BamIndex::STANDARD) { + indexFilename = CreateIndexFilename(bamFilename, BamIndex::STANDARD); + if (!indexFilename.empty()) return indexFilename; + } + if 
(preferredType != BamIndex::BAMTOOLS) { + indexFilename = CreateIndexFilename(bamFilename, BamIndex::BAMTOOLS); + if (!indexFilename.empty()) return indexFilename; + } + + // otherwise couldn't find any index matching this filename + return std::string(); +} diff --git a/src/api/internal/index/BamIndexFactory_p.h b/src/api/internal/index/BamIndexFactory_p.h new file mode 100644 index 0000000..fc51793 --- /dev/null +++ b/src/api/internal/index/BamIndexFactory_p.h @@ -0,0 +1,49 @@ +// *************************************************************************** +// BamIndexFactory_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides interface for generating BamIndex implementations +// *************************************************************************** + +#ifndef BAMINDEX_FACTORY_P_H +#define BAMINDEX_FACTORY_P_H + +#include <string> +#include "api/BamIndex.h" + +namespace BamTools { +namespace Internal { + +class BamIndexFactory +{ + + // static interface methods +public: + // creates a new BamIndex object, depending on extension of @indexFilename + static BamIndex* CreateIndexFromFilename(const std::string& indexFilename, + BamReaderPrivate* reader); + // creates a new BamIndex object, of requested @type + static BamIndex* CreateIndexOfType(const BamIndex::IndexType& type, BamReaderPrivate* reader); + // returns name of existing index file that corresponds to @bamFilename + // will defer to @preferredType if possible + // if @preferredType not found, will attempt to load any supported index type + // returns empty string if no index file (of any type) is found + static const std::string FindIndexFilename(const std::string& bamFilename, + const BamIndex::IndexType& preferredType); + + // internal methods +public: + // 
generates index filename from BAM filename (depending on requested type) + // if type is unknown, returns empty string + static const std::string CreateIndexFilename(const std::string& bamFilename, + const BamIndex::IndexType& type); + // retrieves file extension (including '.') + static const std::string FileExtension(const std::string& filename); +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMINDEX_FACTORY_P_H diff --git a/src/api/internal/index/BamStandardIndex_p.cpp b/src/api/internal/index/BamStandardIndex_p.cpp new file mode 100644 index 0000000..cc81711 --- /dev/null +++ b/src/api/internal/index/BamStandardIndex_p.cpp @@ -0,0 +1,1023 @@ +// *************************************************************************** +// BamStandardIndex.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 25 May 2012 (DB) +// --------------------------------------------------------------------------- +// Provides index operations for the standardized BAM index format (".bai") +// *************************************************************************** + +#include "api/internal/index/BamStandardIndex_p.h" +#include "api/BamAlignment.h" +#include "api/internal/bam/BamReader_p.h" +#include "api/internal/io/BamDeviceFactory_p.h" +#include "api/internal/utils/BamException_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <algorithm> +#include <cstddef> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <sstream> + +// ----------------------------------- +// static BamStandardIndex constants +// ----------------------------------- + +const int BamStandardIndex::MAX_BIN = 37450; // =(8^6-1)/7+1 +const int BamStandardIndex::BAM_LIDX_SHIFT = 14; +const std::string BamStandardIndex::BAI_EXTENSION = ".bai"; +const char* const BamStandardIndex::BAI_MAGIC = "BAI\1"; +const int 
BamStandardIndex::SIZEOF_ALIGNMENTCHUNK = sizeof(uint64_t) * 2; +const int BamStandardIndex::SIZEOF_BINCORE = sizeof(uint32_t) + sizeof(int32_t); +const int BamStandardIndex::SIZEOF_LINEAROFFSET = sizeof(uint64_t); + +// ---------------------------- +// RaiiWrapper implementation +// ---------------------------- + +BamStandardIndex::RaiiWrapper::RaiiWrapper() + : Device(0) + , Buffer(0) +{} + +BamStandardIndex::RaiiWrapper::~RaiiWrapper() +{ + + if (Device) { + Device->Close(); + delete Device; + Device = 0; + } + + if (Buffer) { + delete[] Buffer; + Buffer = 0; + } +} + +// --------------------------------- +// BamStandardIndex implementation +// --------------------------------- + +// ctor +BamStandardIndex::BamStandardIndex(Internal::BamReaderPrivate* reader) + : BamIndex(reader) + , m_bufferLength(0) +{ + m_isBigEndian = BamTools::SystemIsBigEndian(); +} + +// dtor +BamStandardIndex::~BamStandardIndex() +{ + CloseFile(); +} + +void BamStandardIndex::AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end) +{ + + // retrieve references from reader + const RefVector& references = m_reader->GetReferenceData(); + + // LeftPosition cannot be greater than or equal to reference length + if (region.LeftPosition >= references.at(region.LeftRefID).RefLength) + throw BamException("BamStandardIndex::AdjustRegion", "invalid region requested"); + + // set region 'begin' + begin = (unsigned int)region.LeftPosition; + + // if right bound specified AND left&right bounds are on same reference + // OK to use right bound position as region 'end' + if (region.isRightBoundSpecified() && (region.LeftRefID == region.RightRefID)) + end = (unsigned int)region.RightPosition; + + // otherwise, set region 'end' to last reference base + else + end = (unsigned int)references.at(region.LeftRefID).RefLength; +} + +// [begin, end) +void BamStandardIndex::CalculateCandidateBins(const uint32_t& begin, const uint32_t& end, + std::set<uint16_t>& candidateBins) +{ + // initialize list, 
bin '0' is always a valid bin + candidateBins.insert(0); + + // get rest of bins that contain this region + unsigned int k; + for (k = 1 + (begin >> 26); k <= 1 + (end >> 26); ++k) { + candidateBins.insert(k); + } + for (k = 9 + (begin >> 23); k <= 9 + (end >> 23); ++k) { + candidateBins.insert(k); + } + for (k = 73 + (begin >> 20); k <= 73 + (end >> 20); ++k) { + candidateBins.insert(k); + } + for (k = 585 + (begin >> 17); k <= 585 + (end >> 17); ++k) { + candidateBins.insert(k); + } + for (k = 4681 + (begin >> 14); k <= 4681 + (end >> 14); ++k) { + candidateBins.insert(k); + } +} + +void BamStandardIndex::CalculateCandidateOffsets(const BaiReferenceSummary& refSummary, + const uint64_t& minOffset, + std::set<uint16_t>& candidateBins, + std::vector<int64_t>& offsets) +{ + // seek to first bin + Seek(refSummary.FirstBinFilePosition, SEEK_SET); + + // iterate over reference bins + uint32_t binId; + int32_t numAlignmentChunks; + std::set<uint16_t>::iterator candidateBinIter; + for (int i = 0; i < refSummary.NumBins; ++i) { + + // read bin contents (if successful, alignment chunks are now in m_buffer) + ReadBinIntoBuffer(binId, numAlignmentChunks); + + // see if bin is a 'candidate bin' + candidateBinIter = candidateBins.find(binId); + + // if not, move on to next bin + if (candidateBinIter == candidateBins.end()) continue; + + // otherwise, check bin's contents against for overlap + else { + + std::size_t offset = 0; + uint64_t chunkStart; + uint64_t chunkStop; + + // iterate over alignment chunks + for (int j = 0; j < numAlignmentChunks; ++j) { + + // read chunk start & stop from buffer + memcpy((char*)&chunkStart, m_resources.Buffer + offset, sizeof(uint64_t)); + offset += sizeof(uint64_t); + memcpy((char*)&chunkStop, m_resources.Buffer + offset, sizeof(uint64_t)); + offset += sizeof(uint64_t); + + // swap endian-ness if necessary + if (m_isBigEndian) { + SwapEndian_64(chunkStart); + SwapEndian_64(chunkStop); + } + + // store alignment chunk's start offset + // if 
its stop offset is larger than our 'minOffset' + if (chunkStop >= minOffset) offsets.push_back(chunkStart); + } + + // 'pop' bin ID from candidate bins set + candidateBins.erase(candidateBinIter); + + // quit if no more candidates + if (candidateBins.empty()) break; + } + } +} + +uint64_t BamStandardIndex::CalculateMinOffset(const BaiReferenceSummary& refSummary, + const uint32_t& begin) +{ + // if no linear offsets exist, return 0 + if (refSummary.NumLinearOffsets == 0) return 0; + + // if 'begin' starts beyond last linear offset, use the last linear offset as minimum + // else use the offset corresponding to the requested start position + const int shiftedBegin = begin >> BamStandardIndex::BAM_LIDX_SHIFT; + if (shiftedBegin >= refSummary.NumLinearOffsets) + return LookupLinearOffset(refSummary, refSummary.NumLinearOffsets - 1); + else + return LookupLinearOffset(refSummary, shiftedBegin); +} + +void BamStandardIndex::CheckBufferSize(char*& buffer, unsigned int& bufferLength, + const unsigned int& requestedBytes) +{ + try { + if (requestedBytes > bufferLength) { + bufferLength = requestedBytes + 10; + delete[] buffer; + buffer = new char[bufferLength]; + } + } catch (std::bad_alloc&) { + std::stringstream s; + s << "out of memory when allocating " << requestedBytes << " bytes"; + throw BamException("BamStandardIndex::CheckBufferSize", s.str()); + } +} + +void BamStandardIndex::CheckBufferSize(unsigned char*& buffer, unsigned int& bufferLength, + const unsigned int& requestedBytes) +{ + try { + if (requestedBytes > bufferLength) { + bufferLength = requestedBytes + 10; + delete[] buffer; + buffer = new unsigned char[bufferLength]; + } + } catch (std::bad_alloc&) { + std::stringstream s; + s << "out of memory when allocating " << requestedBytes << " bytes"; + throw BamException("BamStandardIndex::CheckBufferSize", s.str()); + } +} + +void BamStandardIndex::CheckMagicNumber() +{ + + // check 'magic number' to see if file is BAI index + char magic[4]; + const int64_t 
numBytesRead = m_resources.Device->Read(magic, sizeof(magic)); + if (numBytesRead != 4) + throw BamException("BamStandardIndex::CheckMagicNumber", "could not read BAI magic number"); + + // compare to expected value + if (strncmp(magic, BamStandardIndex::BAI_MAGIC, 4) != 0) + throw BamException("BamStandardIndex::CheckMagicNumber", "invalid BAI magic number"); +} + +void BamStandardIndex::ClearReferenceEntry(BaiReferenceEntry& refEntry) +{ + refEntry.ID = -1; + refEntry.Bins.clear(); + refEntry.LinearOffsets.clear(); +} + +void BamStandardIndex::CloseFile() +{ + + // close file stream + if (IsDeviceOpen()) { + m_resources.Device->Close(); + delete m_resources.Device; + m_resources.Device = 0; + } + + // clear index file summary data + m_indexFileSummary.clear(); + + // clean up I/O buffer + delete[] m_resources.Buffer; + m_resources.Buffer = 0; + m_bufferLength = 0; +} + +// builds index from associated BAM file & writes out to index file +bool BamStandardIndex::Create() +{ + + // skip if BamReader is invalid or not open + if (m_reader == 0 || !m_reader->IsOpen()) { + SetErrorString("BamStandardIndex::Create", "could not create index: reader is not open"); + return false; + } + + // rewind BamReader + if (!m_reader->Rewind()) { + const std::string readerError = m_reader->GetErrorString(); + const std::string message = "could not create index: \n\t" + readerError; + SetErrorString("BamStandardIndex::Create", message); + return false; + } + + try { + + // open new index file (read & write) + std::string indexFilename = m_reader->Filename() + Extension(); + OpenFile(indexFilename, IBamIODevice::ReadWrite); + + // initialize BaiFileSummary with number of references + const int& numReferences = m_reader->GetReferenceCount(); + ReserveForSummary(numReferences); + + // initialize output file + WriteHeader(); + + // set up bin, ID, offset, & coordinate markers + const uint32_t defaultValue = 0xffffffffu; + uint32_t currentBin = defaultValue; + uint32_t lastBin = 
defaultValue; + int32_t currentRefID = defaultValue; + int32_t lastRefID = defaultValue; + uint64_t currentOffset = (uint64_t)m_reader->Tell(); + uint64_t lastOffset = currentOffset; + int32_t lastPosition = defaultValue; + + // iterate through alignments in BAM file + BamAlignment al; + BaiReferenceEntry refEntry; + while (m_reader->LoadNextAlignment(al)) { + + // changed to new reference + if (lastRefID != al.RefID) { + + // if not first reference, save previous reference data + if (lastRefID != (int32_t)defaultValue) { + + SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); + WriteReferenceEntry(refEntry); + ClearReferenceEntry(refEntry); + + // write any empty references between (but *NOT* including) lastRefID & al.RefID + for (int i = lastRefID + 1; i < al.RefID; ++i) { + BaiReferenceEntry emptyEntry(i); + WriteReferenceEntry(emptyEntry); + } + + // update bin markers + currentOffset = lastOffset; + currentBin = al.Bin; + lastBin = al.Bin; + currentRefID = al.RefID; + } + + // otherwise, this is first pass + // be sure to write any empty references up to (but *NOT* including) current RefID + else { + for (int i = 0; i < al.RefID; ++i) { + BaiReferenceEntry emptyEntry(i); + WriteReferenceEntry(emptyEntry); + } + } + + // update reference markers + refEntry.ID = al.RefID; + lastRefID = al.RefID; + lastBin = defaultValue; + } + + // if lastPosition greater than current alignment position - file not sorted properly + else if (lastPosition > al.Position) { + std::stringstream s; + s << "BAM file is not properly sorted by coordinate" << std::endl + << "Current alignment position: " << al.Position + << " < previous alignment position: " << lastPosition + << " on reference ID: " << al.RefID << std::endl; + SetErrorString("BamStandardIndex::Create", s.str()); + return false; + } + + // if alignment's ref ID is valid & its bin is not a 'leaf' + if ((al.RefID >= 0) && (al.Bin < 4681)) + SaveLinearOffsetEntry(refEntry.LinearOffsets, al.Position, 
al.GetEndPosition(), + lastOffset); + + // changed to new BAI bin + if (al.Bin != lastBin) { + + // if not first bin on reference, save previous bin data + if (currentBin != defaultValue) + SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); + + // update markers + currentOffset = lastOffset; + currentBin = al.Bin; + lastBin = al.Bin; + currentRefID = al.RefID; + + // if invalid RefID, break out + if (currentRefID < 0) break; + } + + // make sure that current file pointer is beyond lastOffset + if (m_reader->Tell() <= (int64_t)lastOffset) { + SetErrorString("BamStandardIndex::Create", "calculating offsets failed"); + return false; + } + + // update lastOffset & lastPosition + lastOffset = m_reader->Tell(); + lastPosition = al.Position; + } + + // after finishing alignments, if any data was read, check: + if (lastOffset != currentOffset) { + + // store last alignment chunk to its bin, then write last reference entry with data + SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); + WriteReferenceEntry(refEntry); + } + + // then write any empty references remaining at end of file + for (int i = currentRefID + 1; i < numReferences; ++i) { + BaiReferenceEntry emptyEntry(i); + WriteReferenceEntry(emptyEntry); + } + + } catch (BamException& e) { + m_errorString = e.what(); + return false; + } + + // rewind BamReader + if (!m_reader->Rewind()) { + const std::string readerError = m_reader->GetErrorString(); + const std::string message = "could not create index: \n\t" + readerError; + SetErrorString("BamStandardIndex::Create", message); + return false; + } + + // return success + return true; +} + +// returns format's file extension +const std::string BamStandardIndex::Extension() +{ + return BamStandardIndex::BAI_EXTENSION; +} + +void BamStandardIndex::GetOffset(const BamRegion& region, int64_t& offset, + bool* hasAlignmentsInRegion) +{ + + // cannot calculate offsets if unknown/invalid reference ID requested + if 
(region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size()) + throw BamException("BamStandardIndex::GetOffset", "invalid reference ID requested"); + + // retrieve index summary for left bound reference + const BaiReferenceSummary& refSummary = m_indexFileSummary.at(region.LeftRefID); + + // set up region boundaries based on actual BamReader data + uint32_t begin; + uint32_t end; + AdjustRegion(region, begin, end); + + // retrieve all candidate bin IDs for region + std::set<uint16_t> candidateBins; + CalculateCandidateBins(begin, end, candidateBins); + + // use reference's linear offsets to calculate the minimum offset + // that must be considered to find overlap + const uint64_t& minOffset = CalculateMinOffset(refSummary, begin); + + // attempt to use reference summary, minOffset, & candidateBins to calculate offsets + // no data should not be error, just bail + std::vector<int64_t> offsets; + CalculateCandidateOffsets(refSummary, minOffset, candidateBins, offsets); + if (offsets.empty()) return; + + // ensure that offsets are sorted before processing + sort(offsets.begin(), offsets.end()); + + // binary search for an overlapping block (may not be first one though) + BamAlignment al; + typedef std::vector<int64_t>::const_iterator OffsetConstIterator; + OffsetConstIterator offsetFirst = offsets.begin(); + OffsetConstIterator offsetIter = offsetFirst; + OffsetConstIterator offsetLast = offsets.end(); + std::iterator_traits<OffsetConstIterator>::difference_type count = + distance(offsetFirst, offsetLast); + std::iterator_traits<OffsetConstIterator>::difference_type step; + while (count > 0) { + offsetIter = offsetFirst; + step = count / 2; + advance(offsetIter, step); + + // attempt seek to candidate offset + const int64_t& candidateOffset = (*offsetIter); + if (!m_reader->Seek(candidateOffset)) { + const std::string readerError = m_reader->GetErrorString(); + const std::string message = "could not seek in BAM file: \n\t" + readerError; + throw 
BamException("BamToolsIndex::GetOffset", message); + } + + // load first available alignment, setting flag to true if data exists + *hasAlignmentsInRegion = m_reader->LoadNextAlignment(al); + + // check alignment against region + if (al.GetEndPosition() <= region.LeftPosition) { + offsetFirst = ++offsetIter; + count -= step + 1; + } else + count = step; + } + + // step back to the offset before the 'current offset' (to make sure we cover overlaps) + if (offsetIter != offsets.begin()) --offsetIter; + offset = (*offsetIter); +} + +// returns whether reference has alignments or no +bool BamStandardIndex::HasAlignments(const int& referenceID) const +{ + if (referenceID < 0 || referenceID >= (int)m_indexFileSummary.size()) return false; + const BaiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID); + return (refSummary.NumBins > 0); +} + +bool BamStandardIndex::IsDeviceOpen() const +{ + if (m_resources.Device == 0) return false; + return m_resources.Device->IsOpen(); +} + +// attempts to use index data to jump to @region, returns success/fail +// a "successful" jump indicates no error, but not whether this region has data +// * thus, the method sets a flag to indicate whether there are alignments +// available after the jump position +bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) +{ + + // clear out flag + *hasAlignmentsInRegion = false; + + // skip if invalid reader or not open + if (m_reader == 0 || !m_reader->IsOpen()) { + SetErrorString("BamStandardIndex::Jump", "could not jump: reader is not open"); + return false; + } + + // calculate nearest offset to jump to + int64_t offset; + try { + GetOffset(region, offset, hasAlignmentsInRegion); + } catch (BamException& e) { + m_errorString = e.what(); + return false; + } + + // if region has alignments, return success/fail of seeking there + if (*hasAlignmentsInRegion) return m_reader->Seek(offset); + + // otherwise, simply return true (but hasAlignmentsInRegion flag has 
been set to false) + // (this is OK, BamReader will check this flag before trying to load data) + return true; +} + +// loads existing data from file into memory +bool BamStandardIndex::Load(const std::string& filename) +{ + + try { + + // attempt to open file (read-only) + OpenFile(filename, IBamIODevice::ReadOnly); + + // validate format + CheckMagicNumber(); + + // load in-memory summary of index data + SummarizeIndexFile(); + + // return success + return true; + + } catch (BamException& e) { + m_errorString = e.what(); + return false; + } +} + +uint64_t BamStandardIndex::LookupLinearOffset(const BaiReferenceSummary& refSummary, + const int& index) +{ + + // attempt seek to proper index file position + const int64_t linearOffsetFilePosition = (int64_t)refSummary.FirstLinearOffsetFilePosition + + index * BamStandardIndex::SIZEOF_LINEAROFFSET; + Seek(linearOffsetFilePosition, SEEK_SET); + + // read linear offset from BAI file + uint64_t linearOffset; + ReadLinearOffset(linearOffset); + return linearOffset; +} + +void BamStandardIndex::MergeAlignmentChunks(BaiAlignmentChunkVector& chunks) +{ + + // skip if chunks are empty, nothing to merge + if (chunks.empty()) return; + + // set up merged alignment chunk container + BaiAlignmentChunkVector mergedChunks; + mergedChunks.push_back(chunks[0]); + + // iterate over chunks + int i = 0; + BaiAlignmentChunkVector::iterator chunkIter = chunks.begin(); + BaiAlignmentChunkVector::iterator chunkEnd = chunks.end(); + for (++chunkIter; chunkIter != chunkEnd; ++chunkIter) { + + // get 'currentMergeChunk' based on numeric index + BaiAlignmentChunk& currentMergeChunk = mergedChunks[i]; + + // get sourceChunk based on source vector iterator + BaiAlignmentChunk& sourceChunk = (*chunkIter); + + // if currentMergeChunk ends where sourceChunk starts, then merge the two + if (currentMergeChunk.Stop >> 16 == sourceChunk.Start >> 16) + currentMergeChunk.Stop = sourceChunk.Stop; + + // otherwise + else { + // append sourceChunk after 
currentMergeChunk + mergedChunks.push_back(sourceChunk); + + // update i, so the next iteration will consider the + // recently-appended sourceChunk as new mergeChunk candidate + ++i; + } + } + + // saved newly-merged chunks into (parameter) chunks + chunks = mergedChunks; +} + +void BamStandardIndex::OpenFile(const std::string& filename, IBamIODevice::OpenMode mode) +{ + + // make sure any previous index file is closed + CloseFile(); + + m_resources.Device = BamDeviceFactory::CreateDevice(filename); + if (m_resources.Device == 0) { + const std::string message = std::string("could not open file: ") + filename; + throw BamException("BamStandardIndex::OpenFile", message); + } + + // attempt to open file + m_resources.Device->Open(mode); + if (!IsDeviceOpen()) { + const std::string message = std::string("could not open file: ") + filename; + throw BamException("BamStandardIndex::OpenFile", message); + } +} + +void BamStandardIndex::ReadBinID(uint32_t& binId) +{ + const int64_t numBytesRead = m_resources.Device->Read((char*)&binId, sizeof(binId)); + if (m_isBigEndian) SwapEndian_32(binId); + if (numBytesRead != sizeof(binId)) + throw BamException("BamStandardIndex::ReadBinID", "could not read BAI bin ID"); +} + +void BamStandardIndex::ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks) +{ + + // read bin header + ReadBinID(binId); + ReadNumAlignmentChunks(numAlignmentChunks); + + // read bin contents + const unsigned int bytesRequested = + numAlignmentChunks * BamStandardIndex::SIZEOF_ALIGNMENTCHUNK; + ReadIntoBuffer(bytesRequested); +} + +void BamStandardIndex::ReadIntoBuffer(const unsigned int& bytesRequested) +{ + + // ensure that our buffer is big enough for request + BamStandardIndex::CheckBufferSize(m_resources.Buffer, m_bufferLength, bytesRequested); + + // read from BAI file stream + const int64_t bytesRead = m_resources.Device->Read(m_resources.Buffer, bytesRequested); + if (bytesRead != static_cast<int64_t>(bytesRequested)) { + std::stringstream 
s; + s << "expected to read: " << bytesRequested << " bytes, " + << "but instead read: " << bytesRead; + throw BamException("BamStandardIndex::ReadIntoBuffer", s.str()); + } +} + +void BamStandardIndex::ReadLinearOffset(uint64_t& linearOffset) +{ + const int64_t numBytesRead = + m_resources.Device->Read((char*)&linearOffset, sizeof(linearOffset)); + if (m_isBigEndian) SwapEndian_64(linearOffset); + if (numBytesRead != sizeof(linearOffset)) + throw BamException("BamStandardIndex::ReadLinearOffset", + "could not read BAI linear offset"); +} + +void BamStandardIndex::ReadNumAlignmentChunks(int& numAlignmentChunks) +{ + const int64_t numBytesRead = + m_resources.Device->Read((char*)&numAlignmentChunks, sizeof(numAlignmentChunks)); + if (m_isBigEndian) SwapEndian_32(numAlignmentChunks); + if (numBytesRead != sizeof(numAlignmentChunks)) + throw BamException("BamStandardIndex::ReadNumAlignmentChunks", + "could not read BAI chunk count"); +} + +void BamStandardIndex::ReadNumBins(int& numBins) +{ + const int64_t numBytesRead = m_resources.Device->Read((char*)&numBins, sizeof(numBins)); + if (m_isBigEndian) SwapEndian_32(numBins); + if (numBytesRead != sizeof(numBins)) + throw BamException("BamStandardIndex::ReadNumBins", "could not read BAI bin count"); +} + +// reads the number of linear offsets stored for a reference +// throws BamException (tagged with this method's own name) on a short read +void BamStandardIndex::ReadNumLinearOffsets(int& numLinearOffsets) +{ + const int64_t numBytesRead = + m_resources.Device->Read((char*)&numLinearOffsets, sizeof(numLinearOffsets)); + if (m_isBigEndian) SwapEndian_32(numLinearOffsets); + if (numBytesRead != sizeof(numLinearOffsets)) + throw BamException("BamStandardIndex::ReadNumLinearOffsets", + "could not read BAI linear offset count"); +} + +void BamStandardIndex::ReadNumReferences(int& numReferences) +{ + const int64_t numBytesRead = + m_resources.Device->Read((char*)&numReferences, sizeof(numReferences)); + if (m_isBigEndian) SwapEndian_32(numReferences); + if (numBytesRead != sizeof(numReferences)) + throw BamException("BamStandardIndex::ReadNumReferences", 
"could not read reference count"); +} + +// (re)initializes the file summary with @numReferences default-constructed entries +// throws BamException if @numReferences is negative +void BamStandardIndex::ReserveForSummary(const int& numReferences) +{ + // a negative count would be converted to a huge size by assign() + if (numReferences < 0) + throw BamException("BamStandardIndex::ReserveForSummary", "invalid reference count"); + + m_indexFileSummary.clear(); + m_indexFileSummary.assign(numReferences, BaiReferenceSummary()); +} + +void BamStandardIndex::SaveAlignmentChunkToBin(BaiBinMap& binMap, const uint32_t& currentBin, + const uint64_t& currentOffset, + const uint64_t& lastOffset) +{ + // create new alignment chunk + BaiAlignmentChunk newChunk(currentOffset, lastOffset); + + // if no entry exists yet for this bin, create one and store alignment chunk + BaiBinMap::iterator binIter = binMap.find(currentBin); + if (binIter == binMap.end()) { + BaiAlignmentChunkVector newChunks; + newChunks.push_back(newChunk); + binMap.insert(std::pair<uint32_t, BaiAlignmentChunkVector>(currentBin, newChunks)); + } + + // otherwise, just append alignment chunk + else { + BaiAlignmentChunkVector& binChunks = (*binIter).second; + binChunks.push_back(newChunk); + } +} + +void BamStandardIndex::SaveBinsSummary(const int& refId, const int& numBins) +{ + BaiReferenceSummary& refSummary = m_indexFileSummary.at(refId); + refSummary.NumBins = numBins; + refSummary.FirstBinFilePosition = Tell(); +} + +void BamStandardIndex::SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets, + const int& alignmentStartPosition, + const int& alignmentStopPosition, + const uint64_t& lastOffset) +{ + // get converted offsets + const int beginOffset = alignmentStartPosition >> BamStandardIndex::BAM_LIDX_SHIFT; + const int endOffset = (alignmentStopPosition - 1) >> BamStandardIndex::BAM_LIDX_SHIFT; + + // resize vector if necessary + int oldSize = offsets.size(); + int newSize = endOffset + 1; + if (oldSize < newSize) offsets.resize(newSize, 0); + + // store offset + for (int i = beginOffset + 1; i <= endOffset; ++i) { + if (offsets[i] == 0) offsets[i] = lastOffset; + } +} + +void BamStandardIndex::SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets) +{ + BaiReferenceSummary& refSummary = 
m_indexFileSummary.at(refId); + refSummary.NumLinearOffsets = numLinearOffsets; + refSummary.FirstLinearOffsetFilePosition = Tell(); +} + +// seek to position in index file stream +void BamStandardIndex::Seek(const int64_t& position, const int origin) +{ + if (!m_resources.Device->Seek(position, origin)) + throw BamException("BamStandardIndex::Seek", "could not seek in BAI file"); +} + +void BamStandardIndex::SkipBins(const int& numBins) +{ + uint32_t binId; + int32_t numAlignmentChunks; + for (int i = 0; i < numBins; ++i) + ReadBinIntoBuffer(binId, numAlignmentChunks); // results & buffer ignored +} + +// skips over @numLinearOffsets linear offsets by reading them into the (ignored) buffer +// throws BamException if @numLinearOffsets is negative +void BamStandardIndex::SkipLinearOffsets(const int& numLinearOffsets) +{ + // a negative count would wrap to a huge unsigned byte request below + if (numLinearOffsets < 0) + throw BamException("BamStandardIndex::SkipLinearOffsets", + "invalid BAI linear offset count"); + + const unsigned int bytesRequested = numLinearOffsets * BamStandardIndex::SIZEOF_LINEAROFFSET; + ReadIntoBuffer(bytesRequested); +} + +void BamStandardIndex::SortLinearOffsets(BaiLinearOffsetVector& linearOffsets) +{ + sort(linearOffsets.begin(), linearOffsets.end()); +} + +void BamStandardIndex::SummarizeBins(BaiReferenceSummary& refSummary) +{ + + // load number of bins + int numBins; + ReadNumBins(numBins); + + // store bins summary for this reference + refSummary.NumBins = numBins; + refSummary.FirstBinFilePosition = Tell(); + + // skip this reference's bins + SkipBins(numBins); +} + +// reads the reference count, sizes the file summary to match, then summarizes each reference +void BamStandardIndex::SummarizeIndexFile() +{ + + // load number of reference sequences + int numReferences; + ReadNumReferences(numReferences); + + // initialize file summary data + ReserveForSummary(numReferences); + + // iterate over reference entries + BaiFileSummary::iterator summaryIter = m_indexFileSummary.begin(); + BaiFileSummary::iterator summaryEnd = m_indexFileSummary.end(); + for (; summaryIter != summaryEnd; ++summaryIter) + SummarizeReference(*summaryIter); +} + +void BamStandardIndex::SummarizeLinearOffsets(BaiReferenceSummary& refSummary) +{ + + // load number of linear offsets + int numLinearOffsets; + ReadNumLinearOffsets(numLinearOffsets); + + // store linear offset summary data for 
this reference + refSummary.NumLinearOffsets = numLinearOffsets; + refSummary.FirstLinearOffsetFilePosition = Tell(); + + // skip linear offsets in index file + SkipLinearOffsets(numLinearOffsets); +} + +// summarizes one reference's index data: its bins first, then its linear offsets +void BamStandardIndex::SummarizeReference(BaiReferenceSummary& refSummary) +{ + SummarizeBins(refSummary); + SummarizeLinearOffsets(refSummary); +} + +// return position of file pointer in index file stream +int64_t BamStandardIndex::Tell() const +{ + return m_resources.Device->Tell(); +} + +// writes a chunk's start & stop offsets (byte-swapped first when m_isBigEndian is set) +// throws BamException if fewer bytes than expected were written +void BamStandardIndex::WriteAlignmentChunk(const BaiAlignmentChunk& chunk) +{ + + // localize alignment chunk offsets + uint64_t start = chunk.Start; + uint64_t stop = chunk.Stop; + + // swap endian-ness if necessary + if (m_isBigEndian) { + SwapEndian_64(start); + SwapEndian_64(stop); + } + + // write to index file + int64_t numBytesWritten = 0; + numBytesWritten += m_resources.Device->Write((const char*)&start, sizeof(start)); + numBytesWritten += m_resources.Device->Write((const char*)&stop, sizeof(stop)); + if (numBytesWritten != (sizeof(start) + sizeof(stop))) + throw BamException("BamStandardIndex::WriteAlignmentChunk", + "could not write BAI alignment chunk"); +} + +// merges @chunks (modifying the caller's vector), then writes the chunk count +// followed by each merged chunk +void BamStandardIndex::WriteAlignmentChunks(BaiAlignmentChunkVector& chunks) +{ + + // make sure chunks are merged (simplified) before writing & saving summary + MergeAlignmentChunks(chunks); + + // write chunk count + int32_t chunkCount = chunks.size(); + if (m_isBigEndian) SwapEndian_32(chunkCount); + const int64_t numBytesWritten = + m_resources.Device->Write((const char*)&chunkCount, sizeof(chunkCount)); + if (numBytesWritten != sizeof(chunkCount)) + throw BamException("BamStandardIndex::WriteAlignmentChunks", + "could not write BAI chunk count"); + + // iterate over chunks + BaiAlignmentChunkVector::const_iterator chunkIter = chunks.begin(); + BaiAlignmentChunkVector::const_iterator chunkEnd = chunks.end(); + for (; chunkIter != chunkEnd; ++chunkIter) + WriteAlignmentChunk((*chunkIter)); +} + +void 
BamStandardIndex::WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks) +{ + + // write BAM bin ID + uint32_t binKey = binId; + if (m_isBigEndian) SwapEndian_32(binKey); + const int64_t numBytesWritten = m_resources.Device->Write((const char*)&binKey, sizeof(binKey)); + if (numBytesWritten != sizeof(binKey)) + throw BamException("BamStandardIndex::WriteBin", "could not write bin ID"); + + // write bin's alignment chunks + WriteAlignmentChunks(chunks); +} + +// writes the bin count for reference @refId, records the bins summary, then writes every bin +void BamStandardIndex::WriteBins(const int& refId, BaiBinMap& bins) +{ + + // write number of bins + int32_t binCount = bins.size(); + if (m_isBigEndian) SwapEndian_32(binCount); + const int64_t numBytesWritten = + m_resources.Device->Write((const char*)&binCount, sizeof(binCount)); + if (numBytesWritten != sizeof(binCount)) + throw BamException("BamStandardIndex::WriteBins", "could not write bin count"); + + // save summary for reference's bins + // (done right after the count is written: the summary records the current file position) + SaveBinsSummary(refId, bins.size()); + + // iterate over bins + BaiBinMap::iterator binIter = bins.begin(); + BaiBinMap::iterator binEnd = bins.end(); + for (; binIter != binEnd; ++binIter) + WriteBin((*binIter).first, (*binIter).second); +} + +// writes the BAI magic number, then the number of entries in the file summary +void BamStandardIndex::WriteHeader() +{ + + int64_t numBytesWritten = 0; + + // write magic number + numBytesWritten += m_resources.Device->Write(BamStandardIndex::BAI_MAGIC, 4); + + // write number of reference sequences + int32_t numReferences = m_indexFileSummary.size(); + if (m_isBigEndian) SwapEndian_32(numReferences); + numBytesWritten += + m_resources.Device->Write((const char*)&numReferences, sizeof(numReferences)); + + // both writes must have completed in full + if (numBytesWritten != sizeof(numReferences) + 4) + throw BamException("BamStandardIndex::WriteHeader", "could not write BAI header"); +} + +// sorts @linearOffsets (modifying the caller's vector), then writes their count and values +void BamStandardIndex::WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets) +{ + + // make sure linear offsets are sorted before writing & saving summary + SortLinearOffsets(linearOffsets); + + int64_t numBytesWritten = 0; + + // 
write number of linear offsets + int32_t offsetCount = linearOffsets.size(); + if (m_isBigEndian) SwapEndian_32(offsetCount); + numBytesWritten += m_resources.Device->Write((const char*)&offsetCount, sizeof(offsetCount)); + + // save summary for reference's linear offsets + // (done right after the count is written: the summary records the current file position) + SaveLinearOffsetsSummary(refId, linearOffsets.size()); + + // iterate over linear offsets + BaiLinearOffsetVector::const_iterator offsetIter = linearOffsets.begin(); + BaiLinearOffsetVector::const_iterator offsetEnd = linearOffsets.end(); + for (; offsetIter != offsetEnd; ++offsetIter) { + + // write linear offset + uint64_t linearOffset = (*offsetIter); + if (m_isBigEndian) SwapEndian_64(linearOffset); + numBytesWritten += + m_resources.Device->Write((const char*)&linearOffset, sizeof(linearOffset)); + } + + // total must equal the count field plus one 64-bit value per linear offset + if (numBytesWritten != + static_cast<int64_t>(sizeof(offsetCount) + linearOffsets.size() * sizeof(uint64_t))) + throw BamException("BamStandardIndex::WriteLinearOffsets", + "could not write BAI linear offsets"); +} + +// writes one reference's index data: its bins, followed by its linear offsets +void BamStandardIndex::WriteReferenceEntry(BaiReferenceEntry& refEntry) +{ + WriteBins(refEntry.ID, refEntry.Bins); + WriteLinearOffsets(refEntry.ID, refEntry.LinearOffsets); +} diff --git a/src/api/internal/index/BamStandardIndex_p.h b/src/api/internal/index/BamStandardIndex_p.h new file mode 100644 index 0000000..514b638 --- /dev/null +++ b/src/api/internal/index/BamStandardIndex_p.h @@ -0,0 +1,236 @@ +// *************************************************************************** +// BamStandardIndex.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides index operations for the standardized BAM index format (".bai") +// *************************************************************************** + +#ifndef BAM_STANDARD_INDEX_FORMAT_H +#define 
BAM_STANDARD_INDEX_FORMAT_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. + +#include <map> +#include <set> +#include <string> +#include <vector> +#include "api/BamAux.h" +#include "api/BamIndex.h" +#include "api/IBamIODevice.h" + +namespace BamTools { +namespace Internal { + +// ----------------------------------------------------------------------------- +// BamStandardIndex data structures + +// defines start and end of a contiguous run of alignments +struct BaiAlignmentChunk +{ + + // data members + uint64_t Start; + uint64_t Stop; + + // constructor (both offsets default to 0) + BaiAlignmentChunk(const uint64_t& start = 0, const uint64_t& stop = 0) + : Start(start) + , Stop(stop) + {} +}; + +// comparison operator (for sorting): orders chunks by Start only, Stop is not compared +inline bool operator<(const BaiAlignmentChunk& lhs, const BaiAlignmentChunk& rhs) +{ + return lhs.Start < rhs.Start; +} + +// convenience typedef for a list of all alignment 'chunks' in a BAI bin +typedef std::vector<BaiAlignmentChunk> BaiAlignmentChunkVector; + +// convenience typedef for a map of all BAI bins in a reference (ID => chunks) +typedef std::map<uint32_t, BaiAlignmentChunkVector> BaiBinMap; + +// convenience typedef for a list of all 'linear offsets' in a reference +typedef std::vector<uint64_t> BaiLinearOffsetVector; + +// contains all fields necessary for building, loading, & writing +// full BAI index data for a single reference +struct BaiReferenceEntry +{ + + // data members + int32_t ID; + BaiBinMap Bins; + BaiLinearOffsetVector LinearOffsets; + + // ctor (ID defaults to -1; Bins & LinearOffsets start empty) + BaiReferenceEntry(const int32_t& id = -1) + : ID(id) + {} +}; + +// provides (persistent) summary of BaiReferenceEntry's index data: +// the bin & linear offset counts, plus the file positions recorded for each +struct BaiReferenceSummary +{ + + // data members + int NumBins; + int NumLinearOffsets; + uint64_t FirstBinFilePosition; + uint64_t 
FirstLinearOffsetFilePosition; + + // ctor (zero-initializes all fields) + BaiReferenceSummary() + : NumBins(0) + , NumLinearOffsets(0) + , FirstBinFilePosition(0) + , FirstLinearOffsetFilePosition(0) + {} +}; + +// convenience typedef for describing a full BAI index file summary +typedef std::vector<BaiReferenceSummary> BaiFileSummary; + +// end BamStandardIndex data structures +// ----------------------------------------------------------------------------- + +// BamIndex implementation for the standardized BAM index format (".bai") +class BamStandardIndex : public BamIndex +{ + + // ctor & dtor +public: + BamStandardIndex(Internal::BamReaderPrivate* reader); + ~BamStandardIndex(); + + // BamIndex implementation +public: + // builds index from associated BAM file & writes out to index file + bool Create(); + // returns whether reference has alignments or not + bool HasAlignments(const int& referenceID) const; + // attempts to use index data to jump to @region, returns success/fail + // a "successful" jump indicates no error, but not whether this region has data + // * thus, the method sets a flag to indicate whether there are alignments + // available after the jump position + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); + // loads existing data from file into memory + bool Load(const std::string& filename); + // returns the index type (always BamIndex::STANDARD) + BamIndex::IndexType Type() const + { + return BamIndex::STANDARD; + } + +public: + // returns format's file extension + static const std::string Extension(); + + // internal methods +private: + // index file ops + void CheckMagicNumber(); + void CloseFile(); + bool IsDeviceOpen() const; + void OpenFile(const std::string& filename, IBamIODevice::OpenMode mode); + void Seek(const int64_t& position, const int origin); + int64_t Tell() const; + + // BAI index building methods + void ClearReferenceEntry(BaiReferenceEntry& refEntry); + void SaveAlignmentChunkToBin(BaiBinMap& binMap, const uint32_t& currentBin, + const uint64_t& currentOffset, const uint64_t& lastOffset); + void SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets, const 
int& alignmentStartPosition, + const int& alignmentStopPosition, const uint64_t& lastOffset); + + // random-access methods + void AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end); + void CalculateCandidateBins(const uint32_t& begin, const uint32_t& end, + std::set<uint16_t>& candidateBins); + void CalculateCandidateOffsets(const BaiReferenceSummary& refSummary, const uint64_t& minOffset, + std::set<uint16_t>& candidateBins, + std::vector<int64_t>& offsets); + uint64_t CalculateMinOffset(const BaiReferenceSummary& refSummary, const uint32_t& begin); + void GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); + uint64_t LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index); + + // BAI summary (create/load) methods + void ReserveForSummary(const int& numReferences); + void SaveBinsSummary(const int& refId, const int& numBins); + void SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets); + void SkipBins(const int& numBins); + void SkipLinearOffsets(const int& numLinearOffsets); + void SummarizeBins(BaiReferenceSummary& refSummary); + void SummarizeIndexFile(); + void SummarizeLinearOffsets(BaiReferenceSummary& refSummary); + void SummarizeReference(BaiReferenceSummary& refSummary); + + // BAI full index input methods (each throws BamException on a short read) + // NOTE(review): ReadBinIntoBuffer hands its int32_t& count to ReadNumAlignmentChunks(int&), + // which only works where int is int32_t - confirm for all supported platforms + void ReadBinID(uint32_t& binId); + void ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks); + void ReadIntoBuffer(const unsigned int& bytesRequested); + void ReadLinearOffset(uint64_t& linearOffset); + void ReadNumAlignmentChunks(int& numAlignmentChunks); + void ReadNumBins(int& numBins); + void ReadNumLinearOffsets(int& numLinearOffsets); + void ReadNumReferences(int& numReferences); + + // BAI full index output methods + // (MergeAlignmentChunks & SortLinearOffsets modify the container passed in) + void MergeAlignmentChunks(BaiAlignmentChunkVector& chunks); + void SortLinearOffsets(BaiLinearOffsetVector& linearOffsets); + void WriteAlignmentChunk(const BaiAlignmentChunk& chunk); + void 
WriteAlignmentChunks(BaiAlignmentChunkVector& chunks); + void WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks); + void WriteBins(const int& refId, BaiBinMap& bins); + void WriteHeader(); + void WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets); + void WriteReferenceEntry(BaiReferenceEntry& refEntry); + + // data members +private: + bool m_isBigEndian; + BaiFileSummary m_indexFileSummary; + + // our input buffer + unsigned int m_bufferLength; + // holder for the index file device & the input buffer + // NOTE(review): declares a destructor but no copy constructor/assignment operator, so a + // copy would share the same raw pointers - confirm this type is never copied + struct RaiiWrapper + { + IBamIODevice* Device; + char* Buffer; + RaiiWrapper(); + ~RaiiWrapper(); + }; + RaiiWrapper m_resources; + + // static methods +private: + // checks if the buffer is large enough to accommodate the requested size + static void CheckBufferSize(char*& buffer, unsigned int& bufferLength, + const unsigned int& requestedBytes); + // checks if the buffer is large enough to accommodate the requested size + static void CheckBufferSize(unsigned char*& buffer, unsigned int& bufferLength, + const unsigned int& requestedBytes); + // static constants +private: + static const int MAX_BIN; + static const int BAM_LIDX_SHIFT; + static const std::string BAI_EXTENSION; + static const char* const BAI_MAGIC; + static const int SIZEOF_ALIGNMENTCHUNK; + static const int SIZEOF_BINCORE; + static const int SIZEOF_LINEAROFFSET; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAM_STANDARD_INDEX_FORMAT_H diff --git a/src/api/internal/index/BamToolsIndex_p.cpp b/src/api/internal/index/BamToolsIndex_p.cpp new file mode 100644 index 0000000..01a2a82 --- /dev/null +++ b/src/api/internal/index/BamToolsIndex_p.cpp @@ -0,0 +1,677 @@ +// *************************************************************************** +// BamToolsIndex.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// 
--------------------------------------------------------------------------- +// Provides index operations for the BamTools index format (".bti") +// *************************************************************************** + +#include "api/internal/index/BamToolsIndex_p.h" +#include "api/BamAlignment.h" +#include "api/internal/bam/BamReader_p.h" +#include "api/internal/io/BamDeviceFactory_p.h" +#include "api/internal/io/BgzfStream_p.h" +#include "api/internal/utils/BamException_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <algorithm> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <iostream> +#include <iterator> +#include <map> + +// -------------------------------- +// static BamToolsIndex constants +// -------------------------------- + +const uint32_t BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000; +const std::string BamToolsIndex::BTI_EXTENSION = ".bti"; +const char* const BamToolsIndex::BTI_MAGIC = "BTI\1"; +const int BamToolsIndex::SIZEOF_BLOCK = sizeof(int32_t) * 2 + sizeof(int64_t); + +// ---------------------------- +// RaiiWrapper implementation +// ---------------------------- + +// starts out holding no device +BamToolsIndex::RaiiWrapper::RaiiWrapper() + : Device(0) +{} + +// closes & deletes the device, if one is held +BamToolsIndex::RaiiWrapper::~RaiiWrapper() +{ + if (Device) { + Device->Close(); + delete Device; + Device = 0; + } +} + +// ------------------------------ +// BamToolsIndex implementation +// ------------------------------ + +// ctor: starts with the default block length & the latest output format version, +// and records the endianness reported by SystemIsBigEndian() +BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader) + : BamIndex(reader) + , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH) + , m_inputVersion(0) + , m_outputVersion(BTI_2_0) // latest version - used for writing new index files +{ + m_isBigEndian = BamTools::SystemIsBigEndian(); +} + +// dtor: closes the index file +BamToolsIndex::~BamToolsIndex() +{ + CloseFile(); +} + +// reads 4 bytes from the index file & compares them against BTI_MAGIC +// throws BamException on a short read or a mismatch +void BamToolsIndex::CheckMagicNumber() +{ + + // read magic number + char magic[4]; + const int64_t numBytesRead = m_resources.Device->Read(magic, 4); + if (numBytesRead 
!= 4) + throw BamException("BamToolsIndex::CheckMagicNumber", "could not read BTI magic number"); + + // validate expected magic number + if (strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) != 0) + throw BamException("BamToolsIndex::CheckMagicNumber", "invalid BTI magic number"); +} + +// check index file version: returns normally if OK, +// throws BamException if the version cannot be read, is invalid, or is unsupported +void BamToolsIndex::CheckVersion() +{ + + // read version from file + const int64_t numBytesRead = + m_resources.Device->Read((char*)&m_inputVersion, sizeof(m_inputVersion)); + if (numBytesRead != sizeof(m_inputVersion)) + throw BamException("BamToolsIndex::CheckVersion", "could not read format version"); + if (m_isBigEndian) SwapEndian_32(m_inputVersion); + + // if version is negative, or zero + if (m_inputVersion <= 0) + throw BamException("BamToolsIndex::CheckVersion", "invalid format version"); + + // if version is newer than can be supported by this version of bamtools + else if (m_inputVersion > m_outputVersion) { + const std::string message = + "unsupported format: this index was created by a newer version of BamTools. " + "Update your local version of BamTools to use the index file."; + throw BamException("BamToolsIndex::CheckVersion", message); + } + + // ------------------------------------------------------------------ + // check for deprecated, unsupported versions + // (the format had to be modified to accommodate a particular bug fix) + + // Version 2.0: introduced support for half-open intervals, instead of the old closed intervals + // respondBy: throwing exception - we're not going to try to handle the old BTI files. + else if ((Version)m_inputVersion < BamToolsIndex::BTI_2_0) { + const std::string message = + "unsupported format: this version of the index may not properly handle " + "
Please run 'bamtools index -bti -in yourData.bam' " + "to generate an up-to-date, fixed BTI file."; + throw BamException("BamToolsIndex::CheckVersion", message); + } +} + +void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) +{ + refEntry.ID = -1; + refEntry.Blocks.clear(); +} + +void BamToolsIndex::CloseFile() +{ + if (IsDeviceOpen()) { + m_resources.Device->Close(); + delete m_resources.Device; + m_resources.Device = 0; + } + m_indexFileSummary.clear(); +} + +// builds index from associated BAM file & writes out to index file +bool BamToolsIndex::Create() +{ + + // skip if BamReader is invalid or not open + if (m_reader == 0 || !m_reader->IsOpen()) { + SetErrorString("BamToolsIndex::Create", "could not create index: reader is not open"); + return false; + } + + // rewind BamReader + if (!m_reader->Rewind()) { + const std::string readerError = m_reader->GetErrorString(); + const std::string message = "could not create index: \n\t" + readerError; + SetErrorString("BamToolsIndex::Create", message); + return false; + } + + try { + // open new index file (read & write) + const std::string indexFilename = m_reader->Filename() + Extension(); + OpenFile(indexFilename, IBamIODevice::ReadWrite); + + // initialize BtiFileSummary with number of references + const int& numReferences = m_reader->GetReferenceCount(); + InitializeFileSummary(numReferences); + + // intialize output file header + WriteHeader(); + + // index building markers + uint32_t currentBlockCount = 0; + int64_t currentAlignmentOffset = m_reader->Tell(); + int32_t blockRefId = -1; + int32_t blockMaxEndPosition = -1; + int64_t blockStartOffset = currentAlignmentOffset; + int32_t blockStartPosition = -1; + + // plow through alignments, storing index entries + BamAlignment al; + BtiReferenceEntry refEntry; + while (m_reader->LoadNextAlignment(al)) { + + // if moved to new reference + if (al.RefID != blockRefId) { + + // if first pass, check: + if (currentBlockCount == 0) { + + // write any empty 
references up to (but not including) al.RefID + for (int i = 0; i < al.RefID; ++i) + WriteReferenceEntry(BtiReferenceEntry(i)); + } + + // not first pass: + // NOTE(review): currentBlockCount is also reset to 0 whenever a block fills up (see below), + // so the 'first pass' branch above is also taken when the reference changes right after a + // full block; the pending refEntry is then neither written nor cleared - confirm intended + else { + + // store previous BTI block data in reference entry + const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); + + // write reference entry, then clear + WriteReferenceEntry(refEntry); + ClearReferenceEntry(refEntry); + + // write any empty references between (but not including) + // the last blockRefID and current al.RefID + for (int i = blockRefId + 1; i < al.RefID; ++i) + WriteReferenceEntry(BtiReferenceEntry(i)); + + // reset block count + currentBlockCount = 0; + } + + // set ID for new reference entry + refEntry.ID = al.RefID; + } + + // if beginning of block, update counters + if (currentBlockCount == 0) { + blockRefId = al.RefID; + blockStartOffset = currentAlignmentOffset; + blockStartPosition = al.Position; + blockMaxEndPosition = al.GetEndPosition(); + } + + // increment block counter + ++currentBlockCount; + + // check end position (keep the largest end position seen in this block) + const int32_t alignmentEndPosition = al.GetEndPosition(); + if (alignmentEndPosition > blockMaxEndPosition) + blockMaxEndPosition = alignmentEndPosition; + + // if block is full, get offset for next block, reset currentBlockCount + if (currentBlockCount == m_blockSize) { + + // store previous block data in reference entry + const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); + + // update markers + blockStartOffset = m_reader->Tell(); + currentBlockCount = 0; + } + + // not the best name, but for the next iteration, this value will be the offset of the + // *current* alignment. 
this is necessary because we won't know if this next alignment + // is on a new reference until we actually read it + currentAlignmentOffset = m_reader->Tell(); + } + + // after finishing alignments, if any data was read, check: + // NOTE(review): blockRefId is the RefID of the alignment that started the most recent + // block, so this branch is skipped (and no trailing reference entries are written) when + // no alignment was read or when that alignment had a negative RefID - confirm intended + if (blockRefId >= 0) { + + // store last BTI block data in reference entry + const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); + + // write last reference entry, then clear + WriteReferenceEntry(refEntry); + ClearReferenceEntry(refEntry); + + // then write any empty references remaining at end of file + for (int i = blockRefId + 1; i < numReferences; ++i) + WriteReferenceEntry(BtiReferenceEntry(i)); + } + + } catch (BamException& e) { + m_errorString = e.what(); + return false; + } + + // rewind BamReader + if (!m_reader->Rewind()) { + const std::string readerError = m_reader->GetErrorString(); + const std::string message = "could not create index: \n\t" + readerError; + SetErrorString("BamToolsIndex::Create", message); + return false; + } + + // return success + return true; +} + +// returns format's file extension +const std::string BamToolsIndex::Extension() +{ + return BamToolsIndex::BTI_EXTENSION; +} + +// finds the file offset to jump to for @region +// @offset is only assigned when a block is selected; *hasAlignmentsInRegion reports whether one was +// throws BamException if the region's left reference ID is out of range +void BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) +{ + + // throw if ref ID is not a valid index in file summary data + if (region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size()) + throw BamException("BamToolsIndex::GetOffset", "invalid region requested"); + + // retrieve reference index data for left bound reference + BtiReferenceEntry refEntry(region.LeftRefID); + ReadReferenceEntry(refEntry); + + // binary search for an overlapping block (may not be first one though) + bool found = false; + typedef BtiBlockVector::const_iterator BtiBlockConstIterator; + BtiBlockConstIterator blockFirst = refEntry.Blocks.begin(); + BtiBlockConstIterator blockIter = blockFirst; + BtiBlockConstIterator 
blockLast = refEntry.Blocks.end(); + std::iterator_traits<BtiBlockConstIterator>::difference_type count = + std::distance(blockFirst, blockLast); + std::iterator_traits<BtiBlockConstIterator>::difference_type step; + while (count > 0) { + blockIter = blockFirst; + step = count / 2; + advance(blockIter, step); + + const BtiBlock& block = (*blockIter); + if (block.StartPosition <= region.RightPosition) { + if (block.MaxEndPosition > region.LeftPosition) { + offset = block.StartOffset; + break; + } + blockFirst = ++blockIter; + count -= step + 1; + } else + count = step; + } + + // if we didn't search "off the end" of the blocks + if (blockIter != blockLast) { + + // "walk back" until we've gone too far + // (blockFirst is the search's lower bound here, it may have been advanced above) + while (blockIter != blockFirst) { + const BtiBlock& currentBlock = (*blockIter); + + --blockIter; + const BtiBlock& previousBlock = (*blockIter); + if (previousBlock.MaxEndPosition <= region.LeftPosition) { + offset = currentBlock.StartOffset; + found = true; + break; + } + } + + // if we walked all the way to first block, just return that and let the reader's + // region overlap parsing do the rest + if (blockIter == blockFirst) { + const BtiBlock& block = (*blockIter); + offset = block.StartOffset; + found = true; + } + } + + // sets to false if blocks container is empty, or if no matching block could be found + *hasAlignmentsInRegion = found; +} + +// returns whether reference has alignments or not (false for an out-of-range referenceID) +bool BamToolsIndex::HasAlignments(const int& referenceID) const +{ + if (referenceID < 0 || referenceID >= (int)m_indexFileSummary.size()) return false; + const BtiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID); + return (refSummary.NumBlocks > 0); +} + +// resets the file summary to @numReferences default-constructed reference summaries +void BamToolsIndex::InitializeFileSummary(const int& numReferences) +{ + m_indexFileSummary.clear(); + for (int i = 0; i < numReferences; ++i) + m_indexFileSummary.push_back(BtiReferenceSummary()); +} + +// returns true if the index stream is 
open +bool BamToolsIndex::IsDeviceOpen() const +{ + if (m_resources.Device == 0) return false; + return m_resources.Device->IsOpen(); +} + +// attempts to use index data to jump to @region, returns success/fail +// a "successful" jump indicates no error, but not whether this region has data +// * thus, the method sets a flag to indicate whether there are alignments +// available after the jump position +bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) +{ + + // clear flag + *hasAlignmentsInRegion = false; + + // skip if invalid reader or not open + if (m_reader == 0 || !m_reader->IsOpen()) { + SetErrorString("BamToolsIndex::Jump", "could not jump: reader is not open"); + return false; + } + + // make sure left-bound position is valid + const RefVector& references = m_reader->GetReferenceData(); + if (region.LeftPosition > references.at(region.LeftRefID).RefLength) { + SetErrorString("BamToolsIndex::Jump", "could not create index: invalid region requested"); + return false; + } + + // calculate nearest offset to jump to + int64_t offset; + try { + GetOffset(region, offset, hasAlignmentsInRegion); + } catch (BamException& e) { + m_errorString = e.what(); + return false; + } + + // return success/failure of seek + return m_reader->Seek(offset); +} + +// loads existing data from file into memory +bool BamToolsIndex::Load(const std::string& filename) +{ + + try { + + // attempt to open file (read-only) + OpenFile(filename, IBamIODevice::ReadOnly); + + // load metadata & generate in-memory summary + LoadHeader(); + LoadFileSummary(); + + // return success + return true; + + } catch (BamException& e) { + m_errorString = e.what(); + return false; + } +} + +void BamToolsIndex::LoadFileSummary() +{ + + // load number of reference sequences + int numReferences; + LoadNumReferences(numReferences); + + // initialize file summary data + InitializeFileSummary(numReferences); + + // load summary for each reference + BtiFileSummary::iterator 
summaryIter = m_indexFileSummary.begin(); + BtiFileSummary::iterator summaryEnd = m_indexFileSummary.end(); + for (; summaryIter != summaryEnd; ++summaryIter) + LoadReferenceSummary(*summaryIter); +} + +void BamToolsIndex::LoadHeader() +{ + + // check BTI file metadata + CheckMagicNumber(); + CheckVersion(); + + // use file's BTI block size to set member variable + const int64_t numBytesRead = m_resources.Device->Read((char*)&m_blockSize, sizeof(m_blockSize)); + if (m_isBigEndian) SwapEndian_32(m_blockSize); + if (numBytesRead != sizeof(m_blockSize)) + throw BamException("BamToolsIndex::LoadHeader", "could not read BTI block size"); +} + +void BamToolsIndex::LoadNumBlocks(int& numBlocks) +{ + const int64_t numBytesRead = m_resources.Device->Read((char*)&numBlocks, sizeof(numBlocks)); + if (m_isBigEndian) SwapEndian_32(numBlocks); + if (numBytesRead != sizeof(numBlocks)) + throw BamException("BamToolsIndex::LoadNumBlocks", "could not read number of BTI blocks"); +} + +void BamToolsIndex::LoadNumReferences(int& numReferences) +{ + const int64_t numBytesRead = + m_resources.Device->Read((char*)&numReferences, sizeof(numReferences)); + if (m_isBigEndian) SwapEndian_32(numReferences); + if (numBytesRead != sizeof(numReferences)) + throw BamException("BamToolsIndex::LoadNumReferences", + "could not read number of references"); +} + +void BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) +{ + + // load number of blocks + int numBlocks; + LoadNumBlocks(numBlocks); + + // store block summary data for this reference + refSummary.NumBlocks = numBlocks; + refSummary.FirstBlockFilePosition = Tell(); + + // skip reference's blocks + SkipBlocks(numBlocks); +} + +void BamToolsIndex::OpenFile(const std::string& filename, IBamIODevice::OpenMode mode) +{ + + // make sure any previous index file is closed + CloseFile(); + + m_resources.Device = BamDeviceFactory::CreateDevice(filename); + if (m_resources.Device == 0) { + const std::string message = std::string("could 
not open file: ") + filename; + throw BamException("BamStandardIndex::OpenFile", message); + } + + // attempt to open file + m_resources.Device->Open(mode); + if (!IsDeviceOpen()) { + const std::string message = std::string("could not open file: ") + filename; + throw BamException("BamToolsIndex::OpenFile", message); + } +} + +void BamToolsIndex::ReadBlock(BtiBlock& block) +{ + + // read in block data members + int64_t numBytesRead = 0; + numBytesRead += + m_resources.Device->Read((char*)&block.MaxEndPosition, sizeof(block.MaxEndPosition)); + numBytesRead += m_resources.Device->Read((char*)&block.StartOffset, sizeof(block.StartOffset)); + numBytesRead += + m_resources.Device->Read((char*)&block.StartPosition, sizeof(block.StartPosition)); + + // swap endian-ness if necessary + if (m_isBigEndian) { + SwapEndian_32(block.MaxEndPosition); + SwapEndian_64(block.StartOffset); + SwapEndian_32(block.StartPosition); + } + + // check block read ok + const int expectedBytes = + sizeof(block.MaxEndPosition) + sizeof(block.StartOffset) + sizeof(block.StartPosition); + if (numBytesRead != expectedBytes) + throw BamException("BamToolsIndex::ReadBlock", "could not read block"); +} + +void BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) +{ + + // prep blocks container + blocks.clear(); + blocks.reserve(refSummary.NumBlocks); + + // skip to first block entry + Seek(refSummary.FirstBlockFilePosition, SEEK_SET); + + // read & store block entries + BtiBlock block; + for (int i = 0; i < refSummary.NumBlocks; ++i) { + ReadBlock(block); + blocks.push_back(block); + } +} + +void BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) +{ + + // return false if refId not valid index in file summary structure + if (refEntry.ID < 0 || refEntry.ID >= (int)m_indexFileSummary.size()) + throw BamException("BamToolsIndex::ReadReferenceEntry", "invalid reference requested"); + + // use index summary to assist reading the reference's BTI blocks + const 
BtiReferenceSummary& refSummary = m_indexFileSummary.at(refEntry.ID); + ReadBlocks(refSummary, refEntry.Blocks); +} + +void BamToolsIndex::Seek(const int64_t& position, const int origin) +{ + if (!m_resources.Device->Seek(position, origin)) + throw BamException("BamToolsIndex::Seek", "could not seek in BAI file"); +} + +void BamToolsIndex::SkipBlocks(const int& numBlocks) +{ + Seek(numBlocks * BamToolsIndex::SIZEOF_BLOCK, SEEK_CUR); +} + +int64_t BamToolsIndex::Tell() const +{ + return m_resources.Device->Tell(); +} + +void BamToolsIndex::WriteBlock(const BtiBlock& block) +{ + + // copy entry data + int32_t maxEndPosition = block.MaxEndPosition; + int64_t startOffset = block.StartOffset; + int32_t startPosition = block.StartPosition; + + // swap endian-ness if necessary + if (m_isBigEndian) { + SwapEndian_32(maxEndPosition); + SwapEndian_64(startOffset); + SwapEndian_32(startPosition); + } + + // write the reference index entry + int64_t numBytesWritten = 0; + numBytesWritten += + m_resources.Device->Write((const char*)&maxEndPosition, sizeof(maxEndPosition)); + numBytesWritten += m_resources.Device->Write((const char*)&startOffset, sizeof(startOffset)); + numBytesWritten += + m_resources.Device->Write((const char*)&startPosition, sizeof(startPosition)); + + // check block written ok + const int expectedBytes = sizeof(maxEndPosition) + sizeof(startOffset) + sizeof(startPosition); + if (numBytesWritten != expectedBytes) + throw BamException("BamToolsIndex::WriteBlock", "could not write BTI block"); +} + +void BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) +{ + BtiBlockVector::const_iterator blockIter = blocks.begin(); + BtiBlockVector::const_iterator blockEnd = blocks.end(); + for (; blockIter != blockEnd; ++blockIter) + WriteBlock(*blockIter); +} + +void BamToolsIndex::WriteHeader() +{ + + int64_t numBytesWritten = 0; + + // write BTI index format 'magic number' + numBytesWritten += m_resources.Device->Write(BamToolsIndex::BTI_MAGIC, 4); + + // write BTI 
index format version + int32_t currentVersion = (int32_t)m_outputVersion; + if (m_isBigEndian) SwapEndian_32(currentVersion); + numBytesWritten += + m_resources.Device->Write((const char*)&currentVersion, sizeof(currentVersion)); + + // write block size + uint32_t blockSize = m_blockSize; + if (m_isBigEndian) SwapEndian_32(blockSize); + numBytesWritten += m_resources.Device->Write((const char*)&blockSize, sizeof(blockSize)); + + // write number of references + int32_t numReferences = m_indexFileSummary.size(); + if (m_isBigEndian) SwapEndian_32(numReferences); + numBytesWritten += + m_resources.Device->Write((const char*)&numReferences, sizeof(numReferences)); + + // check header written ok + const int expectedBytes = + 4 + sizeof(currentVersion) + sizeof(blockSize) + sizeof(numReferences); + if (numBytesWritten != expectedBytes) + throw BamException("BamToolsIndex::WriteHeader", "could not write BTI header"); +} + +void BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) +{ + + // write number of blocks this reference + uint32_t numBlocks = refEntry.Blocks.size(); + if (m_isBigEndian) SwapEndian_32(numBlocks); + const int64_t numBytesWritten = + m_resources.Device->Write((const char*)&numBlocks, sizeof(numBlocks)); + if (numBytesWritten != sizeof(numBlocks)) + throw BamException("BamToolsIndex::WriteReferenceEntry", + "could not write number of blocks"); + + // write actual block entries + WriteBlocks(refEntry.Blocks); +} diff --git a/src/api/internal/index/BamToolsIndex_p.h b/src/api/internal/index/BamToolsIndex_p.h new file mode 100644 index 0000000..909b164 --- /dev/null +++ b/src/api/internal/index/BamToolsIndex_p.h @@ -0,0 +1,195 @@ +// *************************************************************************** +// BamToolsIndex.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// 
--------------------------------------------------------------------------- +// Provides index operations for the BamTools index format (".bti") +// *************************************************************************** + +#ifndef BAMTOOLS_INDEX_FORMAT_H +#define BAMTOOLS_INDEX_FORMAT_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. + +#include <map> +#include <string> +#include <vector> +#include "api/BamAux.h" +#include "api/BamIndex.h" +#include "api/IBamIODevice.h" + +namespace BamTools { +namespace Internal { + +// contains data for each 'block' in a BTI index +struct BtiBlock +{ + + // data members + int32_t MaxEndPosition; + int64_t StartOffset; + int32_t StartPosition; + + // ctor + BtiBlock(const int32_t& maxEndPosition = 0, const int64_t& startOffset = 0, + const int32_t& startPosition = 0) + : MaxEndPosition(maxEndPosition) + , StartOffset(startOffset) + , StartPosition(startPosition) + {} +}; + +// convenience typedef for describing a a list of BTI blocks on a reference +typedef std::vector<BtiBlock> BtiBlockVector; + +// contains all fields necessary for building, loading, & writing +// full BTI index data for a single reference +struct BtiReferenceEntry +{ + + // data members + int32_t ID; + BtiBlockVector Blocks; + + // ctor + BtiReferenceEntry(const int& id = -1) + : ID(id) + {} +}; + +// provides (persistent) summary of BtiReferenceEntry's index data +struct BtiReferenceSummary +{ + + // data members + int NumBlocks; + uint64_t FirstBlockFilePosition; + + // ctor + BtiReferenceSummary() + : NumBlocks(0) + , FirstBlockFilePosition(0) + {} +}; + +// convenience typedef for describing a full BTI index file summary +typedef std::vector<BtiReferenceSummary> BtiFileSummary; + +class BamToolsIndex : public BamIndex +{ + + // keep a 
list of any supported versions here + // (might be useful later to handle any 'legacy' versions if the format changes) + // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on + // + // so a change introduced in BTI_1_2 may be handled from then on by: + // + // if ( indexVersion >= BTI_1_2 ) + // do something new + // else + // do the old thing + enum Version + { + BTI_1_0 = 1, + BTI_1_1, + BTI_1_2, + BTI_2_0 + }; + + // ctor & dtor +public: + BamToolsIndex(Internal::BamReaderPrivate* reader); + ~BamToolsIndex(); + + // BamIndex implementation +public: + // builds index from associated BAM file & writes out to index file + bool Create(); + // returns whether reference has alignments or no + bool HasAlignments(const int& referenceID) const; + // attempts to use index data to jump to @region, returns success/fail + // a "successful" jump indicates no error, but not whether this region has data + // * thus, the method sets a flag to indicate whether there are alignments + // available after the jump position + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); + // loads existing data from file into memory + bool Load(const std::string& filename); + BamIndex::IndexType Type() const + { + return BamIndex::BAMTOOLS; + } + +public: + // returns format's file extension + static const std::string Extension(); + + // internal methods +private: + // index file ops + void CheckMagicNumber(); + void CheckVersion(); + void CloseFile(); + bool IsDeviceOpen() const; + void OpenFile(const std::string& filename, IBamIODevice::OpenMode mode); + void Seek(const int64_t& position, const int origin); + int64_t Tell() const; + + // index-creation methods + void ClearReferenceEntry(BtiReferenceEntry& refEntry); + void WriteBlock(const BtiBlock& block); + void WriteBlocks(const BtiBlockVector& blocks); + void WriteHeader(); + void WriteReferenceEntry(const BtiReferenceEntry& refEntry); + + // random-access methods + void 
GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); + void ReadBlock(BtiBlock& block); + void ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks); + void ReadReferenceEntry(BtiReferenceEntry& refEntry); + + // BTI summary data methods + void InitializeFileSummary(const int& numReferences); + void LoadFileSummary(); + void LoadHeader(); + void LoadNumBlocks(int& numBlocks); + void LoadNumReferences(int& numReferences); + void LoadReferenceSummary(BtiReferenceSummary& refSummary); + void SkipBlocks(const int& numBlocks); + + // data members +private: + bool m_isBigEndian; + BtiFileSummary m_indexFileSummary; + uint32_t m_blockSize; + int32_t m_inputVersion; // Version is serialized as int + Version m_outputVersion; + + struct RaiiWrapper + { + IBamIODevice* Device; + RaiiWrapper(); + ~RaiiWrapper(); + }; + RaiiWrapper m_resources; + + // static constants +private: + static const uint32_t DEFAULT_BLOCK_LENGTH; + static const std::string BTI_EXTENSION; + static const char* const BTI_MAGIC; + static const int SIZEOF_BLOCK; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMTOOLS_INDEX_FORMAT_H diff --git a/src/api/internal/index/CMakeLists.txt b/src/api/internal/index/CMakeLists.txt new file mode 100644 index 0000000..d6a7df6 --- /dev/null +++ b/src/api/internal/index/CMakeLists.txt @@ -0,0 +1,17 @@ +# ========================== +# BamTools CMakeLists.txt +# (c) 2011 Derek Barnett +# +# src/api/internal/index +# ========================== + +set( InternalIndexDir "${InternalDir}/index" ) + +set( InternalIndexSources + ${InternalIndexDir}/BamIndexFactory_p.cpp + ${InternalIndexDir}/BamStandardIndex_p.cpp + ${InternalIndexDir}/BamToolsIndex_p.cpp + + PARENT_SCOPE # <-- leave this last +) + diff --git a/src/api/internal/io/BamDeviceFactory_p.cpp b/src/api/internal/io/BamDeviceFactory_p.cpp new file mode 100644 index 0000000..2844ab1 --- /dev/null +++ b/src/api/internal/io/BamDeviceFactory_p.cpp @@ 
-0,0 +1,34 @@ +// *************************************************************************** +// BamDeviceFactory_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 25 September 2011 (DB) +// --------------------------------------------------------------------------- +// Creates built-in concrete implementations of IBamIODevices +// *************************************************************************** + +#include "api/internal/io/BamDeviceFactory_p.h" +#include "api/internal/io/BamFile_p.h" +#include "api/internal/io/BamFtp_p.h" +#include "api/internal/io/BamHttp_p.h" +#include "api/internal/io/BamPipe_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <iostream> + +IBamIODevice* BamDeviceFactory::CreateDevice(const std::string& source) +{ + + // check for requested pipe + if (source == "-" || source == "stdin" || source == "stdout") return new BamPipe; + + // check for HTTP prefix + if (source.find("http://") == 0) return new BamHttp(source); + + // check for FTP prefix + if (source.find("ftp://") == 0) return new BamFtp(source); + + // otherwise assume a "normal" file + return new BamFile(source); +} diff --git a/src/api/internal/io/BamDeviceFactory_p.h b/src/api/internal/io/BamDeviceFactory_p.h new file mode 100644 index 0000000..ddd93b8 --- /dev/null +++ b/src/api/internal/io/BamDeviceFactory_p.h @@ -0,0 +1,38 @@ +// *************************************************************************** +// BamDeviceFactory_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Creates built-in concrete implementations of IBamIODevices +// 
*************************************************************************** + +#ifndef BAMDEVICEFACTORY_P_H +#define BAMDEVICEFACTORY_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include <string> +#include "api/IBamIODevice.h" + +namespace BamTools { +namespace Internal { + +class BamDeviceFactory +{ +public: + static IBamIODevice* CreateDevice(const std::string& source); +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMDEVICEFACTORY_P_H diff --git a/src/api/internal/io/BamFile_p.cpp b/src/api/internal/io/BamFile_p.cpp new file mode 100644 index 0000000..4130bab --- /dev/null +++ b/src/api/internal/io/BamFile_p.cpp @@ -0,0 +1,73 @@ +// *************************************************************************** +// BamFile_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides BAM file-specific IO behavior +// *************************************************************************** + +#include "api/internal/io/BamFile_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cstdio> +#include <iostream> + +BamFile::BamFile(const std::string& filename) + : ILocalIODevice() + , m_filename(filename) +{} + +BamFile::~BamFile() {} + +void BamFile::Close() +{ + if (IsOpen()) { + m_filename.clear(); + ILocalIODevice::Close(); + } +} + +bool BamFile::IsRandomAccess() const +{ + return true; +} + +bool BamFile::Open(const IBamIODevice::OpenMode mode) +{ + + // make sure we're starting with a fresh file stream + Close(); + + // attempt to open FILE* depending on 
requested openmode + if (mode == IBamIODevice::ReadOnly) + m_stream = fopen(m_filename.c_str(), "rb"); + else if (mode == IBamIODevice::WriteOnly) + m_stream = fopen(m_filename.c_str(), "wb"); + else if (mode == IBamIODevice::ReadWrite) + m_stream = fopen(m_filename.c_str(), "w+b"); + else { + SetErrorString("BamFile::Open", "unknown open mode requested"); + return false; + } + + // check that we obtained a valid FILE* + if (m_stream == 0) { + const std::string message_base = std::string("could not open file handle for "); + const std::string message = + message_base + ((m_filename.empty()) ? "empty filename" : m_filename); + SetErrorString("BamFile::Open", message); + return false; + } + + // store current IO mode & return success + m_mode = mode; + return true; +} + +bool BamFile::Seek(const int64_t& position, const int origin) +{ + BT_ASSERT_X(m_stream, "BamFile::Seek() - null stream"); + return (fseek64(m_stream, position, origin) == 0); +} diff --git a/src/api/internal/io/BamFile_p.h b/src/api/internal/io/BamFile_p.h new file mode 100644 index 0000000..47119b3 --- /dev/null +++ b/src/api/internal/io/BamFile_p.h @@ -0,0 +1,52 @@ +// *************************************************************************** +// BamFile_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides BAM file-specific IO behavior +// *************************************************************************** + +#ifndef BAMFILE_P_H +#define BAMFILE_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include <string> +#include "api/internal/io/ILocalIODevice_p.h" + +namespace BamTools { +namespace Internal { + +class BamFile : public ILocalIODevice +{ + + // ctor & dtor +public: + BamFile(const std::string& filename); + ~BamFile(); + + // ILocalIODevice implementation +public: + void Close(); + bool IsRandomAccess() const; + bool Open(const IBamIODevice::OpenMode mode); + bool Seek(const int64_t& position, const int origin = SEEK_SET); + + // data members +private: + std::string m_filename; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMFILE_P_H diff --git a/src/api/internal/io/BamFtp_p.cpp b/src/api/internal/io/BamFtp_p.cpp new file mode 100644 index 0000000..43dade7 --- /dev/null +++ b/src/api/internal/io/BamFtp_p.cpp @@ -0,0 +1,491 @@ +// *************************************************************************** +// BamFtp_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 8 December 2011 (DB) +// --------------------------------------------------------------------------- +// Provides reading/writing of BAM files on FTP server +// *************************************************************************** + +#include "api/internal/io/BamFtp_p.h" +#include "api/BamAux.h" +#include "api/internal/io/TcpSocket_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cctype> +#include <cstddef> +#include <cstdlib> +#include <sstream> +#include <vector> + +namespace BamTools { +namespace Internal { + +// ----------- +// constants +// ----------- + +static const uint16_t FTP_PORT = 21; +static const std::string FTP_PREFIX = "ftp://"; +static const std::size_t FTP_PREFIX_LENGTH = 6; +static const std::string FTP_NEWLINE = "\r\n"; + +static const std::string DEFAULT_USER = "anonymous"; +static const std::string DEFAULT_PASS = "anonymous@"; + +static const std::string ABOR_CMD = "ABOR"; 
+static const std::string USER_CMD = "USER"; +static const std::string PASS_CMD = "PASS"; +static const std::string PASV_CMD = "PASV"; +static const std::string REIN_CMD = "REIN"; +static const std::string REST_CMD = "REST"; +static const std::string RETR_CMD = "RETR"; +static const std::string TYPE_CMD = "TYPE"; + +static const char CMD_SEPARATOR = ' '; +static const char HOST_SEPARATOR = '/'; +static const char IP_SEPARATOR = '.'; + +static const char MULTILINE_CONTINUE = '-'; + +static const char PASV_REPLY_PREFIX = '('; +static const char PASV_REPLY_SEPARATOR = ','; +static const char PASV_REPLY_SUFFIX = ')'; + +// ----------------- +// utility methods +// ----------------- + +static inline std::vector<std::string> split(const std::string& source, const char delim) +{ + + std::stringstream ss(source); + std::string field; + std::vector<std::string> fields; + + while (std::getline(ss, field, delim)) + fields.push_back(field); + return fields; +} + +static inline bool startsWith(const std::string& source, const std::string& pattern) +{ + return (source.find(pattern) == 0); +} + +static inline std::string toLower(const std::string& s) +{ + std::string out; + const std::size_t sSize = s.size(); + out.resize(sSize); + for (std::size_t i = 0; i < sSize; ++i) + out[i] = tolower(s[i]); + return out; +} + +} // namespace Internal +} // namespace BamTools + +// ----------------------- +// BamFtp implementation +// ----------------------- + +BamFtp::BamFtp(const std::string& url) + : IBamIODevice() + , m_commandSocket(new TcpSocket) + , m_dataSocket(new TcpSocket) + , m_port(FTP_PORT) + , m_dataPort(0) + , m_username(DEFAULT_USER) + , m_password(DEFAULT_PASS) + , m_isUrlParsed(false) + , m_filePosition(-1) +{ + ParseUrl(url); +} + +BamFtp::~BamFtp() +{ + + // close connection & clean up + Close(); + if (m_commandSocket) delete m_commandSocket; + if (m_dataSocket) delete m_dataSocket; +} + +void BamFtp::Close() +{ + + // disconnect socket + 
m_commandSocket->DisconnectFromHost(); + m_dataSocket->DisconnectFromHost(); + + // reset state - necessary?? + m_isUrlParsed = false; + m_filePosition = -1; + m_username = DEFAULT_USER; + m_password = DEFAULT_PASS; + m_dataHostname.clear(); + m_dataPort = 0; +} + +bool BamFtp::ConnectCommandSocket() +{ + + BT_ASSERT_X(m_commandSocket, "null command socket?"); + + // connect to FTP server + if (!m_commandSocket->ConnectToHost(m_hostname, m_port, m_mode)) { + SetErrorString("BamFtp::ConnectCommandSocket", "could not connect to host - "); + return false; + } + + // receive initial reply from host + if (!ReceiveReply()) { + Close(); + return false; + } + + // send USER command + std::string userCommand = USER_CMD + CMD_SEPARATOR + m_username + FTP_NEWLINE; + if (!SendCommand(userCommand, true)) { + Close(); + return false; + } + + // send PASS command + std::string passwordCommand = PASS_CMD + CMD_SEPARATOR + m_password + FTP_NEWLINE; + if (!SendCommand(passwordCommand, true)) { + Close(); + return false; + } + + // send TYPE command + std::string typeCommand = TYPE_CMD + CMD_SEPARATOR + 'I' + FTP_NEWLINE; + if (!SendCommand(typeCommand, true)) { + Close(); + return false; + } + + // return success + return true; +} + +bool BamFtp::ConnectDataSocket() +{ + + // failure if can't connect to command socket first + if (!m_commandSocket->IsConnected()) { + if (!ConnectCommandSocket()) return false; + } + + // make sure we're starting with a fresh data channel + if (m_dataSocket->IsConnected()) m_dataSocket->DisconnectFromHost(); + + // send passive connection command + const std::string passiveCommand = PASV_CMD + FTP_NEWLINE; + if (!SendCommand(passiveCommand, true)) { + // TODO: set error string + return false; + } + + // retrieve passive connection port + if (!ParsePassiveResponse()) { + // TODO: set error string + return false; + } + + // set up restart command (tell server where to start fetching bytes from) + if (m_filePosition >= 0) { + + std::stringstream fpStream; 
+ fpStream << m_filePosition; + std::string restartCommand = REST_CMD + CMD_SEPARATOR + fpStream.str() + FTP_NEWLINE; + if (!SendCommand(restartCommand, true)) { + // TODO: set error string + return false; + } + } + + // main file retrieval request + std::string retrieveCommand = RETR_CMD + CMD_SEPARATOR + m_filename + FTP_NEWLINE; + if (!SendCommand(retrieveCommand, false)) { + // TODO: set error string + return false; + } + + // make data channel connection + if (!m_dataSocket->ConnectToHost(m_dataHostname, m_dataPort)) { + // TODO: set error string + return false; + } + + // fetch intial reply from server + if (!ReceiveReply()) { + // TODO: set error string + m_dataSocket->DisconnectFromHost(); + return false; + } + + // make sure we have reply code 150 (all good) + if (!startsWith(m_response, "150")) { + // TODO: set error string + m_dataSocket->DisconnectFromHost(); + return false; + } + + // return success + return true; +} + +bool BamFtp::IsOpen() const +{ + return IBamIODevice::IsOpen() && m_isUrlParsed; +} + +bool BamFtp::IsRandomAccess() const +{ + return true; +} + +bool BamFtp::Open(const IBamIODevice::OpenMode mode) +{ + + // BamFtp only supports read-only access + if (mode != IBamIODevice::ReadOnly) { + SetErrorString("BamFtp::Open", "writing on this device is not supported"); + return false; + } + + // initialize basic valid state + m_mode = mode; + m_filePosition = 0; + + // attempt connection to command & data sockets + return (ConnectCommandSocket() && ConnectDataSocket()); +} + +bool BamFtp::ParsePassiveResponse() +{ + + // fail if empty + if (m_response.empty()) return false; + + // find parentheses + const std::size_t leftParenFound = m_response.find(PASV_REPLY_PREFIX); + const std::size_t rightParenFound = m_response.find(PASV_REPLY_SUFFIX); + if (leftParenFound == std::string::npos || rightParenFound == std::string::npos) return false; + + // grab everything between ( should be "h1,h2,h3,h4,p1,p2" ) + std::string::const_iterator responseBegin 
= m_response.begin(); + const std::string hostAndPort(responseBegin + leftParenFound + 1, + responseBegin + rightParenFound); + + // parse into string fields + std::vector<std::string> fields = split(hostAndPort, PASV_REPLY_SEPARATOR); + if (fields.size() != 6) return false; + + // fetch passive connection IP + m_dataHostname = + fields[0] + IP_SEPARATOR + fields[1] + IP_SEPARATOR + fields[2] + IP_SEPARATOR + fields[3]; + + // fetch passive connection port + const uint8_t portUpper = static_cast<uint8_t>(std::atoi(fields[4].c_str())); + const uint8_t portLower = static_cast<uint8_t>(std::atoi(fields[5].c_str())); + m_dataPort = (portUpper << 8) + portLower; + + // return success + return true; +} + +void BamFtp::ParseUrl(const std::string& url) +{ + + // clear flag to start + m_isUrlParsed = false; + + // make sure url starts with "ftp://", case-insensitive + std::string tempUrl(url); + toLower(tempUrl); + const std::size_t prefixFound = tempUrl.find(FTP_PREFIX); + if (prefixFound == std::string::npos) return; + + // find end of host name portion (first '/' hit after the prefix) + const std::size_t firstSlashFound = tempUrl.find(HOST_SEPARATOR, FTP_PREFIX_LENGTH); + if (firstSlashFound == std::string::npos) { + ; // no slash found... no filename given along with host? 
+ } + + // fetch hostname + std::string hostname = tempUrl.substr(FTP_PREFIX_LENGTH, (firstSlashFound - FTP_PREFIX_LENGTH)); + m_hostname = hostname; + m_port = FTP_PORT; + + // store remainder of URL as filename (must be non-empty) + std::string filename = tempUrl.substr(firstSlashFound); + if (filename.empty()) return; + m_filename = filename; + + // set parsed OK flag + m_isUrlParsed = true; +} + +int64_t BamFtp::Read(char* data, const unsigned int numBytes) +{ + + // if BamHttp not in a valid state + if (!IsOpen()) return -1; + + // read until hit desired @numBytes + int64_t bytesReadSoFar = 0; + while (bytesReadSoFar < numBytes) { + + // calculate number of bytes we're going to try to read this iteration + const std::size_t remainingBytes = (numBytes - bytesReadSoFar); + + // if either disconnected somehow, or (more likely) we have seeked since last read + if (!m_dataSocket->IsConnected()) { + if (!ConnectDataSocket()) { + // TODO: set error string + return -1; + } + } + + // read bytes from data socket + const int64_t socketBytesRead = ReadDataSocket(data + bytesReadSoFar, remainingBytes); + if (socketBytesRead < 0) // error + return -1; + else if (socketBytesRead == 0) // EOF + return bytesReadSoFar; + bytesReadSoFar += socketBytesRead; + m_filePosition += socketBytesRead; + } + + // return actual number bytes successfully read + return bytesReadSoFar; +} + +int64_t BamFtp::ReadCommandSocket(char* data, const unsigned int maxNumBytes) +{ + return m_commandSocket->Read(data, maxNumBytes); +} + +int64_t BamFtp::ReadDataSocket(char* data, const unsigned int maxNumBytes) +{ + return m_dataSocket->Read(data, maxNumBytes); +} + +bool BamFtp::ReceiveReply() +{ + + // failure if not connected + if (!m_commandSocket->IsConnected()) { + SetErrorString("BamFtp::ReceiveReply()", "command socket not connected"); + return false; + } + + m_response.clear(); + + // read header data (& discard for now) + bool headerEnd = false; + while (!headerEnd) { + + const std::string 
headerLine = m_commandSocket->ReadLine(); + m_response += headerLine; + + // if line is of form 'xyz ', quit reading lines + if ((headerLine.length() >= 4) && isdigit(headerLine[0]) && isdigit(headerLine[1]) && + isdigit(headerLine[2]) && (headerLine[3] != MULTILINE_CONTINUE)) { + headerEnd = true; + } + } + + // return success, depending on response + if (m_response.empty()) { + SetErrorString("BamFtp::ReceiveReply", "error reading server reply"); + return false; + } + return true; +} + +bool BamFtp::Seek(const int64_t& position, const int origin) +{ + + // if FTP device not in a valid state + if (!IsOpen()) { + // TODO: set error string + return false; + } + + // ---------------------- + // UGLY !! but works?? + // ---------------------- + // disconnect from server + m_dataSocket->DisconnectFromHost(); + m_commandSocket->DisconnectFromHost(); + + // update file position & return success + if (origin == SEEK_CUR) + m_filePosition += position; + else if (origin == SEEK_SET) + m_filePosition = position; + else { + // TODO: set error string + return false; + } + return true; +} + +bool BamFtp::SendCommand(const std::string& command, bool waitForReply) +{ + + // failure if not connected + if (!m_commandSocket->IsConnected()) { + SetErrorString("BamFtp::SendCommand", "command socket not connected"); + return false; + } + + // write command to 'command socket' + if (WriteCommandSocket(command.c_str(), command.length()) == -1) { + SetErrorString("BamFtp::SendCommand", "error writing to socket"); + // get actual error from command socket?? + return false; + } + + // if we sent a command that receives a response + if (waitForReply) return ReceiveReply(); + + // return success + return true; +} + +int64_t BamFtp::Tell() const +{ + return (IsOpen() ? 
m_filePosition : -1); +} + +int64_t BamFtp::Write(const char* data, const unsigned int numBytes) +{ + (void)data; + (void)numBytes; + BT_ASSERT_X(false, "BamFtp::Write : write-mode not supported on this device"); + SetErrorString("BamFtp::Write", "write-mode not supported on this device"); + return -1; +} + +int64_t BamFtp::WriteCommandSocket(const char* data, const unsigned int numBytes) +{ + if (!m_commandSocket->IsConnected()) return -1; + m_commandSocket->ClearBuffer(); + return m_commandSocket->Write(data, numBytes); +} + +int64_t BamFtp::WriteDataSocket(const char* data, const unsigned int numBytes) +{ + (void)data; + (void)numBytes; + BT_ASSERT_X(false, "BamFtp::WriteDataSocket: write-mode not supported on this device"); + SetErrorString("BamFtp::Write", "write-mode not supported on this device"); + return -1; +} diff --git a/src/api/internal/io/BamFtp_p.h b/src/api/internal/io/BamFtp_p.h new file mode 100644 index 0000000..563299a --- /dev/null +++ b/src/api/internal/io/BamFtp_p.h @@ -0,0 +1,91 @@ +// *************************************************************************** +// BamFtp_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides reading/writing of BAM files on FTP server +// *************************************************************************** + +#ifndef BAMFTP_P_H +#define BAMFTP_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include <string> +#include "api/IBamIODevice.h" + +namespace BamTools { +namespace Internal { + +class TcpSocket; + +class BamFtp : public IBamIODevice +{ + + // ctor & dtor +public: + BamFtp(const std::string& url); + ~BamFtp(); + + // IBamIODevice implementation +public: + void Close(); + bool IsOpen() const; + bool IsRandomAccess() const; + bool Open(const IBamIODevice::OpenMode mode); + int64_t Read(char* data, const unsigned int numBytes); + bool Seek(const int64_t& position, const int origin = SEEK_SET); + int64_t Tell() const; + int64_t Write(const char* data, const unsigned int numBytes); + + // internal methods +private: + bool ConnectCommandSocket(); + bool ConnectDataSocket(); + bool ParsePassiveResponse(); + void ParseUrl(const std::string& url); + int64_t ReadCommandSocket(char* data, const unsigned int numBytes); + int64_t ReadDataSocket(char* data, const unsigned int numBytes); + bool ReceiveReply(); + bool SendCommand(const std::string& command, bool waitForReply); + int64_t WriteCommandSocket(const char* data, const unsigned int numBytes); + int64_t WriteDataSocket(const char* data, const unsigned int numBytes); + + // data members +private: + // our main sockets + TcpSocket* m_commandSocket; + TcpSocket* m_dataSocket; + + // our connection data + std::string m_hostname; + uint16_t m_port; + std::string m_dataHostname; + uint16_t m_dataPort; + std::string m_filename; + + std::string m_username; + std::string m_password; + + std::string m_response; + + // internal state flags + bool m_isUrlParsed; + + // file position + int64_t m_filePosition; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMFTP_P_H diff --git a/src/api/internal/io/BamHttp_p.cpp b/src/api/internal/io/BamHttp_p.cpp new file mode 100644 index 0000000..81017be --- /dev/null +++ b/src/api/internal/io/BamHttp_p.cpp @@ -0,0 +1,554 @@ +// *************************************************************************** +// BamHttp_p.cpp (c) 2011 Derek Barnett +// Marth 
Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 24 July 2013 (DB) +// --------------------------------------------------------------------------- +// Provides reading/writing of BAM files on HTTP server +// *************************************************************************** + +#include "api/internal/io/BamHttp_p.h" +#include "api/BamAux.h" +#include "api/internal/io/HttpHeader_p.h" +#include "api/internal/io/TcpSocket_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <algorithm> +#include <cassert> +#include <cctype> +#include <cstddef> +#include <cstdlib> +#include <sstream> + +namespace BamTools { +namespace Internal { + +// ----------- +// constants +// ----------- + +static const std::string HTTP_PORT = "80"; +static const std::string HTTP_PREFIX = "http://"; +static const std::size_t HTTP_PREFIX_LENGTH = 7; + +static const std::string DOUBLE_NEWLINE = "\n\n"; + +static const std::string GET_METHOD = "GET"; +static const std::string HEAD_METHOD = "HEAD"; +static const std::string HOST_HEADER = "Host"; +static const std::string RANGE_HEADER = "Range"; +static const std::string BYTES_PREFIX = "bytes="; +static const std::string CONTENT_LENGTH_HEADER = "Content-Length"; + +static const char HOST_SEPARATOR = '/'; +static const char PROXY_SEPARATOR = ':'; + +// ----------------- +// utility methods +// ----------------- + +static inline bool endsWith(const std::string& source, const std::string& pattern) +{ + return (source.find(pattern) == (source.length() - pattern.length())); +} + +static inline std::string toLower(const std::string& s) +{ + std::string out; + const std::size_t sSize = s.size(); + out.reserve(sSize); + for (std::size_t i = 0; i < sSize; ++i) + out[i] = tolower(s[i]); + return out; +} + +} // namespace Internal +} // namespace BamTools + +// ------------------------ +// BamHttp implementation +// 
------------------------ + +BamHttp::BamHttp(const std::string& url) + : IBamIODevice() + , m_socket(new TcpSocket) + , m_port(HTTP_PORT) + , m_request(0) + , m_response(0) + , m_isUrlParsed(false) + , m_filePosition(-1) + , m_fileEndPosition(-1) + , m_rangeEndPosition(-1) +{ + ParseUrl(url); +} + +BamHttp::~BamHttp() +{ + + // close connection & clean up + Close(); + if (m_socket) delete m_socket; +} + +void BamHttp::ClearResponse() +{ + if (m_response) { + delete m_response; + m_response = 0; + } +} + +void BamHttp::Close() +{ + + // disconnect socket & clear related resources + DisconnectSocket(); + + // reset state + m_isUrlParsed = false; + m_filePosition = -1; + m_fileEndPosition = -1; + m_rangeEndPosition = -1; + m_mode = IBamIODevice::NotOpen; +} + +bool BamHttp::ConnectSocket() +{ + + BT_ASSERT_X(m_socket, "null socket?"); + + // any state checks, etc? + if (!m_socket->ConnectToHost(m_hostname, m_port, m_mode)) { + SetErrorString("BamHttp::ConnectSocket", m_socket->GetErrorString()); + return false; + } + + // return success + return true; +} + +void BamHttp::DisconnectSocket() +{ + + // disconnect socket & clean up + m_socket->DisconnectFromHost(); + ClearResponse(); + if (m_request) { + delete m_request; + m_request = 0; + } +} + +bool BamHttp::EnsureSocketConnection() +{ + if (m_socket->IsConnected()) return true; + return ConnectSocket(); +} + +bool BamHttp::IsOpen() const +{ + return IBamIODevice::IsOpen() && m_isUrlParsed; +} + +bool BamHttp::IsRandomAccess() const +{ + return true; +} + +bool BamHttp::Open(const IBamIODevice::OpenMode mode) +{ + + // BamHttp only supports read-only access + if (mode != IBamIODevice::ReadOnly) { + SetErrorString("BamHttp::Open", "writing on this device is not supported"); + return false; + } + m_mode = mode; + + // attempt connection to socket + if (!ConnectSocket()) { + SetErrorString("BamHttp::Open", m_socket->GetErrorString()); + return false; + } + + // initialize our file positions + m_filePosition = 0; + 
m_fileEndPosition = 0; + m_rangeEndPosition = 0; + + // attempt to send initial request (just 'HEAD' to check connection) + if (!SendHeadRequest()) { + SetErrorString("BamHttp::Open", m_socket->GetErrorString()); + return false; + } + + // clear response from HEAD request, not needed + ClearResponse(); + + // return success + return true; +} + +void BamHttp::ParseUrl(const std::string& url) +{ + + // clear flag to start + m_isUrlParsed = false; + + // make sure url starts with "http://", case-insensitive + std::string tempUrl(url); + toLower(tempUrl); + const std::size_t prefixFound = tempUrl.find(HTTP_PREFIX); + if (prefixFound == std::string::npos) return; + + // find end of host name portion (first '/' hit after the prefix) + const std::size_t firstSlashFound = tempUrl.find(HOST_SEPARATOR, HTTP_PREFIX_LENGTH); + if (firstSlashFound == std::string::npos) { + ; // no slash found... no filename given along with host? + } + + // fetch hostname (check for proxy port) + std::string hostname = + tempUrl.substr(HTTP_PREFIX_LENGTH, (firstSlashFound - HTTP_PREFIX_LENGTH)); + const std::size_t colonFound = hostname.find(PROXY_SEPARATOR); + if (colonFound != std::string::npos) { + ; // TODO: handle proxy port (later, just skip for now) + } else { + m_hostname = hostname; + m_port = HTTP_PORT; + } + + // store remainder of URL as filename (must be non-empty) + std::string filename = tempUrl.substr(firstSlashFound); + if (filename.empty()) return; + m_filename = filename; + + // set parsed OK flag + m_isUrlParsed = true; +} + +int64_t BamHttp::Read(char* data, const unsigned int numBytes) +{ + + // if BamHttp not in a valid state + if (!IsOpen()) return -1; + + int64_t numBytesReadSoFar = 0; + while (numBytesReadSoFar < numBytes) { + + const std::size_t remaining = static_cast<std::size_t>(numBytes - numBytesReadSoFar); + + // if we're not holding a valid GET reponse, get one + if (m_response == 0) { + if (!SendGetRequest(remaining)) return -1; + } + BT_ASSERT_X(m_response, 
"null HTTP response"); + + // check response status code + const int statusCode = m_response->GetStatusCode(); + + // if we receieved full file contents in response + if (statusCode == 200) { + + // try to read 'remaining' bytes from socket + const int64_t socketBytesRead = ReadFromSocket(data + numBytesReadSoFar, remaining); + + // if error + if (socketBytesRead < 0) { + SetErrorString("BamHttp::Read", m_socket->GetErrorString()); + return -1; + } + + // EOF + else if (socketBytesRead == 0) + return numBytesReadSoFar; + + // update counters + numBytesReadSoFar += socketBytesRead; + m_filePosition += socketBytesRead; + + } + + // else if we received a range of bytes in response + else if (statusCode == 206) { + + // if we've exhausted the last request + if (m_filePosition == m_rangeEndPosition) { + if (!SendGetRequest(remaining)) return -1; + } + + else { + + // try to read 'remaining' bytes from socket + const int64_t socketBytesRead = ReadFromSocket(data + numBytesReadSoFar, remaining); + + // if error + if (socketBytesRead < 0) { + SetErrorString("BamHttp::Read", m_socket->GetErrorString()); + return -1; + } + + // maybe EOF + else if (socketBytesRead == 0) { + + // if we know we're not at end position, fire off a new request + if (m_fileEndPosition > 0 && m_filePosition < m_fileEndPosition) { + if (!SendGetRequest()) return -1; + } else + return numBytesReadSoFar; + } + + // update counters + numBytesReadSoFar += socketBytesRead; + m_filePosition += socketBytesRead; + } + } + + // else some other HTTP status + else { + SetErrorString("BamHttp::Read", "unsupported status code in response"); + return -1; + } + } + + // return actual number of bytes read + return numBytesReadSoFar; +} + +int64_t BamHttp::ReadFromSocket(char* data, const unsigned int maxNumBytes) +{ + return m_socket->Read(data, maxNumBytes); +} + +bool BamHttp::ReceiveResponse() +{ + + // fetch header, up until double new line + std::string responseHeader; + do { + + // make sure we can read a 
line + if (!m_socket->WaitForReadLine()) return false; + + // read line & append to full header + const std::string headerLine = m_socket->ReadLine(); + responseHeader += headerLine; + + } while (!endsWith(responseHeader, DOUBLE_NEWLINE)); + + // sanity check + if (responseHeader.empty()) { + SetErrorString("BamHttp::ReceiveResponse", "empty HTTP response"); + Close(); + return false; + } + + // create response from header text + m_response = new HttpResponseHeader(responseHeader); + if (!m_response->IsValid()) { + SetErrorString("BamHttp::ReceiveResponse", "could not parse HTTP response"); + Close(); + return false; + } + + // if we get here, success + return true; +} + +bool BamHttp::Seek(const int64_t& position, const int origin) +{ + + // if HTTP device not in a valid state + if (!IsOpen()) { + SetErrorString("BamHttp::Seek", "cannot seek on unopen connection"); + return false; + } + + // reset the connection + DisconnectSocket(); + if (!ConnectSocket()) { + SetErrorString("BamHttp::Seek", m_socket->GetErrorString()); + return false; + } + + // udpate file position + switch (origin) { + case SEEK_CUR: + m_filePosition += position; + break; + case SEEK_SET: + m_filePosition = position; + break; + default: + SetErrorString("BamHttp::Seek", "unsupported seek origin"); + return false; + } + + // return success + return true; +} + +bool BamHttp::SendGetRequest(const std::size_t numBytes) +{ + + // clear previous data + ClearResponse(); + if (m_request) delete m_request; + m_socket->ClearBuffer(); + + // make sure we're connected + if (!EnsureSocketConnection()) return false; + + // create range string + const int64_t endPosition = + m_filePosition + std::max(static_cast<std::size_t>(0x10000), numBytes); + std::stringstream range; + range << BYTES_PREFIX << m_filePosition << '-' << endPosition; + + // create request + m_request = new HttpRequestHeader(GET_METHOD, m_filename); + m_request->SetField(HOST_HEADER, m_hostname); + m_request->SetField(RANGE_HEADER, 
range.str()); + + // send request + const std::string requestHeader = m_request->ToString(); + const int64_t headerSize = requestHeader.size(); + if (WriteToSocket(requestHeader.c_str(), headerSize) != headerSize) { + SetErrorString("BamHttp::SendHeadRequest", m_socket->GetErrorString()); + return false; + } + + // ensure clean buffer + m_socket->ClearBuffer(); + + // wait for response + if (!ReceiveResponse()) { + SetErrorString("BamHttp::SendGetRequest", m_socket->GetErrorString()); + Close(); + return false; + } + BT_ASSERT_X(m_response, "BamHttp::SendGetRequest : null HttpResponse"); + BT_ASSERT_X(m_response->IsValid(), "BamHttp::SendGetRequest : invalid HttpResponse"); + + // check response status code + const int statusCode = m_response->GetStatusCode(); + switch (statusCode) { + + // ranged response, as requested + case 206: + // get content length if available + if (m_response->ContainsKey(CONTENT_LENGTH_HEADER)) { + const std::string contentLengthString = m_response->GetValue(CONTENT_LENGTH_HEADER); + m_rangeEndPosition = m_filePosition + std::atoi(contentLengthString.c_str()); + } + return true; + + // full contents, not range + case 200: { + // skip up to current file position + RaiiBuffer tmp(0x8000); + int64_t numBytesRead = 0; + while (numBytesRead < m_filePosition) { + + // read data from response + const int64_t remaining = m_filePosition - numBytesRead; + const std::size_t bytesToRead = + static_cast<std::size_t>((remaining > 0x8000) ? 
0x8000 : remaining); + const int64_t socketBytesRead = ReadFromSocket(tmp.Buffer, bytesToRead); + + // if error + if (socketBytesRead < 0) { + SetErrorString("BamHttp::SendGetRequest", m_socket->GetErrorString()); + Close(); + return false; + } + + // else if EOF + else if (socketBytesRead == 0 && m_socket->BufferBytesAvailable() == 0) + break; + + // update byte counter + numBytesRead += socketBytesRead; + } + + // return success + return (numBytesRead == m_filePosition); + } + + // any other status codes + default: + break; + } + + // fail on unexpected status code + SetErrorString("BamHttp::SendGetRequest", "unsupported status code in response"); + Close(); + return false; +} + +bool BamHttp::SendHeadRequest() +{ + + // ensure clean slate + ClearResponse(); + if (m_request) delete m_request; + m_socket->ClearBuffer(); + + // make sure we're connected + if (!EnsureSocketConnection()) return false; + + // create request + m_request = new HttpRequestHeader(HEAD_METHOD, m_filename); + m_request->SetField(HOST_HEADER, m_hostname); + + // send request + const std::string requestHeader = m_request->ToString(); + const int64_t headerSize = requestHeader.size(); + if (WriteToSocket(requestHeader.c_str(), headerSize) != headerSize) { + SetErrorString("BamHttp::SendHeadRequest", m_socket->GetErrorString()); + return false; + } + + m_socket->ClearBuffer(); + + // wait for response from server + if (!ReceiveResponse()) { + SetErrorString("BamHttp::SendHeadRequest", m_socket->GetErrorString()); + Close(); + return false; + } + BT_ASSERT_X(m_response, "BamHttp::SendHeadRequest : null HttpResponse"); + BT_ASSERT_X(m_response->IsValid(), "BamHttp::SendHeadRequest : invalid HttpResponse"); + + // get content length if available + if (m_response->ContainsKey(CONTENT_LENGTH_HEADER)) { + const std::string contentLengthString = m_response->GetValue(CONTENT_LENGTH_HEADER); + m_fileEndPosition = std::atoi(contentLengthString.c_str()) - 1; + } + + // return whether we found any errors + 
return m_socket->GetError() == TcpSocket::NoError; +} + +int64_t BamHttp::Tell() const +{ + return (IsOpen() ? m_filePosition : -1); +} + +int64_t BamHttp::Write(const char* data, const unsigned int numBytes) +{ + (void)data; + (void)numBytes; + BT_ASSERT_X(false, "BamHttp::Write : write-mode not supported on this device"); + SetErrorString("BamHttp::Write", "write-mode not supported on this device"); + return -1; +} + +int64_t BamHttp::WriteToSocket(const char* data, const unsigned int numBytes) +{ + if (!m_socket->IsConnected()) return -1; + m_socket->ClearBuffer(); + return m_socket->Write(data, numBytes); +} diff --git a/src/api/internal/io/BamHttp_p.h b/src/api/internal/io/BamHttp_p.h new file mode 100644 index 0000000..62d0d7b --- /dev/null +++ b/src/api/internal/io/BamHttp_p.h @@ -0,0 +1,92 @@ +// *************************************************************************** +// BamHttp_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides reading/writing of BAM files on HTTP server +// *************************************************************************** + +#ifndef BAMHTTP_P_H +#define BAMHTTP_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include <cstddef> +#include <string> +#include "api/IBamIODevice.h" + +namespace BamTools { +namespace Internal { + +class HttpRequestHeader; +class HttpResponseHeader; +class TcpSocket; + +class BamHttp : public IBamIODevice +{ + + // ctor & dtor +public: + BamHttp(const std::string& url); + ~BamHttp(); + + // IBamIODevice implementation +public: + void Close(); + bool IsOpen() const; + bool IsRandomAccess() const; + bool Open(const IBamIODevice::OpenMode mode); + int64_t Read(char* data, const unsigned int numBytes); + bool Seek(const int64_t& position, const int origin = SEEK_SET); + int64_t Tell() const; + int64_t Write(const char* data, const unsigned int numBytes); + + // internal methods +private: + void ClearResponse(); + bool ConnectSocket(); + void DisconnectSocket(); + bool EnsureSocketConnection(); + void ParseUrl(const std::string& url); + int64_t ReadFromSocket(char* data, const unsigned int numBytes); + bool ReceiveResponse(); + bool SendGetRequest(const std::size_t numBytes = 0x10000); + bool SendHeadRequest(); + int64_t WriteToSocket(const char* data, const unsigned int numBytes); + + // data members +private: + // our main socket + TcpSocket* m_socket; + + // our connection data + std::string m_hostname; + std::string m_port; + std::string m_filename; + + // our last (active) request & response info + HttpRequestHeader* m_request; + HttpResponseHeader* m_response; + + // internal state flags + bool m_isUrlParsed; + + // file position + int64_t m_filePosition; + int64_t m_fileEndPosition; + int64_t m_rangeEndPosition; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMHTTP_P_H diff --git a/src/api/internal/io/BamPipe_p.cpp b/src/api/internal/io/BamPipe_p.cpp new file mode 100644 index 0000000..3dd2c94 --- /dev/null +++ b/src/api/internal/io/BamPipe_p.cpp @@ -0,0 +1,73 @@ +// *************************************************************************** +// BamPipe_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of 
Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 18 October 2012 (DB) +// --------------------------------------------------------------------------- +// Provides BAM pipe-specific IO behavior +// *************************************************************************** + +#include "api/internal/io/BamPipe_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cstdio> +#include <iostream> + +BamPipe::BamPipe() + : ILocalIODevice() +{} + +BamPipe::~BamPipe() {} + +bool BamPipe::IsRandomAccess() const +{ + return false; +} + +bool BamPipe::Open(const IBamIODevice::OpenMode mode) +{ + + // make sure we're starting with a fresh pipe + Close(); + + // open stdin/stdout depending on requested openmode +#if defined(SYSTEM_NODEJS) && SYSTEM_NODEJS == 1 + if (mode == IBamIODevice::ReadOnly) + m_stream = stdin; + else if (mode == IBamIODevice::WriteOnly) + m_stream = stdout; +#else + if (mode == IBamIODevice::ReadOnly) + m_stream = freopen(0, "rb", stdin); + else if (mode == IBamIODevice::WriteOnly) + m_stream = freopen(0, "wb", stdout); +#endif // SYSTEM_NODEJS + + else { + const std::string errorType = + std::string((mode == IBamIODevice::ReadWrite) ? "unsupported" : "unknown"); + const std::string message = errorType + " open mode requested"; + SetErrorString("BamPipe::Open", message); + return false; + } + + // check that we obtained a valid FILE* + if (m_stream == 0) { + const std::string message_base = std::string("could not open handle on "); + const std::string message = + message_base + ((mode == IBamIODevice::ReadOnly) ? 
"stdin" : "stdout"); + SetErrorString("BamPipe::Open", message); + return false; + } + + // store current IO mode & return success + m_mode = mode; + return true; +} + +bool BamPipe::Seek(const int64_t&, const int) +{ + SetErrorString("BamPipe::Seek", "random access not allowed in FIFO pipe"); + return false; +} diff --git a/src/api/internal/io/BamPipe_p.h b/src/api/internal/io/BamPipe_p.h new file mode 100644 index 0000000..764823d --- /dev/null +++ b/src/api/internal/io/BamPipe_p.h @@ -0,0 +1,47 @@ +// *************************************************************************** +// BamPipe_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides BAM pipe-specific IO behavior +// *************************************************************************** + +#ifndef BAMPIPE_P_H +#define BAMPIPE_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include <string> +#include "api/internal/io/ILocalIODevice_p.h" + +namespace BamTools { +namespace Internal { + +class BamPipe : public ILocalIODevice +{ + + // ctor & dtor +public: + BamPipe(); + ~BamPipe(); + + // IBamIODevice implementation +public: + bool IsRandomAccess() const; + bool Open(const IBamIODevice::OpenMode mode); + bool Seek(const int64_t& position, const int origin = SEEK_SET); +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMPIPE_P_H diff --git a/src/api/internal/io/BgzfStream_p.cpp b/src/api/internal/io/BgzfStream_p.cpp new file mode 100644 index 0000000..1adf87e --- /dev/null +++ b/src/api/internal/io/BgzfStream_p.cpp @@ -0,0 +1,468 @@ +// *************************************************************************** +// BgzfStream_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 17 January 2012(DB) +// --------------------------------------------------------------------------- +// Based on BGZF routines developed at the Broad Institute. 
+// Provides the basic functionality for reading & writing BGZF files +// Replaces the old BGZF.* files to avoid clashing with other toolkits +// *************************************************************************** + +#include "api/internal/io/BgzfStream_p.h" +#include "api/BamAux.h" +#include "api/BamConstants.h" +#include "api/internal/io/BamDeviceFactory_p.h" +#include "api/internal/utils/BamException_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <zlib.h> + +#include <algorithm> +#include <cstddef> +#include <cstring> +#include <iostream> +#include <sstream> + +// --------------------------- +// BgzfStream implementation +// --------------------------- + +// constructor +BgzfStream::BgzfStream() + : m_blockLength(0) + , m_blockOffset(0) + , m_blockAddress(0) + , m_isWriteCompressed(true) + , m_device(0) + , m_uncompressedBlock(Constants::BGZF_DEFAULT_BLOCK_SIZE) + , m_compressedBlock(Constants::BGZF_MAX_BLOCK_SIZE) +{} + +// destructor +BgzfStream::~BgzfStream() +{ + Close(); +} + +// checks BGZF block header +bool BgzfStream::CheckBlockHeader(char* header) +{ + return (header[0] == Constants::GZIP_ID1 && header[1] == Constants::GZIP_ID2 && + header[2] == Z_DEFLATED && (header[3] & Constants::FLG_FEXTRA) != 0 && + BamTools::UnpackUnsignedShort(&header[10]) == Constants::BGZF_XLEN && + header[12] == Constants::BGZF_ID1 && header[13] == Constants::BGZF_ID2 && + BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN); +} + +// closes BGZF file +void BgzfStream::Close() +{ + + // skip if no device open + if (m_device == 0) return; + + // if writing to file, flush the current BGZF block, + // then write an empty block (as EOF marker) + if (m_device->IsOpen() && (m_device->Mode() == IBamIODevice::WriteOnly)) { + FlushBlock(); + const std::size_t blockLength = DeflateBlock(0); + m_device->Write(m_compressedBlock.Buffer, blockLength); + } + + // close device + m_device->Close(); + delete m_device; + m_device = 0; + + 
// ensure our buffers are cleared out + m_uncompressedBlock.Clear(); + m_compressedBlock.Clear(); + + // reset state + m_blockLength = 0; + m_blockOffset = 0; + m_blockAddress = 0; + m_isWriteCompressed = true; +} + +// compresses the current block +std::size_t BgzfStream::DeflateBlock(int32_t blockLength) +{ + + // initialize the gzip header + char* buffer = m_compressedBlock.Buffer; + memset(buffer, 0, 18); + buffer[0] = Constants::GZIP_ID1; + buffer[1] = Constants::GZIP_ID2; + buffer[2] = Constants::CM_DEFLATE; + buffer[3] = Constants::FLG_FEXTRA; + buffer[9] = Constants::OS_UNKNOWN; + buffer[10] = Constants::BGZF_XLEN; + buffer[12] = Constants::BGZF_ID1; + buffer[13] = Constants::BGZF_ID2; + buffer[14] = Constants::BGZF_LEN; + + // set compression level + const int compressionLevel = (m_isWriteCompressed ? Z_DEFAULT_COMPRESSION : 0); + + // loop to retry for blocks that do not compress enough + int inputLength = blockLength; + std::size_t compressedLength = 0; + const unsigned int bufferSize = Constants::BGZF_MAX_BLOCK_SIZE; + + while (true) { + + // initialize zstream values + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = (Bytef*)m_uncompressedBlock.Buffer; + zs.avail_in = inputLength; + zs.next_out = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH]; + zs.avail_out = + bufferSize - Constants::BGZF_BLOCK_HEADER_LENGTH - Constants::BGZF_BLOCK_FOOTER_LENGTH; + + // initialize the zlib compression algorithm + int status = deflateInit2(&zs, compressionLevel, Z_DEFLATED, Constants::GZIP_WINDOW_BITS, + Constants::Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (status != Z_OK) + throw BamException("BgzfStream::DeflateBlock", "zlib deflateInit2 failed"); + + // compress the data + status = deflate(&zs, Z_FINISH); + + // if not at stream end + if (status != Z_STREAM_END) { + + deflateEnd(&zs); + + // there was not enough space available in buffer + // try to reduce the input length & re-start loop + if (status == Z_OK) { + inputLength -= 1024; + 
if (inputLength < 0) + throw BamException("BgzfStream::DeflateBlock", "input reduction failed"); + continue; + } + + throw BamException("BgzfStream::DeflateBlock", "zlib deflate failed"); + } + + // finalize the compression routine + status = deflateEnd(&zs); + if (status != Z_OK) + throw BamException("BgzfStream::DeflateBlock", "zlib deflateEnd failed"); + + // update compressedLength + compressedLength = zs.total_out + Constants::BGZF_BLOCK_HEADER_LENGTH + + Constants::BGZF_BLOCK_FOOTER_LENGTH; + if (compressedLength > Constants::BGZF_MAX_BLOCK_SIZE) + throw BamException("BgzfStream::DeflateBlock", "deflate overflow"); + + // quit while loop + break; + } + + // store the compressed length + BamTools::PackUnsignedShort(&buffer[16], static_cast<uint16_t>(compressedLength - 1)); + + // store the CRC32 checksum + uint32_t crc = crc32(0, NULL, 0); + crc = crc32(crc, (Bytef*)m_uncompressedBlock.Buffer, inputLength); + BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc); + BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength); + + // ensure that we have less than a block of data left + int remaining = blockLength - inputLength; + if (remaining > 0) { + if (remaining > inputLength) + throw BamException("BgzfStream::DeflateBlock", "after deflate, remainder too large"); + memcpy(m_uncompressedBlock.Buffer, m_uncompressedBlock.Buffer + inputLength, remaining); + } + + // update block data + m_blockOffset = remaining; + + // return result + return compressedLength; +} + +// flushes the data in the BGZF block +void BgzfStream::FlushBlock() +{ + + BT_ASSERT_X(m_device, "BgzfStream::FlushBlock() - attempting to flush to null device"); + + // flush all of the remaining blocks + while (m_blockOffset > 0) { + + // compress the data block + const std::size_t blockLength = DeflateBlock(m_blockOffset); + + // flush the data to our output device + const int64_t numBytesWritten = m_device->Write(m_compressedBlock.Buffer, blockLength); + + // check for device 
error + if (numBytesWritten < 0) { + const std::string message = std::string("device error: ") + m_device->GetErrorString(); + throw BamException("BgzfStream::FlushBlock", message); + } + + // check that we wrote expected numBytes + if (numBytesWritten != static_cast<int64_t>(blockLength)) { + std::stringstream s; + s << "expected to write " << blockLength << " bytes during flushing, but wrote " + << numBytesWritten; + throw BamException("BgzfStream::FlushBlock", s.str()); + } + + // update block data + m_blockAddress += blockLength; + } +} + +// decompresses the current block +std::size_t BgzfStream::InflateBlock(const std::size_t& blockLength) +{ + + // setup zlib stream object + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = (Bytef*)m_compressedBlock.Buffer + 18; + zs.avail_in = blockLength - 16; + zs.next_out = (Bytef*)m_uncompressedBlock.Buffer; + zs.avail_out = Constants::BGZF_DEFAULT_BLOCK_SIZE; + + // initialize + int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS); + if (status != Z_OK) throw BamException("BgzfStream::InflateBlock", "zlib inflateInit failed"); + + // decompress + status = inflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + inflateEnd(&zs); + throw BamException("BgzfStream::InflateBlock", "zlib inflate failed"); + } + + // finalize + status = inflateEnd(&zs); + if (status != Z_OK) { + inflateEnd(&zs); + throw BamException("BgzfStream::InflateBlock", "zlib inflateEnd failed"); + } + + // return result + return zs.total_out; +} + +bool BgzfStream::IsOpen() const +{ + if (m_device == 0) return false; + return m_device->IsOpen(); +} + +void BgzfStream::Open(const std::string& filename, const IBamIODevice::OpenMode mode) +{ + + // close current device if necessary + Close(); + BT_ASSERT_X((m_device == 0), + "BgzfStream::Open() - unable to properly close previous IO device"); + + // retrieve new IO device depending on filename + m_device = BamDeviceFactory::CreateDevice(filename); + BT_ASSERT_X(m_device, 
"BgzfStream::Open() - unable to create IO device from filename"); + + // if device fails to open + if (!m_device->Open(mode)) { + const std::string deviceError = m_device->GetErrorString(); + const std::string message = std::string("could not open BGZF stream: \n\t") + deviceError; + throw BamException("BgzfStream::Open", message); + } +} + +// reads BGZF data into a byte buffer +std::size_t BgzfStream::Read(char* data, const std::size_t dataLength) +{ + + if (dataLength == 0) return 0; + + // if stream not open for reading + BT_ASSERT_X(m_device, "BgzfStream::Read() - trying to read from null device"); + if (!m_device->IsOpen() || (m_device->Mode() != IBamIODevice::ReadOnly)) return 0; + + // read blocks as needed until desired data length is retrieved + char* output = data; + std::size_t numBytesRead = 0; + while (numBytesRead < dataLength) { + + // determine bytes available in current block + int bytesAvailable = m_blockLength - m_blockOffset; + + // read (and decompress) next block if needed + if (bytesAvailable <= 0) { + ReadBlock(); + bytesAvailable = m_blockLength - m_blockOffset; + if (bytesAvailable <= 0) break; + } + + // copy data from uncompressed source buffer into data destination buffer + const std::size_t copyLength = + std::min((dataLength - numBytesRead), static_cast<std::size_t>(bytesAvailable)); + memcpy(output, m_uncompressedBlock.Buffer + m_blockOffset, copyLength); + + // update counters + m_blockOffset += copyLength; + output += copyLength; + numBytesRead += copyLength; + } + + // update block data + if (m_blockOffset == m_blockLength) { + m_blockAddress = m_device->Tell(); + m_blockOffset = 0; + m_blockLength = 0; + } + + // return actual number of bytes read + return numBytesRead; +} + +// reads a BGZF block +void BgzfStream::ReadBlock() +{ + + BT_ASSERT_X(m_device, "BgzfStream::ReadBlock() - trying to read from null IO device"); + + // store block's starting address + const int64_t blockAddress = m_device->Tell(); + + // read block header 
from file + char header[Constants::BGZF_BLOCK_HEADER_LENGTH]; + int64_t numBytesRead = m_device->Read(header, Constants::BGZF_BLOCK_HEADER_LENGTH); + + // check for device error + if (numBytesRead < 0) { + const std::string message = std::string("device error: ") + m_device->GetErrorString(); + throw BamException("BgzfStream::ReadBlock", message); + } + + // if block header empty + if (numBytesRead == 0) { + m_blockLength = 0; + return; + } + + // if block header invalid size + if (numBytesRead != static_cast<int8_t>(Constants::BGZF_BLOCK_HEADER_LENGTH)) + throw BamException("BgzfStream::ReadBlock", "invalid block header size"); + + // validate block header contents + if (!BgzfStream::CheckBlockHeader(header)) + throw BamException("BgzfStream::ReadBlock", "invalid block header contents"); + + // copy header contents to compressed buffer + const std::size_t blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1; + memcpy(m_compressedBlock.Buffer, header, Constants::BGZF_BLOCK_HEADER_LENGTH); + + // read remainder of block + const std::size_t remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH; + numBytesRead = + m_device->Read(&m_compressedBlock.Buffer[Constants::BGZF_BLOCK_HEADER_LENGTH], remaining); + + // check for device error + if (numBytesRead < 0) { + const std::string message = std::string("device error: ") + m_device->GetErrorString(); + throw BamException("BgzfStream::ReadBlock", message); + } + + // check that we read in expected numBytes + if (numBytesRead != static_cast<int64_t>(remaining)) + throw BamException("BgzfStream::ReadBlock", "could not read data from block"); + + // decompress block data + const std::size_t newBlockLength = InflateBlock(blockLength); + + // update block data + if (m_blockLength != 0) m_blockOffset = 0; + m_blockAddress = blockAddress; + m_blockLength = newBlockLength; +} + +// seek to position in BGZF file +void BgzfStream::Seek(const int64_t& position) +{ + + BT_ASSERT_X(m_device, "BgzfStream::Seek() - trying 
to seek on null IO device"); + + // skip if device is not open + if (!IsOpen()) return; + + // determine adjusted offset & address + int blockOffset = (position & 0xFFFF); + int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; + + // attempt seek in file + if (m_device->IsRandomAccess() && m_device->Seek(blockAddress)) { + + // update block data & return success + m_blockLength = 0; + m_blockAddress = blockAddress; + m_blockOffset = blockOffset; + } else { + std::stringstream s; + s << "unable to seek to position: " << position; + throw BamException("BgzfStream::Seek", s.str()); + } +} + +void BgzfStream::SetWriteCompressed(bool ok) +{ + m_isWriteCompressed = ok; +} + +// get file position in BGZF file +int64_t BgzfStream::Tell() const +{ + if (!IsOpen()) return 0; + return ((m_blockAddress << 16) | (m_blockOffset & 0xFFFF)); +} + +// writes the supplied data into the BGZF buffer +std::size_t BgzfStream::Write(const char* data, const std::size_t dataLength) +{ + + BT_ASSERT_X(m_device, "BgzfStream::Write() - trying to write to null IO device"); + BT_ASSERT_X((m_device->Mode() == IBamIODevice::WriteOnly), + "BgzfStream::Write() - trying to write to non-writable IO device"); + + // skip if file not open for writing + if (!IsOpen()) return 0; + + // write blocks as needed til all data is written + std::size_t numBytesWritten = 0; + const char* input = data; + const std::size_t blockLength = Constants::BGZF_DEFAULT_BLOCK_SIZE; + while (numBytesWritten < dataLength) { + + // copy data contents to uncompressed output buffer + unsigned int copyLength = + std::min(blockLength - m_blockOffset, dataLength - numBytesWritten); + char* buffer = m_uncompressedBlock.Buffer; + memcpy(buffer + m_blockOffset, input, copyLength); + + // update counter + m_blockOffset += copyLength; + input += copyLength; + numBytesWritten += copyLength; + + // flush (& compress) output buffer when full + if (m_blockOffset == static_cast<int32_t>(blockLength)) FlushBlock(); + } + + // return 
actual number of bytes written + return numBytesWritten; +} diff --git a/src/api/internal/io/BgzfStream_p.h b/src/api/internal/io/BgzfStream_p.h new file mode 100644 index 0000000..abf3290 --- /dev/null +++ b/src/api/internal/io/BgzfStream_p.h @@ -0,0 +1,95 @@ +// *************************************************************************** +// BgzfStream_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 17 January 2012(DB) +// --------------------------------------------------------------------------- +// Based on BGZF routines developed at the Broad Institute. +// Provides the basic functionality for reading & writing BGZF files +// Replaces the old BGZF.* files to avoid clashing with other toolkits +// *************************************************************************** + +#ifndef BGZFSTREAM_P_H +#define BGZFSTREAM_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
// Provides block-wise reading & writing of BGZF data on top of an IBamIODevice.
//
// Positions passed to Seek() and returned by Tell() are 'virtual offsets':
// the block's address in the upper bits (shifted left by 16) combined with
// the offset inside that block in the lower 16 bits.
//
// NOTE(review): data members are declared public; presumably other internal
// classes touch them directly - confirm before tightening access.
class BgzfStream
{

    // constructor & destructor
public:
    BgzfStream();
    ~BgzfStream();

    // main interface methods
public:
    // closes BGZF file
    void Close();
    // returns true if BgzfStream open for IO
    bool IsOpen() const;
    // opens the BGZF file (throws BamException if the device cannot be opened)
    void Open(const std::string& filename, const IBamIODevice::OpenMode mode);
    // reads BGZF data into a byte buffer
    // returns the number of bytes actually copied into @data; this is 0 when
    // @dataLength is 0 or the device is not open in read-only mode
    std::size_t Read(char* data, const std::size_t dataLength);
    // seek to position (virtual offset) in BGZF file
    // does nothing if the stream is not open; throws BamException if the device
    // is not random-access or the device seek fails
    void Seek(const int64_t& position);
    // sets IO device (closes previous, if any, but does not attempt to open)
    void SetIODevice(IBamIODevice* device);
    // enable/disable compressed output
    void SetWriteCompressed(bool ok);
    // get file position (virtual offset) in BGZF file; returns 0 if not open
    int64_t Tell() const;
    // writes the supplied data into the BGZF buffer
    // returns the number of bytes consumed from @data (0 if not open)
    std::size_t Write(const char* data, const std::size_t dataLength);

    // internal methods
private:
    // compresses the current block
    std::size_t DeflateBlock(int32_t blockLength);
    // flushes the data in the BGZF block
    void FlushBlock();
    // de-compresses the current block
    std::size_t InflateBlock(const std::size_t& blockLength);
    // reads a BGZF block
    void ReadBlock();

    // static 'utility' methods
public:
    // checks BGZF block header
    static bool CheckBlockHeader(char* header);

    // data members
public:
    int32_t m_blockLength;   // length of the current block's uncompressed data (0 if none loaded)
    int32_t m_blockOffset;   // current read/write position within the uncompressed block
    int64_t m_blockAddress;  // device position at which the current block starts

    bool m_isWriteCompressed;  // set via SetWriteCompressed()
    IBamIODevice* m_device;    // underlying IO device

    RaiiBuffer m_uncompressedBlock;  // holds uncompressed block data
    RaiiBuffer m_compressedBlock;    // holds raw (compressed) block data, header included
};
@@ -0,0 +1,120 @@ +// *************************************************************************** +// ByteArray_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides a dynamic, variable-length byte buffer +// *************************************************************************** + +#include "api/internal/io/ByteArray_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cstddef> +#include <cstdlib> +#include <cstring> + +// -------------------------- +// ByteArray implementation +// -------------------------- + +ByteArray::ByteArray() + : m_data() +{} + +ByteArray::ByteArray(const std::string& value) + : m_data(value.begin(), value.end()) +{} + +ByteArray::ByteArray(const std::vector<char>& value) + : m_data(value) +{} + +ByteArray::ByteArray(const char* value, std::size_t n) +{ + const std::string s(value, n); + m_data.assign(s.begin(), s.end()); +} + +ByteArray::ByteArray(const ByteArray& other) + : m_data(other.m_data) +{} + +ByteArray::~ByteArray() {} + +ByteArray& ByteArray::operator=(const ByteArray& other) +{ + m_data = other.m_data; + return *this; +} + +void ByteArray::Clear() +{ + m_data.clear(); +} + +const char* ByteArray::ConstData() const +{ + return &m_data[0]; +} + +char* ByteArray::Data() +{ + return &m_data[0]; +} + +const char& ByteArray::operator[](std::size_t i) const +{ + return m_data[i]; +} + +char& ByteArray::operator[](std::size_t i) +{ + return m_data[i]; +} + +std::size_t ByteArray::IndexOf(const char c, const std::size_t from, const std::size_t to) const +{ + const std::size_t size = ((to == 0) ? 
m_data.size() : to); + for (std::size_t i = from; i < size; ++i) { + if (m_data.at(i) == c) return i; + } + return m_data.size(); +} + +ByteArray& ByteArray::Remove(std::size_t from, std::size_t n) +{ + + // if 'from' outside range, just return + const std::size_t originalSize = m_data.size(); + if (from >= originalSize) return *this; + + // if asked to clip from 'from' to end (or beyond), simply resize + if (from + n >= originalSize) Resize(from); + + // otherwise, shift data & resize + else { + memmove(&m_data[from], &m_data[from + n], (originalSize - from - n)); + Resize(originalSize - n); + } + + // return reference to modified byte array + return *this; +} + +void ByteArray::Resize(std::size_t n) +{ + m_data.resize(n, 0); +} + +std::size_t ByteArray::Size() const +{ + return m_data.size(); +} + +void ByteArray::Squeeze() +{ + std::vector<char> t(m_data); + t.swap(m_data); +} diff --git a/src/api/internal/io/ByteArray_p.h b/src/api/internal/io/ByteArray_p.h new file mode 100644 index 0000000..9f0f527 --- /dev/null +++ b/src/api/internal/io/ByteArray_p.h @@ -0,0 +1,70 @@ +// *************************************************************************** +// ByteArray_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides a dynamic, variable-length byte buffer +// *************************************************************************** + +#ifndef BYTEARRAY_P_H +#define BYTEARRAY_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
// provides a wrapper around a byte vector
//
// Storage is a std::vector<char>; copies of a ByteArray copy the bytes.
class ByteArray
{

    // ctors & dtor
public:
    ByteArray();
    // copies the characters of @value
    ByteArray(const std::string& value);
    // copies the contents of @value
    ByteArray(const std::vector<char>& value);
    // copies @n bytes starting at @value
    ByteArray(const char* value, std::size_t n);
    ByteArray(const ByteArray& other);
    ~ByteArray();

    ByteArray& operator=(const ByteArray& other);

    // ByteArray interface
public:
    // data access
    // (pointer to the first stored byte; operator[] performs no bounds checking)
    const char* ConstData() const;
    char* Data();
    const char& operator[](std::size_t i) const;
    char& operator[](std::size_t i);

    // byte array manipulation

    // removes all bytes
    void Clear();
    // returns index of first occurrence of @c in [from, to), or Size() if not found
    // (a @to of 0 means 'search through the end of the array')
    std::size_t IndexOf(const char c, const std::size_t from = 0, const std::size_t to = 0) const;
    // removes @n bytes starting at @from (clipping at the end of the array);
    // no-op if @from is past the end
    ByteArray& Remove(std::size_t from, std::size_t n);
    // resizes to @n bytes; any newly added bytes are zero
    void Resize(std::size_t n);
    // returns number of bytes stored
    std::size_t Size() const;
    // swaps storage with a fresh copy of itself
    // NOTE(review): presumably intended to release unused capacity - confirm
    void Squeeze();

    // data members
private:
    std::vector<char> m_data;
};
// split a string into fields, on delimiter character
//
// Fields are extracted with std::getline(), so adjacent delimiters yield an
// empty field while a single trailing delimiter does not add one.
static inline std::vector<std::string> Split(const std::string& source, char delim)
{
    std::vector<std::string> tokens;
    std::istringstream reader(source);
    for (std::string token; std::getline(reader, token, delim);)
        tokens.push_back(token);
    return tokens;
}
// parses a dotted-quad IPv4 string ("a.b.c.d") into a 32-bit value
//
// On success, stores the address in @maybeIp4 (first field in the most
// significant byte) and returns true. On failure, returns false and leaves
// @maybeIp4 untouched.
//
// Exactly four fields are required; each must consist of one or more decimal
// digits with a value in [0, 255]. Empty fields ("1..2.3"), a trailing '.',
// and any non-digit character are rejected. Digits are accumulated with an
// early range check, so arbitrarily long digit runs cannot overflow.
static bool ParseIp4(const std::string& address, uint32_t& maybeIp4)
{

    const std::size_t length = address.size();
    std::size_t pos = 0;
    unsigned int fieldCount = 0;
    uint32_t ipv4(0);

    for (;;) {

        // convert current field to integer value
        uint32_t value = 0;
        std::size_t numDigits = 0;
        while (pos < length && address[pos] >= '0' && address[pos] <= '9') {
            value = value * 10 + static_cast<uint32_t>(address[pos] - '0');
            if (value > 255) return false;
            ++numDigits;
            ++pos;
        }

        // field must contain at least one digit
        if (numDigits == 0) return false;

        // append byte value
        ipv4 = (ipv4 << 8) | value;
        ++fieldCount;

        // end of input reached
        if (pos == length) break;

        // anything other than a separator leading into another field is invalid
        if (address[pos] != '.' || fieldCount == 4) return false;
        ++pos;
    }

    if (fieldCount != 4) return false;

    // store 32-bit IP address & return success
    maybeIp4 = ipv4;
    return true;
}
(isMixed ? 7 : 8))) return false; + + // iterate over provided fields + std::size_t index = 16; + std::size_t fillCount = 9 - numFields; + for (int8_t i = numFields - 1; i >= 0; --i) { + if (index == 0) return false; + const std::string& field = fields.at(i); + + // if field empty + if (field.empty()) { + + // if last field empty + if (i == numFields - 1) { + const std::string& previousField = fields.at(i - 1); + if (previousField.empty()) return false; + maybeIp6[--index] = 0; + maybeIp6[--index] = 0; + } + + // if first field empty + else if (i == 0) { + // make sure ':' isn't first character + const std::string& nextField = fields.at(i + 1); + if (nextField.empty()) return false; + maybeIp6[--index] = 0; + maybeIp6[--index] = 0; + } + + // fill in 'compressed' 0s + else { + for (uint8_t j = 0; j < fillCount; ++j) { + if (index == 0) return false; + maybeIp6[--index] = 0; + maybeIp6[--index] = 0; + } + } + } + + // field has data + else { + uint32_t value = static_cast<uint32_t>(strtoul(field.c_str(), 0, 16)); + + if (value <= 0xffff) { + maybeIp6[--index] = value & 0xff; + maybeIp6[--index] = (value >> 8) & 0xff; + } + + // possible mixed IPv4/6 notation + else { + + // mixed field must be last + if (i != numFields - 1) return false; + + // parse the IPv4 section + uint32_t maybeIp4; + if (!ParseIp4(field, maybeIp4)) return false; + + // store IPv4 fields in IPv6 container + maybeIp6[--index] = maybeIp4 & 0xff; + maybeIp6[--index] = (maybeIp4 >> 8) & 0xff; + maybeIp6[--index] = (maybeIp4 >> 16) & 0xff; + maybeIp6[--index] = (maybeIp4 >> 24) & 0xff; + --fillCount; + } + } + } + + // should have parsed OK, return success + return true; +} + +} // namespace Internal +} // namespace BamTools + +// ---------------------------- +// HostAddress implementation +// ---------------------------- + +HostAddress::HostAddress() + : m_protocol(HostAddress::UnknownNetworkProtocol) + , m_ip4Address(0) + , m_hasIpAddress(true) +{} + +HostAddress::HostAddress(const uint32_t 
ip4Address) + : m_protocol(HostAddress::UnknownNetworkProtocol) + , m_ip4Address(0) + , m_hasIpAddress(true) +{ + SetAddress(ip4Address); +} + +HostAddress::HostAddress(const uint8_t* ip6Address) + : m_protocol(HostAddress::UnknownNetworkProtocol) + , m_ip4Address(0) + , m_hasIpAddress(true) +{ + SetAddress(ip6Address); +} + +HostAddress::HostAddress(const IPv6Address& ip6Address) + : m_protocol(HostAddress::UnknownNetworkProtocol) + , m_ip4Address(0) + , m_hasIpAddress(true) +{ + SetAddress(ip6Address); +} + +HostAddress::HostAddress(const std::string& address) + : m_protocol(HostAddress::UnknownNetworkProtocol) + , m_ip4Address(0) +{ + SetAddress(address); +} + +HostAddress::HostAddress(const HostAddress& other) + : m_protocol(other.m_protocol) + , m_ip4Address(other.m_ip4Address) + , m_ip6Address(other.m_ip6Address) + , m_ipString(other.m_ipString) + , m_hasIpAddress(other.m_hasIpAddress) +{} + +HostAddress::~HostAddress() {} + +bool HostAddress::operator==(const HostAddress& other) const +{ + + // if self is IPv4 + if (m_protocol == HostAddress::IPv4Protocol) { + return (other.m_protocol == HostAddress::IPv4Protocol && + m_ip4Address == other.m_ip4Address); + } + + // if self is IPv6 + else if (m_protocol == HostAddress::IPv6Protocol) { + return (other.m_protocol == HostAddress::IPv6Protocol && + memcmp(&m_ip6Address, &other.m_ip6Address, sizeof(IPv6Address)) == 0); + } + + // otherwise compare protocols + else + return m_protocol == other.m_protocol; +} + +bool HostAddress::operator<(const HostAddress& other) const +{ + + // if self is IPv4 + if (m_protocol == HostAddress::IPv4Protocol) { + if (other.m_protocol == HostAddress::IPv4Protocol) return m_ip4Address < other.m_ip4Address; + } + + // if self is IPv6 + else if (m_protocol == HostAddress::IPv6Protocol) { + if (other.m_protocol == HostAddress::IPv6Protocol) + return (memcmp(&m_ip6Address, &other.m_ip6Address, sizeof(IPv6Address)) < 0); + } + + // otherwise compare protocol types + return m_protocol < 
other.m_protocol; +} + +void HostAddress::Clear() +{ + + m_protocol = HostAddress::UnknownNetworkProtocol; + m_ip4Address = 0; + memset(&m_ip6Address, 0, sizeof(IPv6Address)); + m_ipString.clear(); + + // this may feel funny, but cleared IP (equivalent to '0.0.0.0') is technically valid + // and that's not really what this flag is checking anyway + // + // this flag is false *iff* the string passed in is a 'plain-text' hostname (www.foo.bar) + m_hasIpAddress = true; +} + +bool HostAddress::HasIPAddress() const +{ + return m_hasIpAddress; +} + +bool HostAddress::IsNull() const +{ + return m_protocol == HostAddress::UnknownNetworkProtocol; +} + +uint32_t HostAddress::GetIPv4Address() const +{ + return m_ip4Address; +} + +IPv6Address HostAddress::GetIPv6Address() const +{ + return m_ip6Address; +} + +std::string HostAddress::GetIPString() const +{ + + std::stringstream ss; + + // IPv4 format + if (m_protocol == HostAddress::IPv4Protocol) { + ss << ((m_ip4Address >> 24) & 0xff) << '.' << ((m_ip4Address >> 16) & 0xff) << '.' + << ((m_ip4Address >> 8) & 0xff) << '.' << (m_ip4Address & 0xff); + + } + + // IPv6 format + else if (m_protocol == HostAddress::IPv6Protocol) { + for (uint8_t i = 0; i < 8; ++i) { + if (i != 0) ss << ':'; + ss << std::hex + << ((uint16_t(m_ip6Address[2 * i]) << 8) | (uint16_t(m_ip6Address[2 * i + 1]))); + } + } + + // return result (empty string if unknown protocol) + return ss.str(); +} + +HostAddress::NetworkProtocol HostAddress::GetProtocol() const +{ + return m_protocol; +} + +bool HostAddress::ParseAddress() +{ + + // all IPv6 addresses should have a ':' + std::string s = m_ipString; + std::size_t found = s.find(':'); + if (found != std::string::npos) { + // try parse IP6 address + uint8_t maybeIp6[16]; + if (ParseIp6(s, maybeIp6)) { + SetAddress(maybeIp6); + m_protocol = HostAddress::IPv6Protocol; + return true; + } + } + + // all IPv4 addresses should have a '.' 
+ found = s.find('.'); + if (found != std::string::npos) { + uint32_t maybeIp4(0); + if (ParseIp4(s, maybeIp4)) { + SetAddress(maybeIp4); + m_protocol = HostAddress::IPv4Protocol; + return true; + } + } + + // else likely just a plain-text host name "www.foo.bar" + // will need to look up IP address info later + m_protocol = HostAddress::UnknownNetworkProtocol; + return false; +} + +void HostAddress::SetAddress(const uint32_t ip4Address) +{ + m_ip4Address = ip4Address; + m_protocol = HostAddress::IPv4Protocol; + m_hasIpAddress = true; +} + +void HostAddress::SetAddress(const uint8_t* ip6Address) +{ + for (uint8_t i = 0; i < 16; ++i) + m_ip6Address[i] = ip6Address[i]; + m_protocol = HostAddress::IPv6Protocol; + m_hasIpAddress = true; +} + +void HostAddress::SetAddress(const IPv6Address& ip6Address) +{ + m_ip6Address = ip6Address; + m_ip4Address = 0; + m_protocol = HostAddress::IPv6Protocol; + m_hasIpAddress = true; +} + +void HostAddress::SetAddress(const std::string& address) +{ + m_ipString = address; + m_hasIpAddress = ParseAddress(); +} diff --git a/src/api/internal/io/HostAddress_p.h b/src/api/internal/io/HostAddress_p.h new file mode 100644 index 0000000..c330200 --- /dev/null +++ b/src/api/internal/io/HostAddress_p.h @@ -0,0 +1,117 @@ +// *************************************************************************** +// HostAddress_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides a generic IP address container +// *************************************************************************** + +#ifndef HOSTADDRESS_P_H +#define HOSTADDRESS_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. 
This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include <cstddef> +#include <cstring> +#include <string> +#include "api/api_global.h" + +namespace BamTools { +namespace Internal { + +struct IPv6Address +{ + + // ctor + inline IPv6Address() + { + memset(&data, 0, sizeof(uint8_t) * 16); + } + + // data access (no bounds checking) + inline uint8_t& operator[](std::size_t index) + { + return data[index]; + } + inline uint8_t operator[](std::size_t index) const + { + return data[index]; + } + + // data + uint8_t data[16]; +}; + +class HostAddress +{ + + // enums +public: + enum NetworkProtocol + { + UnknownNetworkProtocol = -1, + IPv4Protocol = 0, + IPv6Protocol + }; + + // ctors & dtor +public: + HostAddress(); + explicit HostAddress(const uint32_t ip4Address); + explicit HostAddress(const uint8_t* ip6Address); + explicit HostAddress(const IPv6Address& ip6Address); + explicit HostAddress(const std::string& address); + HostAddress(const HostAddress& other); + ~HostAddress(); + + // HostAddress interface +public: + void Clear(); + bool HasIPAddress() const; // returns whether string address could be converted to IP address + bool IsNull() const; + + uint32_t GetIPv4Address() const; + IPv6Address GetIPv6Address() const; + std::string GetIPString() const; + HostAddress::NetworkProtocol GetProtocol() const; + + void SetAddress(const uint32_t ip4Address); + void SetAddress(const uint8_t* ip6Address); + void SetAddress(const IPv6Address& ip6Address); + void SetAddress(const std::string& address); + + // HostAddress comparison operators +public: + bool operator==(const HostAddress& other) const; + bool operator!=(const HostAddress& other) const + { + return !(operator==(other)); + } + bool operator<(const HostAddress& other) const; + + // internal methods +private: + bool ParseAddress(); + + // data members +private: + HostAddress::NetworkProtocol m_protocol; + uint32_t m_ip4Address; + IPv6Address 
m_ip6Address; + std::string m_ipString; + bool m_hasIpAddress; // true until string passed in, then signifies whether string was an IP +}; + +} // namespace Internal +} // namespace BamTools + +#endif // HOSTADDRESS_P_H diff --git a/src/api/internal/io/HostInfo_p.cpp b/src/api/internal/io/HostInfo_p.cpp new file mode 100644 index 0000000..56b5165 --- /dev/null +++ b/src/api/internal/io/HostInfo_p.cpp @@ -0,0 +1,229 @@ +// *************************************************************************** +// HostInfo_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 8 December 2011 (DB) +// --------------------------------------------------------------------------- +// Provides DNS lookup functionality for hostname & its discovered addresses +// *************************************************************************** + +#include "api/internal/io/HostInfo_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +// platorm-specifics +#ifdef _WIN32 +#include "api/internal/io/NetWin_p.h" +#else +#include "api/internal/io/NetUnix_p.h" +#endif + +// standard C++ includes +#include <cstdlib> +#include <cstring> +#include <set> + +// ------------------------- +// HostInfo implementation +// ------------------------- + +HostInfo::HostInfo() + : m_error(HostInfo::NoError) +{} + +HostInfo::HostInfo(const HostInfo& other) + : m_hostName(other.m_hostName) + , m_addresses(other.m_addresses) + , m_error(other.m_error) + , m_errorString(other.m_errorString) +{} + +HostInfo::~HostInfo() {} + +std::vector<HostAddress> HostInfo::Addresses() const +{ + return m_addresses; +} + +HostInfo::ErrorType HostInfo::GetError() const +{ + return m_error; +} + +std::string HostInfo::GetErrorString() const +{ + return m_errorString; +} + +std::string HostInfo::HostName() const +{ + return m_hostName; +} + +void HostInfo::SetAddresses(const 
std::vector<HostAddress>& addresses) +{ + m_addresses = addresses; +} + +void HostInfo::SetError(const HostInfo::ErrorType error) +{ + m_error = error; +} + +void HostInfo::SetErrorString(const std::string& errorString) +{ + m_errorString = errorString; +} + +void HostInfo::SetHostName(const std::string& name) +{ + m_hostName = name; +} + +// --------------------------------- +// HostInfo::Lookup(host, port) +// - the real "heavy-lifter" here +// --------------------------------- + +HostInfo HostInfo::Lookup(const std::string& hostname, const std::string& port) +{ + + HostInfo result; + result.SetHostName(hostname); + std::set<HostAddress> uniqueAddresses; + +#ifdef _WIN32 + WindowsSockInit init; +#endif + + HostAddress address; + address.SetAddress(hostname); + + // if hostname is an IP string ('0.0.0.0' or IPv6 format) + // do reverse lookup for host domain name + // + // TODO: might just remove this... not sure if proper 'hostname' from IP string is needed + // + // so far, haven't been able to successfully fetch a domain name with reverse DNS + // getnameinfo() on test sites just returns original IP string. BUT this is likely a rare + // case that client code tries to use an IP string and the connection should work fine + // anyway. GetHostName() just won't quite show what I was hoping for. 
:( + if (address.HasIPAddress()) { + + const uint16_t portNum = static_cast<uint16_t>(std::atoi(port.c_str())); + + sockaddr_in sa4; + sockaddr_in6 sa6; + sockaddr* sa = 0; + BT_SOCKLEN_T saSize = 0; + + // IPv4 + if (address.GetProtocol() == HostAddress::IPv4Protocol) { + sa = (sockaddr*)&sa4; + saSize = sizeof(sa4); + memset(&sa4, 0, sizeof(sa4)); + sa4.sin_family = AF_INET; + sa4.sin_addr.s_addr = htonl(address.GetIPv4Address()); + sa4.sin_port = htons(portNum); + } + + // IPv6 + else if (address.GetProtocol() == HostAddress::IPv4Protocol) { + sa = (sockaddr*)&sa6; + saSize = sizeof(sa6); + memset(&sa6, 0, sizeof(sa6)); + sa6.sin6_family = AF_INET6; + memcpy(sa6.sin6_addr.s6_addr, address.GetIPv6Address().data, + sizeof(sa6.sin6_addr.s6_addr)); + sa6.sin6_port = htons(portNum); + } + + // unknown (should be unreachable) + else + BT_ASSERT_X(false, "HostInfo::Lookup: unknown network protocol"); + + // lookup name for IP + char hbuf[NI_MAXHOST]; + char serv[NI_MAXSERV]; + if (sa && (getnameinfo(sa, saSize, hbuf, sizeof(hbuf), serv, sizeof(serv), 0) == 0)) + result.SetHostName(std::string(hbuf)); + + // if no domain name found, just use the original address's IP string + if (result.HostName().empty()) result.SetHostName(address.GetIPString()); + + // store address in HostInfo + uniqueAddresses.insert(address); + } + + // otherwise, hostname is a domain name ('www.foo.bar') + // do 'normal' lookup + else { + + // setup address lookup 'hints' + addrinfo hints; + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; // allow either IPv4 or IPv6 + hints.ai_socktype = SOCK_STREAM; // for TCP + hints.ai_protocol = IPPROTO_TCP; + + // fetch addresses for requested hostname/port + addrinfo* res; + int status = getaddrinfo(hostname.c_str(), port.c_str(), &hints, &res); + + // if everything OK + if (status == 0) { + + // iterate over all IP addresses found + addrinfo* p = res; + for (; p != NULL; p = p->ai_next) { + + // IPv4 + if (p->ai_family == AF_INET) { + 
sockaddr_in* ipv4 = (sockaddr_in*)p->ai_addr; + HostAddress a(ntohl(ipv4->sin_addr.s_addr)); + uniqueAddresses.insert(a); + } + + // IPv6 + else if (p->ai_family == AF_INET6) { + sockaddr_in6* ipv6 = (sockaddr_in6*)p->ai_addr; + HostAddress a(ipv6->sin6_addr.s6_addr); + uniqueAddresses.insert(a); + } + } + + // if we iterated, but no addresses were stored + if (uniqueAddresses.empty() && (p == NULL)) { + result.SetError(HostInfo::UnknownError); + result.SetErrorString("HostInfo: unknown address types found"); + } + } + + // handle error cases + else if ( +#ifndef _WIN32 + status == EAI_NONAME || status == EAI_FAIL +#ifdef EAI_NODATA + || status == EAI_NODATA // officially deprecated, but just in case we happen to hit it +#endif // EAI_NODATA + +#else // _WIN32 + WSAGetLastError() == WSAHOST_NOT_FOUND || WSAGetLastError() == WSANO_DATA || + WSAGetLastError() == WSANO_RECOVERY +#endif // _WIN32 + ) { + result.SetError(HostInfo::HostNotFound); + result.SetErrorString("HostInfo: host not found"); + } else { + result.SetError(HostInfo::UnknownError); + result.SetErrorString("HostInfo: unknown error encountered"); + } + + // cleanup + freeaddrinfo(res); + } + + // store fetched addresses (converting set -> vector) in result & return + result.SetAddresses(std::vector<HostAddress>(uniqueAddresses.begin(), uniqueAddresses.end())); + return result; +} diff --git a/src/api/internal/io/HostInfo_p.h b/src/api/internal/io/HostInfo_p.h new file mode 100644 index 0000000..677073a --- /dev/null +++ b/src/api/internal/io/HostInfo_p.h @@ -0,0 +1,78 @@ +// *************************************************************************** +// HostInfo_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides DNS lookup functionality for hostname/IP addresses 
+// *************************************************************************** + +#ifndef HOSTINFO_P_H +#define HOSTINFO_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include <string> +#include <vector> +#include "api/internal/io/HostAddress_p.h" + +namespace BamTools { +namespace Internal { + +class HostInfo +{ + +public: + enum ErrorType + { + NoError = 0, + HostNotFound, + UnknownError + }; + + // ctors & dtor +public: + HostInfo(); + HostInfo(const HostInfo& other); + ~HostInfo(); + + // HostInfo interface +public: + std::string HostName() const; + void SetHostName(const std::string& name); + + std::vector<HostAddress> Addresses() const; + void SetAddresses(const std::vector<HostAddress>& addresses); + + HostInfo::ErrorType GetError() const; + std::string GetErrorString() const; + + // internal methods +private: + void SetError(const HostInfo::ErrorType error); + void SetErrorString(const std::string& errorString); + + // static methods +public: + static HostInfo Lookup(const std::string& hostname, const std::string& port); + + // data members +private: + std::string m_hostName; + std::vector<HostAddress> m_addresses; + HostInfo::ErrorType m_error; + std::string m_errorString; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // HOSTINFO_P_H diff --git a/src/api/internal/io/HttpHeader_p.cpp b/src/api/internal/io/HttpHeader_p.cpp new file mode 100644 index 0000000..bd25f2e --- /dev/null +++ b/src/api/internal/io/HttpHeader_p.cpp @@ -0,0 +1,403 @@ +// *************************************************************************** +// HttpHeader_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 
13 January 2012 (DB) +// --------------------------------------------------------------------------- +// Provides a generic interface for parsing/generating HTTP headers, along +// with specialized request & response header types +// *************************************************************************** + +#include "api/internal/io/HttpHeader_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cstddef> +#include <cstdlib> +#include <sstream> +#include <vector> + +namespace BamTools { + +// ----------- +// constants +// ----------- + +namespace Constants { + +static const char CAR_RET_CHAR = '\r'; +static const char COLON_CHAR = ':'; +static const char DOT_CHAR = '.'; +static const char NEWLINE_CHAR = '\n'; +static const char SPACE_CHAR = ' '; + +static const std::string FIELD_NEWLINE = "\r\n"; +static const std::string FIELD_SEPARATOR = ": "; +static const std::string HTTP_STRING = "HTTP/"; + +} // namespace Constants + +// ------------------------ +// static utility methods +// ------------------------ + +namespace Internal { + +static inline bool IsSpace(const char c) +{ + const int n = static_cast<int>(c); + return (n == 0 || (n <= 13 && n >= 9)); +} + +// split on hitting single char delim +static std::vector<std::string> Split(const std::string& source, const char delim) +{ + std::stringstream ss(source); + std::string field; + std::vector<std::string> fields; + while (std::getline(ss, field, delim)) + fields.push_back(field); + return fields; +} + +static std::string Trim(const std::string& source) +{ + + // skip if empty string + if (source.empty()) return source; + + // fetch string data + const char* s = source.data(); // ignoring null-term on purpose + const std::size_t size = source.size(); + std::size_t start = 0; + std::size_t end = size - 1; + + // skip if no spaces at start or end + if (!IsSpace(s[start]) && !IsSpace(s[end])) return source; + + // remove leading whitespace + while ((start != end) && 
IsSpace(s[start])) + ++start; + + // remove trailing whitespace + if (start <= end) { + while (end && IsSpace(s[end])) + --end; + } + + // return result + return std::string(s + start, (end - start) + 1); +} + +} // namespace Internal +} // namespace BamTools + +// --------------------------- +// HttpHeader implementation +// --------------------------- + +HttpHeader::HttpHeader() + : m_isValid(true) + , m_majorVersion(1) + , m_minorVersion(1) +{} + +HttpHeader::HttpHeader(const std::string& s) + : m_isValid(true) + , m_majorVersion(1) + , m_minorVersion(1) +{ + Parse(s); +} + +HttpHeader::~HttpHeader() {} + +bool HttpHeader::ContainsKey(const std::string& key) const +{ + return (m_fields.find(key) != m_fields.end()); +} + +int HttpHeader::GetMajorVersion() const +{ + return m_majorVersion; +} + +int HttpHeader::GetMinorVersion() const +{ + return m_minorVersion; +} + +std::string HttpHeader::GetValue(const std::string& key) +{ + if (ContainsKey(key)) + return m_fields[key]; + else + return std::string(); +} + +bool HttpHeader::IsValid() const +{ + return m_isValid; +} + +void HttpHeader::Parse(const std::string& s) +{ + + // trim whitespace from input string + const std::string trimmed = Trim(s); + + // split into list of header lines + std::vector<std::string> rawFields = Split(trimmed, Constants::NEWLINE_CHAR); + + // prep our 'cleaned' fields container + std::vector<std::string> cleanFields; + cleanFields.reserve(rawFields.size()); + + // remove any empty fields and clean any trailing windows-style carriage returns ('\r') + std::vector<std::string>::iterator rawFieldIter = rawFields.begin(); + std::vector<std::string>::iterator rawFieldEnd = rawFields.end(); + for (; rawFieldIter != rawFieldEnd; ++rawFieldIter) { + std::string& field = (*rawFieldIter); + + // skip empty fields + if (field.empty()) continue; + + // remove carriage returns + const std::size_t fieldSize = field.size(); + if (field[fieldSize - 1] == Constants::CAR_RET_CHAR) field.resize(fieldSize - 
1); + + // store cleaned field + cleanFields.push_back(field); + } + + // skip add'l processing if nothing here + if (cleanFields.empty()) return; + + // parse header lines + int lineNumber = 0; + std::vector<std::string>::const_iterator fieldIter = cleanFields.begin(); + std::vector<std::string>::const_iterator fieldEnd = cleanFields.end(); + for (; fieldIter != fieldEnd; ++fieldIter, ++lineNumber) { + if (!ParseLine((*fieldIter), lineNumber)) { + m_isValid = false; + return; + } + } +} + +bool HttpHeader::ParseLine(const std::string& line, int) +{ + + // find colon position, return failure if not found + const std::size_t colonFound = line.find(Constants::COLON_CHAR); + if (colonFound == std::string::npos) return false; + + // store key/value (without leading/trailing whitespace) & return success + const std::string key = Trim(line.substr(0, colonFound)); + const std::string value = Trim(line.substr(colonFound + 1)); + m_fields[key] = value; + return true; +} + +void HttpHeader::RemoveField(const std::string& key) +{ + m_fields.erase(key); +} + +void HttpHeader::SetField(const std::string& key, const std::string& value) +{ + m_fields[key] = value; +} + +void HttpHeader::SetValid(bool ok) +{ + m_isValid = ok; +} + +void HttpHeader::SetVersion(int major, int minor) +{ + m_majorVersion = major; + m_minorVersion = minor; +} + +std::string HttpHeader::ToString() const +{ + std::string result; + if (m_isValid) { + std::map<std::string, std::string>::const_iterator fieldIter = m_fields.begin(); + std::map<std::string, std::string>::const_iterator fieldEnd = m_fields.end(); + for (; fieldIter != fieldEnd; ++fieldIter) { + const std::string& key = (*fieldIter).first; + const std::string& value = (*fieldIter).second; + const std::string& line = + key + Constants::FIELD_SEPARATOR + value + Constants::FIELD_NEWLINE; + result += line; + } + } + return result; +} + +// ---------------------------------- +// HttpRequestHeader implementation +// 
---------------------------------- + +HttpRequestHeader::HttpRequestHeader(const std::string& method, const std::string& resource, + int majorVersion, int minorVersion) + : HttpHeader() + , m_method(method) + , m_resource(resource) +{ + SetVersion(majorVersion, minorVersion); +} + +HttpRequestHeader::~HttpRequestHeader() {} + +std::string HttpRequestHeader::GetMethod() const +{ + return m_method; +} + +std::string HttpRequestHeader::GetResource() const +{ + return m_resource; +} + +bool HttpRequestHeader::ParseLine(const std::string& line, int lineNumber) +{ + + // if not 'request line', just let base class parse + if (lineNumber != 0) return HttpHeader::ParseLine(line, lineNumber); + + // fail if empty line + if (line.empty()) return false; + + // walk through request line, storing positions + // GET /path/to/resource HTTP/1.1 + // ^ ^^ ^^ + const std::size_t foundMethod = + line.find_first_not_of(Constants::SPACE_CHAR); // skip any leading whitespace + if (foundMethod == std::string::npos) return false; + const std::size_t foundFirstSpace = line.find(Constants::SPACE_CHAR, foundMethod + 1); + if (foundFirstSpace == std::string::npos) return false; + const std::size_t foundResource = + line.find_first_not_of(Constants::SPACE_CHAR, foundFirstSpace + 1); + if (foundResource == std::string::npos) return false; + const std::size_t foundSecondSpace = line.find(Constants::SPACE_CHAR, foundResource + 1); + if (foundSecondSpace == std::string::npos) return false; + const std::size_t foundVersion = + line.find_first_not_of(Constants::SPACE_CHAR, foundSecondSpace + 1); + if (foundVersion == std::string::npos) return false; + + // parse out method & resource + m_method = line.substr(foundMethod, foundFirstSpace - foundMethod); + m_resource = line.substr(foundResource, foundSecondSpace - foundResource); + + // parse out version numbers + const std::string temp = line.substr(foundVersion); + if ((temp.find(Constants::HTTP_STRING) != 0) || (temp.size() != 8)) return false; + 
const int major = static_cast<int>(temp.at(5) - '0'); + const int minor = static_cast<int>(temp.at(7) - '0'); + SetVersion(major, minor); + + // if we get here, return success + return true; +} + +std::string HttpRequestHeader::ToString() const +{ + std::stringstream request; + request << m_method << Constants::SPACE_CHAR << m_resource << Constants::SPACE_CHAR + << Constants::HTTP_STRING << GetMajorVersion() << Constants::DOT_CHAR + << GetMinorVersion() << Constants::FIELD_NEWLINE << HttpHeader::ToString() + << Constants::FIELD_NEWLINE; + return request.str(); +} + +// ----------------------------------- +// HttpResponseHeader implementation +// ----------------------------------- + +HttpResponseHeader::HttpResponseHeader(const int statusCode, const std::string& reason, + int majorVersion, int minorVersion) + + : HttpHeader() + , m_statusCode(statusCode) + , m_reason(reason) +{ + SetVersion(majorVersion, minorVersion); +} + +HttpResponseHeader::HttpResponseHeader(const std::string& s) + : HttpHeader() + , m_statusCode(0) +{ + Parse(s); +} + +HttpResponseHeader::~HttpResponseHeader() {} + +std::string HttpResponseHeader::GetReason() const +{ + return m_reason; +} + +int HttpResponseHeader::GetStatusCode() const +{ + return m_statusCode; +} + +bool HttpResponseHeader::ParseLine(const std::string& line, int lineNumber) +{ + + // if not 'status line', just let base class + if (lineNumber != 0) return HttpHeader::ParseLine(line, lineNumber); + + // fail if empty line + if (line.empty()) return false; + + // walk through status line, storing positions + // HTTP/1.1 200 OK + // ^ ^^ ^^ + + const std::size_t foundVersion = + line.find_first_not_of(Constants::SPACE_CHAR); // skip any leading whitespace + if (foundVersion == std::string::npos) return false; + const std::size_t foundFirstSpace = line.find(Constants::SPACE_CHAR, foundVersion + 1); + if (foundFirstSpace == std::string::npos) return false; + const std::size_t foundStatusCode = + 
line.find_first_not_of(Constants::SPACE_CHAR, foundFirstSpace + 1); + if (foundStatusCode == std::string::npos) return false; + const std::size_t foundSecondSpace = line.find(Constants::SPACE_CHAR, foundStatusCode + 1); + if (foundSecondSpace == std::string::npos) return false; + const std::size_t foundReason = + line.find_first_not_of(Constants::SPACE_CHAR, foundSecondSpace + 1); + if (foundReason == std::string::npos) return false; + + // parse version numbers + std::string temp = line.substr(foundVersion, foundFirstSpace - foundVersion); + if ((temp.find(Constants::HTTP_STRING) != 0) || (temp.size() != 8)) return false; + const int major = static_cast<int>(temp.at(5) - '0'); + const int minor = static_cast<int>(temp.at(7) - '0'); + SetVersion(major, minor); + + // parse status code + temp = line.substr(foundStatusCode, foundSecondSpace - foundStatusCode); + if (temp.size() != 3) return false; + m_statusCode = std::atoi(temp.c_str()); + + // reason phrase should be everything else left + m_reason = line.substr(foundReason); + + // if we get here, return success + return true; +} + +std::string HttpResponseHeader::ToString() const +{ + std::stringstream response; + response << Constants::HTTP_STRING << GetMajorVersion() << Constants::DOT_CHAR + << GetMinorVersion() << Constants::SPACE_CHAR << m_statusCode << Constants::SPACE_CHAR + << m_reason << Constants::FIELD_NEWLINE << HttpHeader::ToString() + << Constants::FIELD_NEWLINE; + return response.str(); +} diff --git a/src/api/internal/io/HttpHeader_p.h b/src/api/internal/io/HttpHeader_p.h new file mode 100644 index 0000000..c7c4617 --- /dev/null +++ b/src/api/internal/io/HttpHeader_p.h @@ -0,0 +1,136 @@ +// *************************************************************************** +// HttpHeader_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 13 January 2012 (DB) +// 
--------------------------------------------------------------------------- +// Provides a generic interface for parsing/generating HTTP headers, along +// with specialized request & response header types +// *************************************************************************** + +#ifndef HTTP_HEADER_P_H +#define HTTP_HEADER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include <map> +#include <string> +#include "api/api_global.h" + +namespace BamTools { +namespace Internal { + +class HttpHeader +{ + + // ctors & dtor +public: + HttpHeader(); + HttpHeader(const std::string& s); + virtual ~HttpHeader(); + + // HttpHeader interface +public: + // header field=>value access + bool ContainsKey(const std::string& key) const; + std::string GetValue(const std::string& key); + void RemoveField(const std::string& key); + void SetField(const std::string& key, const std::string& value); + + // get formatted header string + virtual std::string ToString() const; + + // query HTTP version used + int GetMajorVersion() const; + int GetMinorVersion() const; + + // see if header was parsed OK + bool IsValid() const; + + // internal methods +protected: + void Parse(const std::string& s); + virtual bool ParseLine(const std::string& line, int lineNumber); + void SetValid(bool ok); + void SetVersion(int major, int minor); + + // data members +private: + std::map<std::string, std::string> m_fields; + + bool m_isValid; // should usually be true, only false if error processing a header line + int m_majorVersion; + int m_minorVersion; +}; + +class HttpRequestHeader : public HttpHeader +{ + + // ctor & dtor +public: + HttpRequestHeader(const std::string& method, // "GET", "HEAD", ... 
+ const std::string& resource, // filename + int majorVersion = 1, // version info + int minorVersion = 1); + ~HttpRequestHeader(); + + // HttpRequestHeader interface +public: + std::string GetMethod() const; + std::string GetResource() const; + + // HttpHeader implementation +public: + std::string ToString() const; + +protected: + bool ParseLine(const std::string& line, int lineNumber); + + // data members +private: + std::string m_method; + std::string m_resource; +}; + +class HttpResponseHeader : public HttpHeader +{ + + // ctor & dtor +public: + HttpResponseHeader(const int statusCode, // 200, 404, etc + const std::string& reason = std::string(), // 'reason phrase' for code + int majorVersion = 1, // version info + int minorVersion = 1); + HttpResponseHeader(const std::string& s); + ~HttpResponseHeader(); + + // HttpRequestHeader interface +public: + std::string GetReason() const; + int GetStatusCode() const; + + // HttpHeader implementation +public: + std::string ToString() const; + +protected: + bool ParseLine(const std::string& line, int lineNumber); + + // data members +private: + int m_statusCode; + std::string m_reason; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // HTTP_HEADER_P_H diff --git a/src/api/internal/io/ILocalIODevice_p.cpp b/src/api/internal/io/ILocalIODevice_p.cpp new file mode 100644 index 0000000..9e81eeb --- /dev/null +++ b/src/api/internal/io/ILocalIODevice_p.cpp @@ -0,0 +1,61 @@ +// *************************************************************************** +// ILocalIODevice_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 27 July 2012 (DB) +// --------------------------------------------------------------------------- +// Provides shared behavior for files & pipes +// *************************************************************************** + +#include 
"api/internal/io/ILocalIODevice_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cstdio> + +ILocalIODevice::ILocalIODevice() + : IBamIODevice() + , m_stream(0) +{} + +ILocalIODevice::~ILocalIODevice() +{ + Close(); +} + +void ILocalIODevice::Close() +{ + + // skip if not open + if (!IsOpen()) return; + + // flush & close FILE* + fflush(m_stream); + fclose(m_stream); + m_stream = 0; + + // reset other device state + m_mode = IBamIODevice::NotOpen; +} + +int64_t ILocalIODevice::Read(char* data, const unsigned int numBytes) +{ + BT_ASSERT_X(m_stream, "ILocalIODevice::Read: trying to read from null stream"); + BT_ASSERT_X((m_mode & IBamIODevice::ReadOnly), + "ILocalIODevice::Read: device not in read-able mode"); + return static_cast<int64_t>(fread(data, sizeof(char), numBytes, m_stream)); +} + +int64_t ILocalIODevice::Tell() const +{ + BT_ASSERT_X(m_stream, "ILocalIODevice::Tell: trying to get file position fromnull stream"); + return ftell64(m_stream); +} + +int64_t ILocalIODevice::Write(const char* data, const unsigned int numBytes) +{ + BT_ASSERT_X(m_stream, "ILocalIODevice::Write: tryint to write to null stream"); + BT_ASSERT_X((m_mode & IBamIODevice::WriteOnly), + "ILocalIODevice::Write: device not in write-able mode"); + return static_cast<int64_t>(fwrite(data, sizeof(char), numBytes, m_stream)); +} diff --git a/src/api/internal/io/ILocalIODevice_p.h b/src/api/internal/io/ILocalIODevice_p.h new file mode 100644 index 0000000..64fc634 --- /dev/null +++ b/src/api/internal/io/ILocalIODevice_p.h @@ -0,0 +1,51 @@ +// *************************************************************************** +// ILocalIODevice_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides shared behavior for files & pipes +// 
*************************************************************************** + +#ifndef ILOCALIODEVICE_P_H +#define ILOCALIODEVICE_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include "api/IBamIODevice.h" + +namespace BamTools { +namespace Internal { + +class ILocalIODevice : public IBamIODevice +{ + + // ctor & dtor +public: + ILocalIODevice(); + virtual ~ILocalIODevice(); + + // IBamIODevice implementation +public: + virtual void Close(); + virtual int64_t Read(char* data, const unsigned int numBytes); + virtual int64_t Tell() const; + virtual int64_t Write(const char* data, const unsigned int numBytes); + + // data members +protected: + FILE* m_stream; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // ILOCALIODEVICE_P_H diff --git a/src/api/internal/io/NetUnix_p.h b/src/api/internal/io/NetUnix_p.h new file mode 100644 index 0000000..bb13cef --- /dev/null +++ b/src/api/internal/io/NetUnix_p.h @@ -0,0 +1,43 @@ +// *************************************************************************** +// NetUnix_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides common networking-related includes, etc. for all UNIX-like systems +// *************************************************************************** + +#ifndef NETUNIX_P_H +#define NETUNIX_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. 
This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#ifndef _WIN32 // <-- source files only include the proper Net*_p.h, but this is a double-check + +#include <arpa/inet.h> +#include <netdb.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <unistd.h> + +#ifdef __FreeBSD__ +#include <netinet/in.h> +#endif + +#ifndef BT_SOCKLEN_T +#define BT_SOCKLEN_T socklen_t +#endif + +#endif // _WIN32 +#endif // NETUNIX_P_H diff --git a/src/api/internal/io/NetWin_p.h b/src/api/internal/io/NetWin_p.h new file mode 100644 index 0000000..909b254 --- /dev/null +++ b/src/api/internal/io/NetWin_p.h @@ -0,0 +1,62 @@ +// *************************************************************************** +// NetWin_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 8 December 2011 (DB) +// --------------------------------------------------------------------------- +// Provides common networking-related includes, etc. for Windows systems +// +// Note: requires Windows XP or later +// *************************************************************************** + +#ifndef NETWIN_P_H +#define NETWIN_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#ifdef _WIN32 // <-- source files only include the proper Net*_p.h, but this is a double-check + +#include <Ws2tcpip.h> +#include <winsock2.h> // <-- should bring 'windows.h' along with it + +#ifndef BT_SOCKLEN_T +#define BT_SOCKLEN_T int +#endif + +#ifdef _MSC_VER +#pragma comment(lib, "ws2_32.lib") +#endif + +namespace BamTools { +namespace Internal { + +// use RAII to ensure WSA is initialized +class WindowsSockInit +{ +public: + WindowsSockInit() + { + WSAData wsadata; + WSAStartup(MAKEWORD(2, 2), &wsadata); // catch error ? + } + + ~WindowsSockInit() + { + WSACleanup(); + } +}; + +} // namespace Internal +} // namespace BamTools + +#endif // _WIN32 + +#endif // NETWIN_P_H diff --git a/src/api/internal/io/RollingBuffer_p.cpp b/src/api/internal/io/RollingBuffer_p.cpp new file mode 100644 index 0000000..3cbfd1a --- /dev/null +++ b/src/api/internal/io/RollingBuffer_p.cpp @@ -0,0 +1,317 @@ +// *************************************************************************** +// RollingBuffer_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 8 December 2011 (DB) +// --------------------------------------------------------------------------- +// Provides a dynamic I/O FIFO byte queue, which removes bytes as they are +// read from the front of the buffer and grows to accept bytes being written +// to buffer end. 
+// +// implementation note: basically a 'smart' wrapper around 1..* ByteArrays +// *************************************************************************** + +#include "api/internal/io/RollingBuffer_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <algorithm> +#include <climits> +#include <cstddef> +#include <cstring> +#include <string> + +// ------------------------------ +// RollingBuffer implementation +// ------------------------------ + +RollingBuffer::RollingBuffer(std::size_t growth) + : m_bufferGrowth(growth) +{ + // buffer always contains at least 1 (maybe empty) byte array + m_data.push_back(ByteArray()); + + // set cleared state + Clear(); +} + +RollingBuffer::~RollingBuffer() {} + +std::size_t RollingBuffer::BlockSize() const +{ + + // if only one byte array in buffer <- needed? + if (m_tailBufferIndex == 0) return m_tail - m_head; + + // otherwise return remaining num bytes in first array + const ByteArray& first = m_data.front(); + return first.Size() - m_head; +} + +bool RollingBuffer::CanReadLine() const +{ + return IndexOf('\n') != std::string::npos; +} + +void RollingBuffer::Chop(std::size_t n) +{ + + // update buffer size + if (n > m_totalBufferSize) + m_totalBufferSize = 0; + else + m_totalBufferSize -= n; + + // loop until target case hit + for (;;) { + + // if only one array, decrement tail + if (m_tailBufferIndex == 0) { + m_tail -= n; + + // if all data chopped + if (m_tail <= m_head) { + m_head = 0; + m_tail = 0; + } + return; + } + + // if there's room in last byte array to 'chop', just decrement tail + if (n <= m_tail) { + m_tail -= n; + return; + } + + // otherwise we're going to overlap our internal byte arrays + // reduce our chop amount by the amount of data in the last byte array + n -= m_tail; + + // remove last byte array & set tail to it's end + m_data.pop_back(); + --m_tailBufferIndex; + m_tail = m_data.at(m_tailBufferIndex).Size(); + } + + // if buffer is now empty, reset state & clear up 
memory + if (IsEmpty()) Clear(); +} + +void RollingBuffer::Clear() +{ + + // remove all byte arrays (except first) + m_data.erase(m_data.begin() + 1, m_data.end()); + + // clear out first byte array + m_data[0].Resize(0); + m_data[0].Squeeze(); + + // reset index & size markers + m_head = 0; + m_tail = 0; + m_tailBufferIndex = 0; + m_totalBufferSize = 0; +} + +void RollingBuffer::Free(std::size_t n) +{ + + // update buffer size + if (n > m_totalBufferSize) + m_totalBufferSize = 0; + else + m_totalBufferSize -= n; + + // loop until target case hit + for (;;) { + + const std::size_t blockSize = BlockSize(); + + // if there's room in current array + if (n < blockSize) { + + // shift 'head' over @n bytes + m_head += n; + + // check for emptied, single byte array + if (m_head == m_tail && m_tailBufferIndex == 0) { + m_head = 0; + m_tail = 0; + } + + break; + } + + // otherwise we need to check next byte array + // first update amount to remove + n -= blockSize; + + // special case - there was only 1 array + if (m_data.size() == 1) { + if (m_data.at(0).Size() != m_bufferGrowth) m_data[0].Resize(m_bufferGrowth); + m_head = 0; + m_tail = 0; + m_tailBufferIndex = 0; + break; + } + + // otherwise, remove first array and move to next iteration + m_data.pop_front(); + --m_tailBufferIndex; + m_head = 0; + } + + // if buffer is now empty, reset state & clear up memory + if (IsEmpty()) Clear(); +} + +std::size_t RollingBuffer::IndexOf(char c) const +{ + + // skip processing if empty buffer + if (IsEmpty()) return std::string::npos; + + std::size_t index(0); + + // iterate over byte arrays + const std::size_t numBuffers = m_data.size(); + for (std::size_t i = 0; i < numBuffers; ++i) { + const ByteArray& current = m_data.at(i); + + // if on first array, use head; else 0 + const std::size_t start = ((i == 0) ? m_head : 0); + + // if on last array, set end; else use current byte array size + const std::size_t end = ((i == m_tailBufferIndex) ? 
m_tail : current.Size()); + + // look through this iteration's byte array for @c + const char* p = current.ConstData() + start; + for (std::size_t j = start; j < end; ++j) { + if (*p++ == c) return index; + ++index; + } + } + + // no match found + return std::string::npos; +} + +bool RollingBuffer::IsEmpty() const +{ + return (m_tailBufferIndex == 0) && (m_tail == 0); +} + +std::size_t RollingBuffer::Read(char* dest, std::size_t max) +{ + + std::size_t bytesToRead = std::min(Size(), max); + std::size_t bytesReadSoFar = 0; + + while (bytesReadSoFar < bytesToRead) { + const char* readPtr = ReadPointer(); + std::size_t blockBytes = std::min((bytesToRead - bytesReadSoFar), BlockSize()); + if (dest) memcpy(dest + bytesReadSoFar, readPtr, blockBytes); + bytesReadSoFar += blockBytes; + Free(blockBytes); + } + + return bytesReadSoFar; +} + +std::size_t RollingBuffer::ReadLine(char* dest, std::size_t max) +{ + + // if we can't read line or if max is 0 + if (!CanReadLine() || max == 0) return 0; + + // otherwise, read until we hit newline + std::size_t bytesReadSoFar = 0; + bool finished = false; + while (!finished) { + + const std::size_t index = IndexOf('\n'); + const char* readPtr = ReadPointer(); + std::size_t bytesToRead = std::min((index + 1) - bytesReadSoFar, BlockSize()); + bytesToRead = std::min(bytesToRead, (max - 1) - bytesReadSoFar); + memcpy(dest + bytesReadSoFar, readPtr, bytesToRead); + bytesReadSoFar += bytesToRead; + Free(bytesToRead); + + if (!((bytesReadSoFar < index + 1) && (bytesReadSoFar < max - 1))) finished = true; + } + + // null terminate 'dest' & return numBytesRead + dest[bytesReadSoFar] = '\0'; + return bytesReadSoFar; +} + +const char* RollingBuffer::ReadPointer() const +{ + + // return null if empty buffer + if (m_data.empty()) return 0; + + // otherwise return pointer to current position + const ByteArray& first = m_data.front(); + return first.ConstData() + m_head; +} + +char* RollingBuffer::Reserve(std::size_t n) +{ + + // if empty buffer + 
if (m_totalBufferSize == 0) { + m_data[0].Resize(std::max(m_bufferGrowth, n)); + m_totalBufferSize += n; + m_tail = n; + return m_data[m_tailBufferIndex].Data(); + } + + // increment buffer's byte count + m_totalBufferSize += n; + + // if buffer already contains enough space to fit @n more bytes + if ((m_tail + n) <= m_data.at(m_tailBufferIndex).Size()) { + + // fetch write pointer at current 'tail', increment tail by @n & return + char* ptr = m_data[m_tailBufferIndex].Data(); //+ m_tail; + m_tail += n; + return ptr; + } + + // if last byte array isn't half full + if (m_tail < m_data.at(m_tailBufferIndex).Size() / 2) { + + // we'll allow simple resize + m_data[m_tailBufferIndex].Resize(m_tail + n); + + // fetch write pointer at current 'tail', increment tail by @n & return + char* ptr = m_data[m_tailBufferIndex].Data(); //+ m_tail; + m_tail += n; + return ptr; + } + + // otherwise, shrink last byte array to current used size + m_data[m_tailBufferIndex].Resize(m_tail); + + // then append new byte array + m_data.push_back(ByteArray()); + ++m_tailBufferIndex; + m_data[m_tailBufferIndex].Resize(std::max(m_bufferGrowth, n)); + m_tail = n; + + // return write-able pointer on new array + return m_data[m_tailBufferIndex].Data(); +} + +std::size_t RollingBuffer::Size() const +{ + return m_totalBufferSize; +} + +void RollingBuffer::Write(const char* src, std::size_t n) +{ + char* writePtr = Reserve(n); + memcpy(writePtr, src, n); +} diff --git a/src/api/internal/io/RollingBuffer_p.h b/src/api/internal/io/RollingBuffer_p.h new file mode 100644 index 0000000..2e22426 --- /dev/null +++ b/src/api/internal/io/RollingBuffer_p.h @@ -0,0 +1,88 @@ +// *************************************************************************** +// RollingBuffer_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 7 December 2011 (DB) +// 
--------------------------------------------------------------------------- +// Provides a dynamic I/O FIFO byte queue, which removes bytes as they are +// read from the front of the buffer and grows to accept bytes being written +// to buffer end. +// +// implementation note: basically a 'smart' wrapper around 1..* ByteArrays +// *************************************************************************** + +#ifndef ROLLINGBUFFER_P_H +#define ROLLINGBUFFER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include <cstddef> +#include <deque> +#include <string> +#include "api/api_global.h" +#include "api/internal/io/ByteArray_p.h" + +namespace BamTools { +namespace Internal { + +class RollingBuffer +{ + + // ctors & dtor +public: + RollingBuffer(std::size_t growth); + ~RollingBuffer(); + + // RollingBuffer interface +public: + // returns current buffer size + std::size_t BlockSize() const; + // checks buffer for new line + bool CanReadLine() const; + // frees @n bytes from end of buffer + void Chop(std::size_t n); + // clears entire buffer structure + void Clear(); + // frees @n bytes from front of buffer + void Free(std::size_t n); + // checks buffer for @c + std::size_t IndexOf(char c) const; + // returns whether buffer contains data + bool IsEmpty() const; + // reads up to @maxLen bytes into @dest + // returns exactly how many bytes were read from buffer + std::size_t Read(char* dest, std::size_t max); + // reads until newline (or up to @maxLen bytes) + // returns exactly how many bytes were read from buffer + std::size_t ReadLine(char* dest, std::size_t max); + // returns a C-fxn compatible char* to byte data + const char* ReadPointer() const; + // ensures that buffer contains space for @n incoming bytes, returns write-able char* + char* 
Reserve(std::size_t n); + // returns current number of bytes stored in buffer + std::size_t Size() const; + // reserves space for @n bytes, then appends contents of @src to buffer + void Write(const char* src, std::size_t n); + + // data members +private: + std::size_t m_head; // index into current data (next char) + std::size_t m_tail; // index into last data position + std::size_t m_tailBufferIndex; // m_data::size() - 1 + std::size_t m_totalBufferSize; // total buffer size + std::size_t m_bufferGrowth; // new buffers are typically initialized with this size + std::deque<ByteArray> m_data; // basic 'buffer of buffers' +}; + +} // namespace Internal +} // namespace BamTools + +#endif // ROLLINGBUFFER_P_H diff --git a/src/api/internal/io/TcpSocketEngine_p.cpp b/src/api/internal/io/TcpSocketEngine_p.cpp new file mode 100644 index 0000000..de373c4 --- /dev/null +++ b/src/api/internal/io/TcpSocketEngine_p.cpp @@ -0,0 +1,212 @@ +// *************************************************************************** +// TcpSocketEngine_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 8 December 2011 (DB) +// --------------------------------------------------------------------------- +// Provides low-level implementation of TCP I/O +// *************************************************************************** + +// N.B. - this file contains the top-level, platform-independent logic. "Native" methods +// are called as needed from the TcpSocketEngine_<X>.cpp files. Selection of the proper +// native method file should have been handled at build-time by CMake. 
+ +#include "api/internal/io/TcpSocketEngine_p.h" +#include "api/internal/io/HostInfo_p.h" + +#include <cstddef> + +using namespace BamTools; +using namespace BamTools::Internal; + +TcpSocketEngine::TcpSocketEngine() + : m_socketDescriptor(-1) + // , m_localPort(0) + , m_remotePort(0) + , m_socketError(TcpSocket::UnknownSocketError) + , m_socketState(TcpSocket::UnconnectedState) +{} + +TcpSocketEngine::TcpSocketEngine(const TcpSocketEngine& other) + : m_socketDescriptor(other.m_socketDescriptor) + // , m_localAddress(other.m_localAddress) + , m_remoteAddress(other.m_remoteAddress) + // , m_localPort(other.m_localPort) + , m_remotePort(other.m_remotePort) + , m_socketError(other.m_socketError) + , m_socketState(other.m_socketState) + , m_errorString(other.m_errorString) +{} + +TcpSocketEngine::~TcpSocketEngine() +{ + Close(); +} + +void TcpSocketEngine::Close() +{ + + // close socket if we have valid FD + if (m_socketDescriptor != -1) { + nativeClose(); + m_socketDescriptor = -1; + } + + // reset state + m_socketState = TcpSocket::UnconnectedState; + // m_localAddress.Clear(); + m_remoteAddress.Clear(); + // m_localPort = 0; + m_remotePort = 0; +} + +bool TcpSocketEngine::Connect(const HostAddress& address, const uint16_t port) +{ + + // return failure if invalid FD or already connected + if (!IsValid() || (m_socketState == TcpSocket::ConnectedState)) { + // TODO: set error string + return false; + } + + // attempt to connect to host address on requested port + if (!nativeConnect(address, port)) { + // TODO: set error string + return false; + } + + // if successful, store remote host address port & return success + // TODO: (later) fetch proxied remote & local host/port here + m_remoteAddress = address; + m_remotePort = port; + return true; +} + +std::string TcpSocketEngine::GetErrorString() const +{ + return m_errorString; +} + +//HostAddress TcpSocketEngine::GetLocalAddress() const { +// return m_localAddress; +//} + +//uint16_t TcpSocketEngine::GetLocalPort() 
const { +// return m_localPort; +//} + +HostAddress TcpSocketEngine::GetRemoteAddress() const +{ + return m_remoteAddress; +} + +uint16_t TcpSocketEngine::GetRemotePort() const +{ + return m_remotePort; +} + +int TcpSocketEngine::GetSocketDescriptor() const +{ + return m_socketDescriptor; +} + +TcpSocket::SocketError TcpSocketEngine::GetSocketError() +{ + return m_socketError; +} + +TcpSocket::SocketState TcpSocketEngine::GetSocketState() +{ + return m_socketState; +} + +bool TcpSocketEngine::Initialize(HostAddress::NetworkProtocol protocol) +{ + + // close current socket if we have one open + if (IsValid()) Close(); + + // attempt to create new socket + return nativeCreateSocket(protocol); +} + +bool TcpSocketEngine::IsValid() const +{ + return (m_socketDescriptor != -1); +} + +int64_t TcpSocketEngine::NumBytesAvailable() const +{ + + // return 0 if socket FD is invalid + if (!IsValid()) { + // TODO: set error string + return -1; + } + + // otherwise check socket to see how much is ready + return nativeNumBytesAvailable(); +} + +int64_t TcpSocketEngine::Read(char* dest, std::size_t max) +{ + + // return failure if can't read + if (!IsValid() || (m_socketState != TcpSocket::ConnectedState)) return -1; + + // otherwise return number of bytes read + return nativeRead(dest, max); +} + +bool TcpSocketEngine::WaitForRead(int msec, bool* timedOut) +{ + + // reset timedOut flag + *timedOut = false; + + // need to wait for our socket to be ready to read + const int ret = nativeSelect(msec, true); + + // if timed out + if (ret == 0) { + *timedOut = true; + m_socketError = TcpSocket::SocketTimeoutError; + m_errorString = "socket timed out"; + } + + // return if any sockets available for reading + return (ret > 0); +} + +bool TcpSocketEngine::WaitForWrite(int msec, bool* timedOut) +{ + + // reset timedOut flag + *timedOut = false; + + // need to wait for our socket to be ready to write + const int ret = nativeSelect(msec, false); + + // if timed out + if (ret == 0) { + 
*timedOut = true; + m_socketError = TcpSocket::SocketTimeoutError; + m_errorString = "socket timed out"; + } + + // return if any sockets available for reading + return (ret > 0); +} + +int64_t TcpSocketEngine::Write(const char* data, std::size_t length) +{ + + // return failure if can't write + if (!IsValid() || (m_socketState != TcpSocket::ConnectedState)) { + // TODO: set error string + return -1; + } + + // otherwise return number of bytes written + return nativeWrite(data, length); +} diff --git a/src/api/internal/io/TcpSocketEngine_p.h b/src/api/internal/io/TcpSocketEngine_p.h new file mode 100644 index 0000000..b3a6495 --- /dev/null +++ b/src/api/internal/io/TcpSocketEngine_p.h @@ -0,0 +1,105 @@ +// *************************************************************************** +// TcpSocketEngine_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 8 December 2011 (DB) +// --------------------------------------------------------------------------- +// Provides low-level implementation of TCP I/O +// *************************************************************************** + +#ifndef TCPSOCKETENGINE_P_H +#define TCPSOCKETENGINE_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include "api/internal/io/HostAddress_p.h" +#include "api/internal/io/TcpSocket_p.h" + +#ifdef _WIN32 +#include "api/internal/io/NetWin_p.h" +#endif + +#include <cstddef> + +namespace BamTools { +namespace Internal { + +class TcpSocketEngine +{ + + // ctors & dtor +public: + TcpSocketEngine(); + TcpSocketEngine(const TcpSocketEngine& other); + ~TcpSocketEngine(); + + // TcpSocketEngine interface +public: + // connection-related methods + void Close(); + bool Connect(const HostAddress& address, const uint16_t port); + bool Initialize(HostAddress::NetworkProtocol protocol); + bool IsValid() const; + + // IO-related methods + int64_t NumBytesAvailable() const; + int64_t Read(char* dest, std::size_t max); + int64_t Write(const char* data, std::size_t length); + + bool WaitForRead(int msec, bool* timedOut); + bool WaitForWrite(int msec, bool* timedOut); + + // query connection state + // HostAddress GetLocalAddress() const; + // uint16_t GetLocalPort() const; + HostAddress GetRemoteAddress() const; + uint16_t GetRemotePort() const; + + int GetSocketDescriptor() const; + TcpSocket::SocketError GetSocketError(); + TcpSocket::SocketState GetSocketState(); + + std::string GetErrorString() const; + + // platform-dependent internal methods + // provided in the corresponding TcpSocketEngine_<OS>_p.cpp +private: + void nativeClose(); + bool nativeConnect(const HostAddress& address, const uint16_t port); + bool nativeCreateSocket(HostAddress::NetworkProtocol protocol); + void nativeDisconnect(); + int64_t nativeNumBytesAvailable() const; + int64_t nativeRead(char* dest, std::size_t max); + int nativeSelect(int msecs, bool isRead) const; + int64_t nativeWrite(const char* data, std::size_t length); + + // data members +private: + int m_socketDescriptor; + + // HostAddress m_localAddress; + HostAddress m_remoteAddress; + // uint16_t m_localPort; + uint16_t m_remotePort; + + TcpSocket::SocketError m_socketError; + TcpSocket::SocketState m_socketState; + std::string m_errorString; 
+ +#ifdef _WIN32 + WindowsSockInit m_win; +#endif +}; + +} // namespace Internal +} // namespace BamTools + +#endif // TCPSOCKETENGINE_P_H diff --git a/src/api/internal/io/TcpSocketEngine_unix_p.cpp b/src/api/internal/io/TcpSocketEngine_unix_p.cpp new file mode 100644 index 0000000..35cd307 --- /dev/null +++ b/src/api/internal/io/TcpSocketEngine_unix_p.cpp @@ -0,0 +1,220 @@ +// *************************************************************************** +// TcpSocketEngine_unix_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 15 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides low-level implementation of TCP I/O for all UNIX-like systems +// *************************************************************************** + +#include "api/internal/io/NetUnix_p.h" +#include "api/internal/io/TcpSocketEngine_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#ifdef SUN_OS +#include <sys/filio.h> +#endif + +#include <cerrno> +#include <cstddef> +#include <ctime> +#include <iostream> + +// ------------------------ +// static utility methods +// ------------------------ + +namespace BamTools { +namespace Internal {} // namespace Internal +} // namespace BamTools + +// -------------------------------- +// TcpSocketEngine implementation +// -------------------------------- + +void TcpSocketEngine::nativeClose() +{ + close(m_socketDescriptor); +} + +bool TcpSocketEngine::nativeConnect(const HostAddress& address, const uint16_t port) +{ + + // setup connection parameters from address/port + sockaddr_in sockAddrIPv4; + sockaddr_in6 sockAddrIPv6; + sockaddr* sockAddrPtr = 0; + BT_SOCKLEN_T sockAddrSize = 0; + + // IPv6 + if (address.GetProtocol() == HostAddress::IPv6Protocol) { + + memset(&sockAddrIPv6, 0, sizeof(sockAddrIPv6)); + sockAddrIPv6.sin6_family = AF_INET6; + 
sockAddrIPv6.sin6_port = htons(port); + + IPv6Address ip6 = address.GetIPv6Address(); + memcpy(&sockAddrIPv6.sin6_addr.s6_addr, &ip6, sizeof(ip6)); + + sockAddrSize = sizeof(sockAddrIPv6); + sockAddrPtr = (sockaddr*)&sockAddrIPv6; + } + + // IPv4 + else if (address.GetProtocol() == HostAddress::IPv4Protocol) { + + memset(&sockAddrIPv4, 0, sizeof(sockAddrIPv4)); + sockAddrIPv4.sin_family = AF_INET; + sockAddrIPv4.sin_port = htons(port); + sockAddrIPv4.sin_addr.s_addr = htonl(address.GetIPv4Address()); + + sockAddrSize = sizeof(sockAddrIPv4); + sockAddrPtr = (sockaddr*)&sockAddrIPv4; + } + + // unknown (should be unreachable) + else + BT_ASSERT_X(false, "TcpSocketEngine::nativeConnect() : unknown network protocol"); + + // attempt connection + int connectResult = connect(m_socketDescriptor, sockAddrPtr, sockAddrSize); + + // if failed, handle error + if (connectResult == -1) { + + // ensure state is set before checking errno + m_socketState = TcpSocket::UnconnectedState; + + // set error type/message depending on errno + switch ( + errno) { // <-- potential thread issues later? 
but can't get error type from connectResult + + case EISCONN: + m_socketState = TcpSocket::ConnectedState; // socket was already connected + break; + case ECONNREFUSED: + case EINVAL: + m_socketError = TcpSocket::ConnectionRefusedError; + m_errorString = "connection refused"; + break; + case ETIMEDOUT: + m_socketError = TcpSocket::NetworkError; + m_errorString = "connection timed out"; + break; + case EHOSTUNREACH: + m_socketError = TcpSocket::NetworkError; + m_errorString = "host unreachable"; + break; + case ENETUNREACH: + m_socketError = TcpSocket::NetworkError; + m_errorString = "network unreachable"; + break; + case EADDRINUSE: + m_socketError = TcpSocket::SocketResourceError; + m_errorString = "address already in use"; + break; + case EACCES: + case EPERM: + m_socketError = TcpSocket::SocketAccessError; + m_errorString = "permission denied"; + break; + default: + break; + } + + // double check that we're not in 'connected' state; if so, return failure + if (m_socketState != TcpSocket::ConnectedState) return false; + } + + // otherwise, we should be good + // update state & return success + m_socketState = TcpSocket::ConnectedState; + return true; +} + +bool TcpSocketEngine::nativeCreateSocket(HostAddress::NetworkProtocol protocol) +{ + + // get protocol value for requested protocol type + const int protocolNum = ((protocol == HostAddress::IPv6Protocol) ? 
AF_INET6 : AF_INET); + + // attempt to create socket + int socketFd = socket(protocolNum, SOCK_STREAM, IPPROTO_TCP); + + // if we fetched an invalid socket descriptor + if (socketFd <= 0) { + + // see what error we got + switch (errno) { + case EPROTONOSUPPORT: + case EAFNOSUPPORT: + case EINVAL: + m_socketError = TcpSocket::UnsupportedSocketOperationError; + m_errorString = "protocol not supported"; + break; + case ENFILE: + case EMFILE: + case ENOBUFS: + case ENOMEM: + m_socketError = TcpSocket::SocketResourceError; + m_errorString = "out of resources"; + break; + case EACCES: + m_socketError = TcpSocket::SocketAccessError; + m_errorString = "permission denied"; + break; + default: + break; + } + + // return failure + return false; + } + + // otherwise, store our socket FD & return success + m_socketDescriptor = socketFd; + return true; +} + +int64_t TcpSocketEngine::nativeNumBytesAvailable() const +{ + + // fetch number of bytes, return 0 on error + int numBytes(0); + if (ioctl(m_socketDescriptor, FIONREAD, (char*)&numBytes) < 0) return -1; + return static_cast<int64_t>(numBytes); +} + +int64_t TcpSocketEngine::nativeRead(char* dest, std::size_t max) +{ + const std::size_t ret = read(m_socketDescriptor, dest, max); + return static_cast<int64_t>(ret); +} + +// negative value for msecs will block (forever) until ready +int TcpSocketEngine::nativeSelect(int msecs, bool isRead) const +{ + + // set up FD set + fd_set fds; + FD_ZERO(&fds); + FD_SET(m_socketDescriptor, &fds); + + // setup our timeout + timeval tv; + tv.tv_sec = msecs / 1000; + tv.tv_usec = (msecs % 1000) * 1000; + + // do 'select' + if (isRead) + return select(m_socketDescriptor + 1, &fds, 0, 0, (msecs < 0 ? 0 : &tv)); + else + return select(m_socketDescriptor + 1, 0, &fds, 0, (msecs < 0 ? 
0 : &tv)); +} + +int64_t TcpSocketEngine::nativeWrite(const char* data, std::size_t length) +{ + const std::size_t writtenBytes = write(m_socketDescriptor, data, length); + return static_cast<int64_t>(writtenBytes); +} diff --git a/src/api/internal/io/TcpSocketEngine_win_p.cpp b/src/api/internal/io/TcpSocketEngine_win_p.cpp new file mode 100644 index 0000000..6cc257a --- /dev/null +++ b/src/api/internal/io/TcpSocketEngine_win_p.cpp @@ -0,0 +1,242 @@ +// *************************************************************************** +// TcpSocketEngine_win_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 8 December 2011 (DB) +// --------------------------------------------------------------------------- +// Provides low-level implementation of TCP I/O for all Windows systems +// *************************************************************************** + +#include "api/internal/io/NetWin_p.h" +#include "api/internal/io/TcpSocketEngine_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cstddef> +#include <cstring> +#include <iostream> +#include <sstream> + +// -------------------------------- +// TcpSocketEngine implementation +// -------------------------------- + +void TcpSocketEngine::nativeClose() +{ + closesocket(m_socketDescriptor); +} + +bool TcpSocketEngine::nativeConnect(const HostAddress& address, const uint16_t port) +{ + + // setup connection parameters from address/port + sockaddr_in sockAddrIPv4; + sockaddr_in6 sockAddrIPv6; + sockaddr* sockAddrPtr = 0; + BT_SOCKLEN_T sockAddrSize = 0; + + // IPv6 + if (address.GetProtocol() == HostAddress::IPv6Protocol) { + + memset(&sockAddrIPv6, 0, sizeof(sockAddrIPv6)); + sockAddrIPv6.sin6_family = AF_INET6; + sockAddrIPv6.sin6_port = htons(port); + + IPv6Address ip6 = address.GetIPv6Address(); + memcpy(&sockAddrIPv6.sin6_addr.s6_addr, &ip6, sizeof(ip6)); + + 
sockAddrSize = sizeof(sockAddrIPv6); + sockAddrPtr = (sockaddr*)&sockAddrIPv6; + } + + // IPv4 + else if (address.GetProtocol() == HostAddress::IPv4Protocol) { + + memset(&sockAddrIPv4, 0, sizeof(sockAddrIPv4)); + sockAddrIPv4.sin_family = AF_INET; + sockAddrIPv4.sin_port = htons(port); + sockAddrIPv4.sin_addr.s_addr = htonl(address.GetIPv4Address()); + + sockAddrSize = sizeof(sockAddrIPv4); + sockAddrPtr = (sockaddr*)&sockAddrIPv4; + } + + // unknown (should be unreachable) + else + BT_ASSERT_X(false, "TcpSocketEngine::nativeConnect() : unknown network protocol"); + + // attempt conenction + const int connectResult = WSAConnect(m_socketDescriptor, sockAddrPtr, sockAddrSize, 0, 0, 0, 0); + + // if failed, handle error + if (connectResult == SOCKET_ERROR) { + + // ensure state is set before checking error code + m_socketState = TcpSocket::UnconnectedState; + + // set error type/message depending on errorCode + const int errorCode = WSAGetLastError(); + switch (errorCode) { + case WSANOTINITIALISED: + m_socketError = TcpSocket::UnknownSocketError; + m_errorString = "Windows socket functionality not properly initialized"; + break; + case WSAEISCONN: + m_socketState = TcpSocket::ConnectedState; // socket already connected + break; + case WSAECONNREFUSED: + case WSAEINVAL: + m_socketError = TcpSocket::ConnectionRefusedError; + m_errorString = "connection refused"; + break; + case WSAETIMEDOUT: + m_socketError = TcpSocket::NetworkError; + m_errorString = "connection timed out"; + break; + case WSAEHOSTUNREACH: + m_socketError = TcpSocket::NetworkError; + m_errorString = "host unreachable"; + break; + case WSAENETUNREACH: + m_socketError = TcpSocket::NetworkError; + m_errorString = "network unreachable"; + break; + case WSAEADDRINUSE: + m_socketError = TcpSocket::SocketResourceError; + m_errorString = "address already in use"; + break; + case WSAEACCES: + m_socketError = TcpSocket::SocketAccessError; + m_errorString = "permission denied"; + break; + default: + break; + } 
+ + // double check that we're not in 'connected' state; if so, return failure + if (m_socketState != TcpSocket::ConnectedState) return false; + } + + // otherwise, we should be good + // update state & return success + m_socketState = TcpSocket::ConnectedState; + return true; +} + +bool TcpSocketEngine::nativeCreateSocket(HostAddress::NetworkProtocol protocol) +{ + + // get protocol value for requested protocol type + const int protocolNum = ((protocol == HostAddress::IPv6Protocol) ? AF_INET6 : AF_INET); + + // attempt to create socket + SOCKET socketFd = WSASocket(protocolNum, SOCK_STREAM, IPPROTO_TCP, 0, 0, WSA_FLAG_OVERLAPPED); + + // if we fetched an invalid socket descriptor + if (socketFd == INVALID_SOCKET) { + + // set error type/message depending on error code + const int errorCode = WSAGetLastError(); + switch (errorCode) { + case WSANOTINITIALISED: + m_socketError = TcpSocket::UnknownSocketError; + m_errorString = "Windows socket functionality not properly initialized"; + break; + case WSAEAFNOSUPPORT: + case WSAESOCKTNOSUPPORT: + case WSAEPROTOTYPE: + case WSAEINVAL: + m_socketError = TcpSocket::UnsupportedSocketOperationError; + m_errorString = "protocol not supported"; + break; + case WSAEMFILE: + case WSAENOBUFS: + m_socketError = TcpSocket::SocketResourceError; + m_errorString = "out of resources"; + break; + default: + m_socketError = TcpSocket::UnknownSocketError; + std::stringstream errStream; + errStream << "WSA ErrorCode: " << errorCode; + m_errorString = errStream.str(); + break; + } + + // return failure + return false; + } + + // otherwise, store our socket FD & return success + m_socketDescriptor = static_cast<int>(socketFd); + return true; +} + +int64_t TcpSocketEngine::nativeNumBytesAvailable() const +{ + + int64_t numBytes(0); + int64_t dummy(0); + DWORD bytesWritten(0); + + const int ioctlResult = WSAIoctl(m_socketDescriptor, FIONREAD, &dummy, sizeof(dummy), &numBytes, + sizeof(numBytes), &bytesWritten, 0, 0); + return (ioctlResult == 
SOCKET_ERROR ? -1 : numBytes); +} + +int64_t TcpSocketEngine::nativeRead(char* dest, std::size_t max) +{ + + // skip if invalid socket + if (!IsValid()) return -1; + + // set up our WSA output buffer + WSABUF buf; + buf.buf = dest; + buf.len = max; + + // attempt to read bytes + DWORD flags = 0; + DWORD bytesRead = 0; + const int readResult = WSARecv(m_socketDescriptor, &buf, 1, &bytesRead, &flags, 0, 0); + if (readResult == SOCKET_ERROR) return -1; + + // return number of bytes read + return static_cast<int64_t>(bytesRead); +} + +// negative value for msecs will block (forever) until +int TcpSocketEngine::nativeSelect(int msecs, bool isRead) const +{ + + fd_set fds; + FD_ZERO(&fds); + FD_SET(m_socketDescriptor, &fds); + + timeval tv; + tv.tv_sec = msecs / 1000; + tv.tv_usec = (msecs % 1000) * 1000; + + // do 'select' + if (isRead) + return select(0, &fds, 0, 0, (msecs < 0 ? 0 : &tv)); + else + return select(0, 0, &fds, 0, (msecs < 0 ? 0 : &tv)); +} + +int64_t TcpSocketEngine::nativeWrite(const char* data, std::size_t length) +{ + + // setup our WSA write buffer + WSABUF buf; + buf.buf = (char*)data; + buf.len = length; + + // attempt to write bytes + DWORD flags = 0; + DWORD bytesWritten = 0; + const int writeResult = WSASend(m_socketDescriptor, &buf, 1, &bytesWritten, flags, 0, 0); + if (writeResult == SOCKET_ERROR) return -1; + + // return number of bytes written + return static_cast<int64_t>(bytesWritten); +} diff --git a/src/api/internal/io/TcpSocket_p.cpp b/src/api/internal/io/TcpSocket_p.cpp new file mode 100644 index 0000000..fee7823 --- /dev/null +++ b/src/api/internal/io/TcpSocket_p.cpp @@ -0,0 +1,446 @@ +// *************************************************************************** +// TcpSocket_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 5 January 2012 (DB) +// 
--------------------------------------------------------------------------- +// Provides basic TCP I/O interface +// *************************************************************************** + +#include "api/internal/io/TcpSocket_p.h" +#include "api/internal/io/ByteArray_p.h" +#include "api/internal/io/TcpSocketEngine_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <algorithm> +#include <climits> +#include <cstddef> +#include <sstream> +#include <vector> + +// Windows is brain-damaged and pollutes the entire global namespace with +// its min() macro, which in turn causes MSVC to go haywire on std::min. +#undef min + +// ------------------------------------ +// static utility methods & constants +// ------------------------------------ + +namespace BamTools { +namespace Internal { + +// constants +static const std::size_t DEFAULT_BUFFER_SIZE = 0x10000; +static const int64_t DEFAULT_BUFFER_SIZE64 = DEFAULT_BUFFER_SIZE; + +} // namespace Internal +} // namespace BamTools + +// -------------------------- +// TcpSocket implementation +// -------------------------- + +TcpSocket::TcpSocket() + : m_mode(IBamIODevice::NotOpen) + // , m_localPort(0) + , m_remotePort(0) + , m_engine(0) + , m_cachedSocketDescriptor(-1) + , m_readBuffer(DEFAULT_BUFFER_SIZE) + , m_error(TcpSocket::NoError) + , m_state(TcpSocket::UnconnectedState) +{} + +TcpSocket::~TcpSocket() +{ + if (m_state == TcpSocket::ConnectedState) DisconnectFromHost(); +} + +std::size_t TcpSocket::BufferBytesAvailable() const +{ + return m_readBuffer.Size(); +} + +bool TcpSocket::CanReadLine() const +{ + return m_readBuffer.CanReadLine(); +} + +void TcpSocket::ClearBuffer() +{ + m_readBuffer.Clear(); +} + +bool TcpSocket::ConnectImpl(const HostInfo& hostInfo, const std::string& port, + IBamIODevice::OpenMode mode) +{ + // skip if we're already connected + if (m_state == TcpSocket::ConnectedState) { + m_error = TcpSocket::SocketResourceError; + m_errorString = "socket already 
connected"; + return false; + } + + // reset socket state + m_hostName = hostInfo.HostName(); + m_mode = mode; + m_state = TcpSocket::UnconnectedState; + m_error = TcpSocket::NoError; + // m_localPort = 0; + m_remotePort = 0; + // m_localAddress.Clear(); + m_remoteAddress.Clear(); + m_readBuffer.Clear(); + + // fetch candidate addresses for requested host + std::vector<HostAddress> addresses = hostInfo.Addresses(); + if (addresses.empty()) { + m_error = TcpSocket::HostNotFoundError; + m_errorString = "no IP addresses found for host"; + return false; + } + + // convert port string to integer + std::stringstream ss(port); + uint16_t portNumber(0); + ss >> portNumber; + + // iterate through adddresses + std::vector<HostAddress>::const_iterator addrIter = addresses.begin(); + std::vector<HostAddress>::const_iterator addrEnd = addresses.end(); + for (; addrIter != addrEnd; ++addrIter) { + const HostAddress& addr = (*addrIter); + + // try to initialize socket engine with this address + if (!InitializeSocketEngine(addr.GetProtocol())) { + // failure to initialize is OK here + // we'll just try the next available address + continue; + } + + // attempt actual connection + if (m_engine->Connect(addr, portNumber)) { + + // if connection successful, update our state & return true + m_mode = mode; + // m_localAddress = m_engine->GetLocalAddress(); + // m_localPort = m_engine->GetLocalPort(); + m_remoteAddress = m_engine->GetRemoteAddress(); + m_remotePort = m_engine->GetRemotePort(); + m_cachedSocketDescriptor = m_engine->GetSocketDescriptor(); + m_state = TcpSocket::ConnectedState; + return true; + } + } + + // if we get here, no connection could be made + m_error = TcpSocket::HostNotFoundError; + m_errorString = "could not connect to any host addresses"; + return false; +} + +bool TcpSocket::ConnectToHost(const std::string& hostName, uint16_t port, + IBamIODevice::OpenMode mode) +{ + std::stringstream ss; + ss << port; + return ConnectToHost(hostName, ss.str(), mode); +} + 
+bool TcpSocket::ConnectToHost(const std::string& hostName, const std::string& port, + IBamIODevice::OpenMode mode) +{ + // create new address object with requested host name + HostAddress hostAddress; + hostAddress.SetAddress(hostName); + + HostInfo info; + // if host name was IP address ("x.x.x.x" or IPv6 format) + // otherwise host name was 'plain-text' ("www.foo.bar") + // we need to look up IP address(es) + if (hostAddress.HasIPAddress()) + info.SetAddresses(std::vector<HostAddress>(1, hostAddress)); + else + info = HostInfo::Lookup(hostName, port); + + // attempt connection on requested port + return ConnectImpl(info, port, mode); +} + +void TcpSocket::DisconnectFromHost() +{ + + // close socket engine & delete + if (m_state == TcpSocket::ConnectedState) ResetSocketEngine(); + + // reset connection state + // m_localPort = 0; + m_remotePort = 0; + // m_localAddress.Clear(); + m_remoteAddress.Clear(); + m_hostName.clear(); + m_cachedSocketDescriptor = -1; + + // for future, make sure there's outgoing data that needs to be flushed + m_readBuffer.Clear(); +} + +TcpSocket::SocketError TcpSocket::GetError() const +{ + return m_error; +} + +std::string TcpSocket::GetErrorString() const +{ + return m_errorString; +} + +std::string TcpSocket::GetHostName() const +{ + return m_hostName; +} + +//HostAddress TcpSocket::GetLocalAddress() const { +// return m_localAddress; +//} + +//uint16_t TcpSocket::GetLocalPort() const { +// return m_localPort; +//} + +HostAddress TcpSocket::GetRemoteAddress() const +{ + return m_remoteAddress; +} + +uint16_t TcpSocket::GetRemotePort() const +{ + return m_remotePort; +} + +TcpSocket::SocketState TcpSocket::GetState() const +{ + return m_state; +} + +bool TcpSocket::InitializeSocketEngine(HostAddress::NetworkProtocol protocol) +{ + ResetSocketEngine(); + m_engine = new TcpSocketEngine; + return m_engine->Initialize(protocol); +} + +bool TcpSocket::IsConnected() const +{ + if (m_engine == 0) return false; + return (m_engine->IsValid() 
&& (m_state == TcpSocket::ConnectedState)); +} + +// may be read in a look until desired data amount has been read +// returns: number of bytes read, or -1 if error +int64_t TcpSocket::Read(char* data, const unsigned int numBytes) +{ + + // if we have data in buffer, just return it + if (!m_readBuffer.IsEmpty()) { + const std::size_t bytesRead = m_readBuffer.Read(data, numBytes); + return static_cast<int64_t>(bytesRead); + } + + // otherwise, we'll need to fetch data from socket + // first make sure we have a valid socket engine + if (m_engine == 0) { + // TODO: set error string/state? + return -1; + } + + // fetch data from socket, return 0 for success, -1 for failure + // since this should be called in a loop, + // we'll pull the actual bytes from the buffer on next iteration + const int64_t socketBytesRead = ReadFromSocket(); + if (socketBytesRead < 0) { + // TODO: set error string/state ? + return -1; + } + + // we should have data now in buffer, try to fetch requested amount + // if nothing in buffer, we will return 0 bytes read (signals EOF reached) + const std::size_t numBytesRead = m_readBuffer.Read(data, numBytes); + return static_cast<int64_t>(numBytesRead); +} + +int64_t TcpSocket::ReadFromSocket() +{ + + // check for any socket engine errors + if (!m_engine->IsValid()) { + m_errorString = "TcpSocket::ReadFromSocket - socket disconnected"; + ResetSocketEngine(); + return -1; + } + + // wait for ready read + bool timedOut; + const bool isReadyRead = m_engine->WaitForRead(5000, &timedOut); + + // if not ready + if (!isReadyRead) { + + // if we simply timed out + if (timedOut) { + // TODO: get add'l error info from engine ? + m_errorString = "TcpSocket::ReadFromSocket - timed out waiting for ready read"; + } + + // otherwise, there was some other error + else { + // TODO: get add'l error info from engine ? 
+ m_errorString = + "TcpSocket::ReadFromSocket - encountered error while waiting for ready read"; + } + + // return failure + return -1; + } + + // get number of bytes available from socket + const int64_t bytesToRead = m_engine->NumBytesAvailable(); + if (bytesToRead < 0) { + // TODO: get add'l error info from engine ? + m_errorString = + "TcpSocket::ReadFromSocket - encountered error while determining numBytesAvailable"; + return -1; + } + + // make space in buffer & read from socket + char* buffer = m_readBuffer.Reserve(bytesToRead); + const int64_t numBytesRead = m_engine->Read(buffer, bytesToRead); + if (numBytesRead == -1) { + // TODO: get add'l error info from engine ? + m_errorString = "TcpSocket::ReadFromSocket - encountered error while reading bytes"; + } + + // return number of bytes actually read + return numBytesRead; +} + +std::string TcpSocket::ReadLine(int64_t max) +{ + + // prep result byte buffer + ByteArray result; + std::size_t bufferMax = + ((max > static_cast<int64_t>(UINT_MAX)) ? 
UINT_MAX : static_cast<std::size_t>(max)); + result.Resize(bufferMax); + + // read data + int64_t readBytes(0); + if (result.Size() == 0) { + + if (bufferMax == 0) bufferMax = UINT_MAX; + + result.Resize(1); + + int64_t readResult; + do { + result.Resize( + static_cast<std::size_t>(std::min(bufferMax, result.Size() + DEFAULT_BUFFER_SIZE))); + readResult = ReadLine(result.Data() + readBytes, result.Size() - readBytes); + if (readResult > 0 || readBytes == 0) readBytes += readResult; + } while (readResult == DEFAULT_BUFFER_SIZE64 && + result[static_cast<std::size_t>(readBytes - 1)] != '\n'); + + } else + readBytes = ReadLine(result.Data(), result.Size()); + + // clean up byte buffer + if (readBytes <= 0) + result.Clear(); + else + result.Resize(static_cast<std::size_t>(readBytes)); + + // return byte buffer as string + return std::string(result.ConstData(), result.Size()); +} + +int64_t TcpSocket::ReadLine(char* dest, std::size_t max) +{ + + // wait for buffer to contain line contents + if (!WaitForReadLine()) { + m_errorString = "TcpSocket::ReadLine - error waiting for read line"; + return -1; + } + + // leave room for null term + if (max < 2) return -1; + --max; + + // read from buffer, handle newlines + int64_t readSoFar = m_readBuffer.ReadLine(dest, max); + if (readSoFar && dest[readSoFar - 1] == '\n') { + + // adjust for windows-style '\r\n' + if (readSoFar > 1 && dest[readSoFar - 2] == '\r') { + --readSoFar; + dest[readSoFar - 1] = '\n'; + } + } + + // null terminate & return number of bytes read + dest[readSoFar] = '\0'; + return readSoFar; +} + +void TcpSocket::ResetSocketEngine() +{ + + // shut down socket engine + if (m_engine) { + m_engine->Close(); + delete m_engine; + m_engine = 0; + } + + // reset our state & cached socket handle + m_state = TcpSocket::UnconnectedState; + m_cachedSocketDescriptor = -1; +} + +bool TcpSocket::WaitForReadLine() +{ + + // wait until we can read a line (will return immediately if already capable) + while (!CanReadLine()) { + 
if (!ReadFromSocket()) return false; + } + + // if we get here, success + return true; +} + +int64_t TcpSocket::Write(const char* data, const unsigned int numBytes) +{ + + // single-shot attempt at write (not buffered, just try to shove the data through socket) + // this method purely exists to send 'small' HTTP requests/FTP commands from client to server + + // wait for our socket to be write-able + bool timedOut; + const bool isReadyWrite = m_engine->WaitForWrite(3000, &timedOut); + + // if ready, return number of bytes written + if (isReadyWrite) return m_engine->Write(data, numBytes); + + // otherwise, socket not ready for writing + // set error string depending on reason & return failure + if (!timedOut) { + // TODO: get add'l error info from engine ?? + m_errorString = "TcpSocket::Write - timed out waiting for ready-write"; + } else { + // TODO: get add'l error info from engine ?? + m_errorString = "TcpSocket::Write - error encountered while waiting for ready-write"; + } + return -1; +} diff --git a/src/api/internal/io/TcpSocket_p.h b/src/api/internal/io/TcpSocket_p.h new file mode 100644 index 0000000..3ba33c6 --- /dev/null +++ b/src/api/internal/io/TcpSocket_p.h @@ -0,0 +1,132 @@ +// *************************************************************************** +// TcpSocket_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 7 December 2011 (DB) +// --------------------------------------------------------------------------- +// Provides basic TCP I/O interface +// *************************************************************************** + +#ifndef TCPSOCKET_P_H +#define TCPSOCKET_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. 
//
// We mean it.

#include <cstddef>
#include <string>
#include "api/IBamIODevice.h"
#include "api/internal/io/HostInfo_p.h"
#include "api/internal/io/RollingBuffer_p.h"
// NOTE(review): uint16_t / int64_t are used below without a direct <cstdint>/<stdint.h>
// include; presumably they arrive via api/IBamIODevice.h - confirm.

namespace BamTools {
namespace Internal {

class BamHttp;
class TcpSocketEngine;

// Client-side TCP socket with a buffered read side.
// Connection work is delegated to a TcpSocketEngine; incoming bytes are staged
// in a RollingBuffer so callers can read raw bytes or whole lines.
//
// NOTE(review): the class holds a raw TcpSocketEngine* and declares a destructor
// but no copy constructor / assignment operator - copying an instance looks
// unsafe; confirm no caller copies it.
class TcpSocket
{

    // enums
public:
    // error codes reported by GetError()
    enum SocketError
    {
        NoError = -2,
        UnknownSocketError = -1,
        ConnectionRefusedError = 0,
        RemoteHostClosedError,
        HostNotFoundError,
        SocketAccessError,
        SocketResourceError,
        SocketTimeoutError,
        NetworkError,
        UnsupportedSocketOperationError
    };

    // connection states reported by GetState()
    enum SocketState
    {
        UnconnectedState = 0,
        ConnectedState
    };

    // ctor & dtor
public:
    TcpSocket();
    ~TcpSocket();

    // TcpSocket interface
public:
    // connection methods
    // (the numeric-port overload formats the port as text and forwards
    //  to the string-port overload)
    bool ConnectToHost(const std::string& hostName,
                       const uint16_t port,  // Connect("host", 80)
                       IBamIODevice::OpenMode mode = IBamIODevice::ReadOnly);
    bool ConnectToHost(const std::string& hostName,
                       const std::string& port,  // Connect("host", "80")
                       IBamIODevice::OpenMode mode = IBamIODevice::ReadOnly);
    void DisconnectFromHost();
    bool IsConnected() const;

    // I/O methods
    std::size_t BufferBytesAvailable() const;
    bool CanReadLine() const;
    void ClearBuffer();  // force buffer to clear (not a 'flush', just a 'discard')
    // returns number of bytes read, or -1 on error
    int64_t Read(char* data, const unsigned int numBytes);
    // max == 0 (the default) lets the line buffer grow as needed
    std::string ReadLine(int64_t max = 0);
    // 'max' is the capacity of 'dest', including room for the null terminator
    int64_t ReadLine(char* dest, std::size_t max);
    bool WaitForReadLine();
    // single unbuffered write attempt; returns -1 if the socket is not ready
    int64_t Write(const char* data, const unsigned int numBytes);

    // connection values
    std::string GetHostName() const;
    //        HostAddress GetLocalAddress() const;
    //        uint16_t    GetLocalPort() const;
    HostAddress GetRemoteAddress() const;
    uint16_t GetRemotePort() const;

    // connection status
    TcpSocket::SocketError GetError() const;
    TcpSocket::SocketState GetState() const;
    std::string GetErrorString() const;

    // internal methods
private:
    // tries each address in 'hostInfo' until one connects
    bool ConnectImpl(const HostInfo& hostInfo, const std::string& port,
                     IBamIODevice::OpenMode mode);
    // replaces any existing engine with a new one for 'protocol'
    bool InitializeSocketEngine(HostAddress::NetworkProtocol protocol);
    // moves available socket data into m_readBuffer; -1 on error
    int64_t ReadFromSocket();
    // closes & deletes the engine, resets state to UnconnectedState
    void ResetSocketEngine();

    // data members
private:
    IBamIODevice::OpenMode m_mode;

    std::string m_hostName;
    //        uint16_t    m_localPort;
    uint16_t m_remotePort;
    //        HostAddress m_localAddress;
    HostAddress m_remoteAddress;

    // allocated in InitializeSocketEngine(), deleted in ResetSocketEngine(); 0 when absent
    TcpSocketEngine* m_engine;
    // set to -1 whenever the engine is reset or the socket is disconnected
    int m_cachedSocketDescriptor;

    // holds bytes fetched from the socket but not yet consumed by Read()/ReadLine()
    RollingBuffer m_readBuffer;

    TcpSocket::SocketError m_error;
    TcpSocket::SocketState m_state;
    std::string m_errorString;

    friend class BamHttp;
};

}  // namespace Internal
}  // namespace BamTools

#endif  // TCPSOCKET_P_H
diff --git a/src/api/internal/sam/CMakeLists.txt b/src/api/internal/sam/CMakeLists.txt
new file mode 100644
index 0000000..2f303bd
--- /dev/null
+++ b/src/api/internal/sam/CMakeLists.txt
@@ -0,0 +1,17 @@
# ==========================
# BamTools CMakeLists.txt
# (c) 2011 Derek Barnett
#
# src/api/internal/sam
# ==========================

set( InternalSamDir "${InternalDir}/sam" )

set( InternalSamSources
        ${InternalSamDir}/SamFormatParser_p.cpp
        ${InternalSamDir}/SamFormatPrinter_p.cpp
        ${InternalSamDir}/SamHeaderValidator_p.cpp

        PARENT_SCOPE # <-- leave this last
)

diff --git a/src/api/internal/sam/SamFormatParser_p.cpp b/src/api/internal/sam/SamFormatParser_p.cpp
new file mode 100644
index 0000000..2370e26
--- /dev/null
+++ b/src/api/internal/sam/SamFormatParser_p.cpp
@@ -0,0 +1,263 @@
// ***************************************************************************
// SamFormatParser.cpp (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
// Last modified: 8 December 2011 (DB)
// ---------------------------------------------------------------------------
// Provides functionality for parsing SAM header text into SamHeader object
//
*************************************************************************** + +#include "api/internal/sam/SamFormatParser_p.h" +#include "api/SamConstants.h" +#include "api/SamHeader.h" +#include "api/internal/utils/BamException_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <iostream> +#include <sstream> +#include <vector> + +SamFormatParser::SamFormatParser(SamHeader& header) + : m_header(header) +{} + +SamFormatParser::~SamFormatParser() {} + +void SamFormatParser::Parse(const std::string& headerText) +{ + + // clear header's prior contents + m_header.Clear(); + + // empty header is OK, but skip processing + if (headerText.empty()) return; + + // other wise parse SAM lines + std::istringstream headerStream(headerText); + std::string headerLine; + while (std::getline(headerStream, headerLine)) + ParseSamLine(headerLine); +} + +void SamFormatParser::ParseSamLine(const std::string& line) +{ + + // skip if line is not long enough to contain true values + if (line.length() < 5) return; + + // determine token at beginning of line + const std::string firstToken = line.substr(0, 3); + const std::string restOfLine = line.substr(4); + if (firstToken == Constants::SAM_HD_BEGIN_TOKEN) + ParseHDLine(restOfLine); + else if (firstToken == Constants::SAM_SQ_BEGIN_TOKEN) + ParseSQLine(restOfLine); + else if (firstToken == Constants::SAM_RG_BEGIN_TOKEN) + ParseRGLine(restOfLine); + else if (firstToken == Constants::SAM_PG_BEGIN_TOKEN) + ParsePGLine(restOfLine); + else if (firstToken == Constants::SAM_CO_BEGIN_TOKEN) + ParseCOLine(restOfLine); +} + +void SamFormatParser::ParseHDLine(const std::string& line) +{ + + // split HD lines into tokens + std::vector<std::string> tokens = Split(line, Constants::SAM_TAB); + + // iterate over tokens + std::vector<std::string>::const_iterator tokenIter = tokens.begin(); + std::vector<std::string>::const_iterator tokenEnd = tokens.end(); + for (; tokenIter != tokenEnd; ++tokenIter) { + + // get tag/value + 
const std::string tokenTag = (*tokenIter).substr(0, 2); + const std::string tokenValue = (*tokenIter).substr(3); + + // set header contents + if (tokenTag == Constants::SAM_HD_VERSION_TAG) + m_header.Version = tokenValue; + else if (tokenTag == Constants::SAM_HD_SORTORDER_TAG) + m_header.SortOrder = tokenValue; + else if (tokenTag == Constants::SAM_HD_GROUPORDER_TAG) + m_header.GroupOrder = tokenValue; + else { // custom tag + CustomHeaderTag otherTag; + otherTag.TagName = tokenTag; + otherTag.TagValue = tokenValue; + m_header.CustomTags.push_back(otherTag); + } + } + + // check for required tags + if (!m_header.HasVersion()) + throw BamException("SamFormatParser::ParseHDLine", "@HD line is missing VN tag"); +} + +void SamFormatParser::ParseSQLine(const std::string& line) +{ + + SamSequence seq; + + // split SQ line into tokens + std::vector<std::string> tokens = Split(line, Constants::SAM_TAB); + + // iterate over tokens + std::vector<std::string>::const_iterator tokenIter = tokens.begin(); + std::vector<std::string>::const_iterator tokenEnd = tokens.end(); + for (; tokenIter != tokenEnd; ++tokenIter) { + + // get tag/value + const std::string tokenTag = (*tokenIter).substr(0, 2); + const std::string tokenValue = (*tokenIter).substr(3); + + // set sequence contents + if (tokenTag == Constants::SAM_SQ_NAME_TAG) + seq.Name = tokenValue; + else if (tokenTag == Constants::SAM_SQ_LENGTH_TAG) + seq.Length = tokenValue; + else if (tokenTag == Constants::SAM_SQ_ASSEMBLYID_TAG) + seq.AssemblyID = tokenValue; + else if (tokenTag == Constants::SAM_SQ_CHECKSUM_TAG) + seq.Checksum = tokenValue; + else if (tokenTag == Constants::SAM_SQ_SPECIES_TAG) + seq.Species = tokenValue; + else if (tokenTag == Constants::SAM_SQ_URI_TAG) + seq.URI = tokenValue; + else { // custom tag + CustomHeaderTag otherTag; + otherTag.TagName = tokenTag; + otherTag.TagValue = tokenValue; + seq.CustomTags.push_back(otherTag); + } + } + + // check for required tags + if (!seq.HasName()) + throw 
BamException("SamFormatParser::ParseSQLine", "@SQ line is missing SN tag"); + if (!seq.HasLength()) + throw BamException("SamFormatParser::ParseSQLine", "@SQ line is missing LN tag"); + + // store SAM sequence entry + m_header.Sequences.Add(seq); +} + +void SamFormatParser::ParseRGLine(const std::string& line) +{ + + SamReadGroup rg; + + // split string into tokens + std::vector<std::string> tokens = Split(line, Constants::SAM_TAB); + + // iterate over tokens + std::vector<std::string>::const_iterator tokenIter = tokens.begin(); + std::vector<std::string>::const_iterator tokenEnd = tokens.end(); + for (; tokenIter != tokenEnd; ++tokenIter) { + + // get token tag/value + const std::string tokenTag = (*tokenIter).substr(0, 2); + const std::string tokenValue = (*tokenIter).substr(3); + + // set read group contents + if (tokenTag == Constants::SAM_RG_ID_TAG) + rg.ID = tokenValue; + else if (tokenTag == Constants::SAM_RG_DESCRIPTION_TAG) + rg.Description = tokenValue; + else if (tokenTag == Constants::SAM_RG_FLOWORDER_TAG) + rg.FlowOrder = tokenValue; + else if (tokenTag == Constants::SAM_RG_KEYSEQUENCE_TAG) + rg.KeySequence = tokenValue; + else if (tokenTag == Constants::SAM_RG_LIBRARY_TAG) + rg.Library = tokenValue; + else if (tokenTag == Constants::SAM_RG_PLATFORMUNIT_TAG) + rg.PlatformUnit = tokenValue; + else if (tokenTag == Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG) + rg.PredictedInsertSize = tokenValue; + else if (tokenTag == Constants::SAM_RG_PRODUCTIONDATE_TAG) + rg.ProductionDate = tokenValue; + else if (tokenTag == Constants::SAM_RG_PROGRAM_TAG) + rg.Program = tokenValue; + else if (tokenTag == Constants::SAM_RG_SAMPLE_TAG) + rg.Sample = tokenValue; + else if (tokenTag == Constants::SAM_RG_SEQCENTER_TAG) + rg.SequencingCenter = tokenValue; + else if (tokenTag == Constants::SAM_RG_SEQTECHNOLOGY_TAG) + rg.SequencingTechnology = tokenValue; + else { // custom tag + CustomHeaderTag otherTag; + otherTag.TagName = tokenTag; + otherTag.TagValue = tokenValue; + 
rg.CustomTags.push_back(otherTag); + } + } + + // check for required tags + if (!rg.HasID()) + throw BamException("SamFormatParser::ParseRGLine", "@RG line is missing ID tag"); + + // store SAM read group entry + m_header.ReadGroups.Add(rg); +} + +void SamFormatParser::ParsePGLine(const std::string& line) +{ + + SamProgram pg; + + // split string into tokens + std::vector<std::string> tokens = Split(line, Constants::SAM_TAB); + + // iterate over tokens + std::vector<std::string>::const_iterator tokenIter = tokens.begin(); + std::vector<std::string>::const_iterator tokenEnd = tokens.end(); + for (; tokenIter != tokenEnd; ++tokenIter) { + + // get token tag/value + const std::string tokenTag = (*tokenIter).substr(0, 2); + const std::string tokenValue = (*tokenIter).substr(3); + + // set program record contents + if (tokenTag == Constants::SAM_PG_ID_TAG) + pg.ID = tokenValue; + else if (tokenTag == Constants::SAM_PG_NAME_TAG) + pg.Name = tokenValue; + else if (tokenTag == Constants::SAM_PG_COMMANDLINE_TAG) + pg.CommandLine = tokenValue; + else if (tokenTag == Constants::SAM_PG_PREVIOUSPROGRAM_TAG) + pg.PreviousProgramID = tokenValue; + else if (tokenTag == Constants::SAM_PG_VERSION_TAG) + pg.Version = tokenValue; + else { // custom tag + CustomHeaderTag otherTag; + otherTag.TagName = tokenTag; + otherTag.TagValue = tokenValue; + pg.CustomTags.push_back(otherTag); + } + } + + // check for required tags + if (!pg.HasID()) + throw BamException("SamFormatParser::ParsePGLine", "@PG line is missing ID tag"); + + // store SAM program entry + m_header.Programs.Add(pg); +} + +void SamFormatParser::ParseCOLine(const std::string& line) +{ + // simply add line to comments list + m_header.Comments.push_back(line); +} + +const std::vector<std::string> SamFormatParser::Split(const std::string& line, const char delim) +{ + std::vector<std::string> tokens; + std::stringstream lineStream(line); + std::string token; + while (std::getline(lineStream, token, delim)) + 
tokens.push_back(token); + return tokens; +} diff --git a/src/api/internal/sam/SamFormatParser_p.h b/src/api/internal/sam/SamFormatParser_p.h new file mode 100644 index 0000000..39bd44a --- /dev/null +++ b/src/api/internal/sam/SamFormatParser_p.h @@ -0,0 +1,62 @@ +// *************************************************************************** +// SamFormatParser.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for parsing SAM header text into SamHeader object +// *************************************************************************** + +#ifndef SAM_FORMAT_PARSER_H +#define SAM_FORMAT_PARSER_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include <string> +#include <vector> + +namespace BamTools { + +struct SamHeader; + +namespace Internal { + +class SamFormatParser +{ + + // ctor & dtor +public: + SamFormatParser(BamTools::SamHeader& header); + ~SamFormatParser(); + + // parse text & populate header data +public: + void Parse(const std::string& headerText); + + // internal methods +private: + void ParseSamLine(const std::string& line); + void ParseHDLine(const std::string& line); + void ParseSQLine(const std::string& line); + void ParseRGLine(const std::string& line); + void ParsePGLine(const std::string& line); + void ParseCOLine(const std::string& line); + const std::vector<std::string> Split(const std::string& line, const char delim); + + // data members +private: + SamHeader& m_header; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // SAM_FORMAT_PARSER_H diff --git a/src/api/internal/sam/SamFormatPrinter_p.cpp b/src/api/internal/sam/SamFormatPrinter_p.cpp new file mode 100644 index 0000000..2b93a04 --- /dev/null +++ b/src/api/internal/sam/SamFormatPrinter_p.cpp @@ -0,0 +1,240 @@ +// *************************************************************************** +// SamFormatPrinter.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 25 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for printing formatted SAM header to string +// *************************************************************************** + +#include "api/internal/sam/SamFormatPrinter_p.h" +#include "api/SamConstants.h" +#include "api/SamHeader.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cstddef> +#include <iostream> +#include <sstream> +#include <vector> + +// ------------------------ +// static utility methods +// ------------------------ + +static inline const std::string 
FormatTag(const std::string& tag, const std::string& value) +{ + return std::string(Constants::SAM_TAB + tag + Constants::SAM_COLON + value); +} + +// --------------------------------- +// SamFormatPrinter implementation +// --------------------------------- + +SamFormatPrinter::SamFormatPrinter(const SamHeader& header) + : m_header(header) +{} + +SamFormatPrinter::~SamFormatPrinter() {} + +const std::string SamFormatPrinter::ToString() const +{ + + // clear out stream + std::stringstream out; + + // generate formatted header text + PrintHD(out); + PrintSQ(out); + PrintRG(out); + PrintPG(out); + PrintCO(out); + + // return result + return out.str(); +} + +void SamFormatPrinter::PrintHD(std::stringstream& out) const +{ + + // if header has @HD data + if (m_header.HasVersion()) { + + // @HD VN:<Version> + out << Constants::SAM_HD_BEGIN_TOKEN + << FormatTag(Constants::SAM_HD_VERSION_TAG, m_header.Version); + + // SO:<SortOrder> + if (m_header.HasSortOrder()) + out << FormatTag(Constants::SAM_HD_SORTORDER_TAG, m_header.SortOrder); + + // GO:<GroupOrder> + if (m_header.HasGroupOrder()) + out << FormatTag(Constants::SAM_HD_GROUPORDER_TAG, m_header.GroupOrder); + + // custom tags + if (!m_header.CustomTags.empty()) { + for (std::size_t i = 0; i < m_header.CustomTags.size(); ++i) { + const CustomHeaderTag& customTag = m_header.CustomTags[i]; + out << FormatTag(customTag.TagName, customTag.TagValue); + } + } + // newline + out << std::endl; + } +} + +void SamFormatPrinter::PrintSQ(std::stringstream& out) const +{ + + // iterate over sequence entries + SamSequenceConstIterator seqIter = m_header.Sequences.ConstBegin(); + SamSequenceConstIterator seqEnd = m_header.Sequences.ConstEnd(); + for (; seqIter != seqEnd; ++seqIter) { + const SamSequence& seq = (*seqIter); + + // @SQ SN:<Name> LN:<Length> + out << Constants::SAM_SQ_BEGIN_TOKEN << FormatTag(Constants::SAM_SQ_NAME_TAG, seq.Name) + << FormatTag(Constants::SAM_SQ_LENGTH_TAG, seq.Length); + + // AS:<AssemblyID> + if 
(seq.HasAssemblyID()) out << FormatTag(Constants::SAM_SQ_ASSEMBLYID_TAG, seq.AssemblyID); + + // M5:<Checksum> + if (seq.HasChecksum()) out << FormatTag(Constants::SAM_SQ_CHECKSUM_TAG, seq.Checksum); + + // SP:<Species> + if (seq.HasSpecies()) out << FormatTag(Constants::SAM_SQ_SPECIES_TAG, seq.Species); + + // UR:<URI> + if (seq.HasURI()) out << FormatTag(Constants::SAM_SQ_URI_TAG, seq.URI); + + // custom tags + if (!seq.CustomTags.empty()) { + for (std::size_t i = 0; i < seq.CustomTags.size(); ++i) { + const CustomHeaderTag& customTag = seq.CustomTags[i]; + out << FormatTag(customTag.TagName, customTag.TagValue); + } + } + + // newline + out << std::endl; + } +} + +void SamFormatPrinter::PrintRG(std::stringstream& out) const +{ + + // iterate over read group entries + SamReadGroupConstIterator rgIter = m_header.ReadGroups.ConstBegin(); + SamReadGroupConstIterator rgEnd = m_header.ReadGroups.ConstEnd(); + for (; rgIter != rgEnd; ++rgIter) { + const SamReadGroup& rg = (*rgIter); + + // @RG ID:<ID> + out << Constants::SAM_RG_BEGIN_TOKEN << FormatTag(Constants::SAM_RG_ID_TAG, rg.ID); + + // CN:<SequencingCenter> + if (rg.HasSequencingCenter()) + out << FormatTag(Constants::SAM_RG_SEQCENTER_TAG, rg.SequencingCenter); + + // DS:<Description> + if (rg.HasDescription()) + out << FormatTag(Constants::SAM_RG_DESCRIPTION_TAG, rg.Description); + + // DT:<ProductionDate> + if (rg.HasProductionDate()) + out << FormatTag(Constants::SAM_RG_PRODUCTIONDATE_TAG, rg.ProductionDate); + + // FO:<FlowOrder> + if (rg.HasFlowOrder()) out << FormatTag(Constants::SAM_RG_FLOWORDER_TAG, rg.FlowOrder); + + // KS:<KeySequence> + if (rg.HasKeySequence()) + out << FormatTag(Constants::SAM_RG_KEYSEQUENCE_TAG, rg.KeySequence); + + // LB:<Library> + if (rg.HasLibrary()) out << FormatTag(Constants::SAM_RG_LIBRARY_TAG, rg.Library); + + // PG:<Program> + if (rg.HasProgram()) out << FormatTag(Constants::SAM_RG_PROGRAM_TAG, rg.Program); + + // PI:<PredictedInsertSize> + if (rg.HasPredictedInsertSize()) 
+ out << FormatTag(Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG, rg.PredictedInsertSize); + + // PL:<SequencingTechnology> + if (rg.HasSequencingTechnology()) + out << FormatTag(Constants::SAM_RG_SEQTECHNOLOGY_TAG, rg.SequencingTechnology); + + // PU:<PlatformUnit> + if (rg.HasPlatformUnit()) + out << FormatTag(Constants::SAM_RG_PLATFORMUNIT_TAG, rg.PlatformUnit); + + // SM:<Sample> + if (rg.HasSample()) out << FormatTag(Constants::SAM_RG_SAMPLE_TAG, rg.Sample); + + // custom tags + if (!rg.CustomTags.empty()) { + for (std::size_t i = 0; i < rg.CustomTags.size(); ++i) { + const CustomHeaderTag& customTag = rg.CustomTags[i]; + out << FormatTag(customTag.TagName, customTag.TagValue); + } + } + + // newline + out << std::endl; + } +} + +void SamFormatPrinter::PrintPG(std::stringstream& out) const +{ + + // iterate over program record entries + SamProgramConstIterator pgIter = m_header.Programs.ConstBegin(); + SamProgramConstIterator pgEnd = m_header.Programs.ConstEnd(); + for (; pgIter != pgEnd; ++pgIter) { + const SamProgram& pg = (*pgIter); + + // @PG ID:<ID> + out << Constants::SAM_PG_BEGIN_TOKEN << FormatTag(Constants::SAM_PG_ID_TAG, pg.ID); + + // PN:<Name> + if (pg.HasName()) out << FormatTag(Constants::SAM_PG_NAME_TAG, pg.Name); + + // CL:<CommandLine> + if (pg.HasCommandLine()) + out << FormatTag(Constants::SAM_PG_COMMANDLINE_TAG, pg.CommandLine); + + // PP:<PreviousProgramID> + if (pg.HasPreviousProgramID()) + out << FormatTag(Constants::SAM_PG_PREVIOUSPROGRAM_TAG, pg.PreviousProgramID); + + // VN:<Version> + if (pg.HasVersion()) out << FormatTag(Constants::SAM_PG_VERSION_TAG, pg.Version); + + // custom tags + if (!pg.CustomTags.empty()) { + for (std::size_t i = 0; i < pg.CustomTags.size(); ++i) { + const CustomHeaderTag& customTag = pg.CustomTags[i]; + out << FormatTag(customTag.TagName, customTag.TagValue); + } + } + + // newline + out << std::endl; + } +} + +void SamFormatPrinter::PrintCO(std::stringstream& out) const +{ + + // iterate over comments + 
std::vector<std::string>::const_iterator commentIter = m_header.Comments.begin(); + std::vector<std::string>::const_iterator commentEnd = m_header.Comments.end(); + for (; commentIter != commentEnd; ++commentIter) { + + // @CO <Comment> + out << Constants::SAM_CO_BEGIN_TOKEN << Constants::SAM_TAB << (*commentIter) << std::endl; + } +} diff --git a/src/api/internal/sam/SamFormatPrinter_p.h b/src/api/internal/sam/SamFormatPrinter_p.h new file mode 100644 index 0000000..b43e5a2 --- /dev/null +++ b/src/api/internal/sam/SamFormatPrinter_p.h @@ -0,0 +1,60 @@ +// *************************************************************************** +// SamFormatPrinter.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 6 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for printing formatted SAM header to string +// *************************************************************************** + +#ifndef SAM_FORMAT_PRINTER_H +#define SAM_FORMAT_PRINTER_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include <sstream> +#include <string> + +namespace BamTools { + +struct SamHeader; + +namespace Internal { + +class SamFormatPrinter +{ + + // ctor & dtor +public: + SamFormatPrinter(const BamTools::SamHeader& header); + ~SamFormatPrinter(); + + // generates SAM-formatted string from header data +public: + const std::string ToString() const; + + // internal methods +private: + void PrintHD(std::stringstream& out) const; + void PrintSQ(std::stringstream& out) const; + void PrintRG(std::stringstream& out) const; + void PrintPG(std::stringstream& out) const; + void PrintCO(std::stringstream& out) const; + + // data members +private: + const SamHeader& m_header; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // SAM_FORMAT_PRINTER_H diff --git a/src/api/internal/sam/SamHeaderValidator_p.cpp b/src/api/internal/sam/SamHeaderValidator_p.cpp new file mode 100644 index 0000000..10320b1 --- /dev/null +++ b/src/api/internal/sam/SamHeaderValidator_p.cpp @@ -0,0 +1,536 @@ +// *************************************************************************** +// SamHeaderValidator.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 25 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for validating SamHeader data +// *************************************************************************** + +#include "api/internal/sam/SamHeaderValidator_p.h" +#include "api/SamConstants.h" +#include "api/SamHeader.h" +#include "api/internal/sam/SamHeaderVersion_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cctype> +#include <cstddef> +#include <set> +#include <sstream> + +// ------------------------ +// static utility methods +// ------------------------- + +static bool caseInsensitiveCompare(const std::string& lhs, const std::string& rhs) +{ + + 
// can omit checking chars if lengths not equal + const int lhsLength = lhs.length(); + const int rhsLength = rhs.length(); + if (lhsLength != rhsLength) return false; + + // do *basic* toupper checks on each string char's + for (int i = 0; i < lhsLength; ++i) { + if (toupper((int)lhs.at(i)) != toupper((int)rhs.at(i))) return false; + } + + // otherwise OK + return true; +} + +// ------------------------------------------------------------------------ +// Allow validation rules to vary, as needed, between SAM header versions +// +// use SAM_VERSION_X_Y to tag important changes +// +// Together, they will allow for comparisons like: +// if ( m_version < SAM_VERSION_2_0 ) { +// // use some older rule +// else +// // use rule introduced with version 2.0 + +static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1, 0); +static const SamHeaderVersion SAM_VERSION_1_1 = SamHeaderVersion(1, 1); +static const SamHeaderVersion SAM_VERSION_1_2 = SamHeaderVersion(1, 2); +static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1, 3); +static const SamHeaderVersion SAM_VERSION_1_4 = SamHeaderVersion(1, 4); + +// TODO: This functionality is currently unused. +// Make validation "version-aware." 
+// +// ------------------------------------------------------------------------ + +const std::string SamHeaderValidator::ERROR_PREFIX = "ERROR: "; +const std::string SamHeaderValidator::WARN_PREFIX = "WARNING: "; +const std::string SamHeaderValidator::NEWLINE(1, '\n'); + +SamHeaderValidator::SamHeaderValidator(const SamHeader& header) + : m_header(header) +{} + +SamHeaderValidator::~SamHeaderValidator() {} + +void SamHeaderValidator::AddError(const std::string& message) +{ + m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE); +} + +void SamHeaderValidator::AddWarning(const std::string& message) +{ + m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE); +} + +void SamHeaderValidator::PrintErrorMessages(std::ostream& stream) +{ + + // skip if no error messages + if (m_errorMessages.empty()) return; + + // print error header line + stream << "* SAM header has " << m_errorMessages.size() << " errors:" << std::endl; + + // print each error message + std::vector<std::string>::const_iterator errorIter = m_errorMessages.begin(); + std::vector<std::string>::const_iterator errorEnd = m_errorMessages.end(); + for (; errorIter != errorEnd; ++errorIter) + stream << (*errorIter); +} + +void SamHeaderValidator::PrintMessages(std::ostream& stream) +{ + PrintErrorMessages(stream); + PrintWarningMessages(stream); +} + +void SamHeaderValidator::PrintWarningMessages(std::ostream& stream) +{ + + // skip if no warning messages + if (m_warningMessages.empty()) return; + + // print warning header line + stream << "* SAM header has " << m_warningMessages.size() << " warnings:" << std::endl; + + // print each warning message + std::vector<std::string>::const_iterator warnIter = m_warningMessages.begin(); + std::vector<std::string>::const_iterator warnEnd = m_warningMessages.end(); + for (; warnIter != warnEnd; ++warnIter) + stream << (*warnIter); +} + +// entry point for validation +bool SamHeaderValidator::Validate() +{ + bool isValid = true; + isValid &= 
ValidateMetadata(); + isValid &= ValidateSequenceDictionary(); + isValid &= ValidateReadGroupDictionary(); + isValid &= ValidateProgramChain(); + return isValid; +} + +// check all SAM header 'metadata' +bool SamHeaderValidator::ValidateMetadata() +{ + bool isValid = true; + isValid &= ValidateVersion(); + isValid &= ValidateSortOrder(); + isValid &= ValidateGroupOrder(); + return isValid; +} + +// check SAM header version tag +bool SamHeaderValidator::ValidateVersion() +{ + + const std::string& version = m_header.Version; + + // warn if version not present + if (version.empty()) { + AddWarning("Version (VN) missing. Not required, but strongly recommended"); + return true; + } + + // invalid if version does not contain a period + const std::size_t periodFound = version.find(Constants::SAM_PERIOD); + if (periodFound == std::string::npos) { + AddError("Invalid version (VN) format: " + version); + return false; + } + + // invalid if major version is empty or contains non-digits + const std::string majorVersion = version.substr(0, periodFound); + if (majorVersion.empty() || !ContainsOnlyDigits(majorVersion)) { + AddError("Invalid version (VN) format: " + version); + return false; + } + + // invalid if major version is empty or contains non-digits + const std::string minorVersion = version.substr(periodFound + 1); + if (minorVersion.empty() || !ContainsOnlyDigits(minorVersion)) { + AddError("Invalid version (VN) format: " + version); + return false; + } + + // TODO: check if version is not just syntactically OK, + // but is also a valid SAM version ( 1.0 .. 
CURRENT ) + + // all checked out this far, then version is OK + return true; +} + +// assumes non-empty input string +bool SamHeaderValidator::ContainsOnlyDigits(const std::string& s) +{ + const std::size_t nonDigitPosition = s.find_first_not_of(Constants::SAM_DIGITS); + return (nonDigitPosition == std::string::npos); +} + +// validate SAM header sort order tag +bool SamHeaderValidator::ValidateSortOrder() +{ + + const std::string& sortOrder = m_header.SortOrder; + + // warn if sort order not present + if (sortOrder.empty()) { + AddWarning("Sort order (SO) missing. Not required, but strongly recommended"); + return true; + } + + // if sort order is valid keyword + if (sortOrder == Constants::SAM_HD_SORTORDER_COORDINATE || + sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME || + sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED) { + return true; + } + + // otherwise + AddError("Invalid sort order (SO): " + sortOrder); + return false; +} + +// validate SAM header group order tag +bool SamHeaderValidator::ValidateGroupOrder() +{ + + const std::string& groupOrder = m_header.GroupOrder; + + // if no group order, no problem, just return OK + if (groupOrder.empty()) return true; + + // if group order is valid keyword + if (groupOrder == Constants::SAM_HD_GROUPORDER_NONE || + groupOrder == Constants::SAM_HD_GROUPORDER_QUERY || + groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE) { + return true; + } + + // otherwise + AddError("Invalid group order (GO): " + groupOrder); + return false; +} + +// validate SAM header sequence dictionary +bool SamHeaderValidator::ValidateSequenceDictionary() +{ + + bool isValid = true; + + // check for unique sequence names + isValid &= ContainsUniqueSequenceNames(); + + // iterate over sequences + const SamSequenceDictionary& sequences = m_header.Sequences; + SamSequenceConstIterator seqIter = sequences.ConstBegin(); + SamSequenceConstIterator seqEnd = sequences.ConstEnd(); + for (; seqIter != seqEnd; ++seqIter) { + const SamSequence& seq 
= (*seqIter); + isValid &= ValidateSequence(seq); + } + + // return validation state + return isValid; +} + +// make sure all SQ names are unique +bool SamHeaderValidator::ContainsUniqueSequenceNames() +{ + + bool isValid = true; + std::set<std::string> sequenceNames; + std::set<std::string>::iterator nameIter; + + // iterate over sequences + const SamSequenceDictionary& sequences = m_header.Sequences; + SamSequenceConstIterator seqIter = sequences.ConstBegin(); + SamSequenceConstIterator seqEnd = sequences.ConstEnd(); + for (; seqIter != seqEnd; ++seqIter) { + const SamSequence& seq = (*seqIter); + + // lookup sequence name + const std::string& name = seq.Name; + nameIter = sequenceNames.find(name); + + // error if found (duplicate entry) + if (nameIter != sequenceNames.end()) { + AddError("Sequence name (SN): " + name + " is not unique"); + isValid = false; + } + + // otherwise ok, store name + sequenceNames.insert(name); + } + + // return validation state + return isValid; +} + +// validate SAM header sequence entry +bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) +{ + bool isValid = true; + isValid &= CheckNameFormat(seq.Name); + isValid &= CheckLengthInRange(seq.Length); + return isValid; +} + +// check sequence name is valid format +bool SamHeaderValidator::CheckNameFormat(const std::string& name) +{ + + // invalid if name is empty + if (name.empty()) { + AddError("Sequence entry (@SQ) is missing SN tag"); + return false; + } + + // invalid if first character is a reserved char + const char firstChar = name.at(0); + if (firstChar == Constants::SAM_EQUAL || firstChar == Constants::SAM_STAR) { + AddError("Invalid sequence name (SN): " + name); + return false; + } + // otherwise OK + return true; +} + +// check that sequence length is within accepted range +bool SamHeaderValidator::CheckLengthInRange(const std::string& length) +{ + + // invalid if empty + if (length.empty()) { + AddError("Sequence entry (@SQ) is missing LN tag"); + return 
false; + } + + // convert string length to numeric + std::stringstream lengthStream(length); + unsigned int sequenceLength; + lengthStream >> sequenceLength; + + // invalid if length outside accepted range + if (sequenceLength < Constants::SAM_SQ_LENGTH_MIN || + sequenceLength > Constants::SAM_SQ_LENGTH_MAX) { + AddError("Sequence length (LN): " + length + " out of range"); + return false; + } + + // otherwise OK + return true; +} + +// validate SAM header read group dictionary +bool SamHeaderValidator::ValidateReadGroupDictionary() +{ + + bool isValid = true; + + // check for unique read group IDs & platform units + isValid &= ContainsUniqueIDsAndPlatformUnits(); + + // iterate over read groups + const SamReadGroupDictionary& readGroups = m_header.ReadGroups; + SamReadGroupConstIterator rgIter = readGroups.ConstBegin(); + SamReadGroupConstIterator rgEnd = readGroups.ConstEnd(); + for (; rgIter != rgEnd; ++rgIter) { + const SamReadGroup& rg = (*rgIter); + isValid &= ValidateReadGroup(rg); + } + + // return validation state + return isValid; +} + +// make sure RG IDs and platform units are unique +bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits() +{ + + bool isValid = true; + std::set<std::string> readGroupIds; + std::set<std::string> platformUnits; + std::set<std::string>::iterator idIter; + std::set<std::string>::iterator puIter; + + // iterate over sequences + const SamReadGroupDictionary& readGroups = m_header.ReadGroups; + SamReadGroupConstIterator rgIter = readGroups.ConstBegin(); + SamReadGroupConstIterator rgEnd = readGroups.ConstEnd(); + for (; rgIter != rgEnd; ++rgIter) { + const SamReadGroup& rg = (*rgIter); + + // -------------------------------- + // check for unique ID + + // lookup read group ID + const std::string& id = rg.ID; + idIter = readGroupIds.find(id); + + // error if found (duplicate entry) + if (idIter != readGroupIds.end()) { + AddError("Read group ID (ID): " + id + " is not unique"); + isValid = false; + } + + // otherwise ok, 
store id + readGroupIds.insert(id); + + // -------------------------------- + // check for unique platform unit + + // lookup platform unit + const std::string& pu = rg.PlatformUnit; + puIter = platformUnits.find(pu); + + // error if found (duplicate entry) + if (puIter != platformUnits.end()) { + AddError("Platform unit (PU): " + pu + " is not unique"); + isValid = false; + } + + // otherwise ok, store platform unit + platformUnits.insert(pu); + } + + // return validation state + return isValid; +} + +// validate SAM header read group entry +bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) +{ + bool isValid = true; + isValid &= CheckReadGroupID(rg.ID); + isValid &= CheckSequencingTechnology(rg.SequencingTechnology); + return isValid; +} + +// make sure RG ID exists +bool SamHeaderValidator::CheckReadGroupID(const std::string& id) +{ + + // invalid if empty + if (id.empty()) { + AddError("Read group entry (@RG) is missing ID tag"); + return false; + } + + // otherwise OK + return true; +} + +// make sure RG sequencing tech is one of the accepted keywords +bool SamHeaderValidator::CheckSequencingTechnology(const std::string& technology) +{ + + // if no technology provided, no problem, just return OK + if (technology.empty()) return true; + + // if technology is valid keyword + if (caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_CAPILLARY) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_HELICOS) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_IONTORRENT) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_LS454) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_PACBIO) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_SOLID)) { + return true; + } + + // otherwise + AddError("Invalid read group sequencing platform (PL): " + technology); + 
return false; +} + +// validate the SAM header "program chain" +bool SamHeaderValidator::ValidateProgramChain() +{ + bool isValid = true; + isValid &= ContainsUniqueProgramIds(); + isValid &= ValidatePreviousProgramIds(); + return isValid; +} + +// make sure all PG IDs are unique +bool SamHeaderValidator::ContainsUniqueProgramIds() +{ + + bool isValid = true; + std::set<std::string> programIds; + std::set<std::string>::iterator pgIdIter; + + // iterate over program records + const SamProgramChain& programs = m_header.Programs; + SamProgramConstIterator pgIter = programs.ConstBegin(); + SamProgramConstIterator pgEnd = programs.ConstEnd(); + for (; pgIter != pgEnd; ++pgIter) { + const SamProgram& pg = (*pgIter); + + // lookup program ID + const std::string& pgId = pg.ID; + pgIdIter = programIds.find(pgId); + + // error if found (duplicate entry) + if (pgIdIter != programIds.end()) { + AddError("Program ID (ID): " + pgId + " is not unique"); + isValid = false; + } + + // otherwise ok, store ID + programIds.insert(pgId); + } + + // return validation state + return isValid; +} + +// make sure that any PP tags present point to existing @PG IDs +bool SamHeaderValidator::ValidatePreviousProgramIds() +{ + + bool isValid = true; + + // iterate over program records + const SamProgramChain& programs = m_header.Programs; + SamProgramConstIterator pgIter = programs.ConstBegin(); + SamProgramConstIterator pgEnd = programs.ConstEnd(); + for (; pgIter != pgEnd; ++pgIter) { + const SamProgram& pg = (*pgIter); + + // ignore record for validation if PreviousProgramID is empty + const std::string& ppId = pg.PreviousProgramID; + if (ppId.empty()) continue; + + // see if program "chain" contains an entry for ppId + if (!programs.Contains(ppId)) { + AddError("PreviousProgramID (PP): " + ppId + " is not a known ID"); + isValid = false; + } + } + + // return validation state + return isValid; +} diff --git a/src/api/internal/sam/SamHeaderValidator_p.h 
b/src/api/internal/sam/SamHeaderValidator_p.h new file mode 100644 index 0000000..579726e --- /dev/null +++ b/src/api/internal/sam/SamHeaderValidator_p.h @@ -0,0 +1,103 @@ +// *************************************************************************** +// SamHeaderValidator.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 6 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for validating SamHeader data +// *************************************************************************** + +#ifndef SAM_HEADER_VALIDATOR_P_H +#define SAM_HEADER_VALIDATOR_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include <iostream> +#include <string> +#include <vector> + +namespace BamTools { + +struct SamHeader; +struct SamReadGroup; +struct SamSequence; + +namespace Internal { + +class SamHeaderValidator +{ + + // ctor & dtor +public: + SamHeaderValidator(const SamHeader& header); + ~SamHeaderValidator(); + + // SamHeaderValidator interface +public: + // prints error & warning messages + void PrintMessages(std::ostream& stream); + + // validates SamHeader data, returns true/false accordingly + bool Validate(); + + // internal methods +private: + // validate header metadata + bool ValidateMetadata(); + bool ValidateVersion(); + bool ContainsOnlyDigits(const std::string& s); + bool ValidateSortOrder(); + bool ValidateGroupOrder(); + + // validate sequence dictionary + bool ValidateSequenceDictionary(); + bool ContainsUniqueSequenceNames(); + bool CheckNameFormat(const std::string& name); + bool ValidateSequence(const SamSequence& seq); + bool CheckLengthInRange(const std::string& length); + + // validate read group dictionary + bool ValidateReadGroupDictionary(); + bool ContainsUniqueIDsAndPlatformUnits(); + bool ValidateReadGroup(const SamReadGroup& rg); + bool CheckReadGroupID(const std::string& id); + bool CheckSequencingTechnology(const std::string& technology); + + // validate program data + bool ValidateProgramChain(); + bool ContainsUniqueProgramIds(); + bool ValidatePreviousProgramIds(); + + // error reporting + void AddError(const std::string& message); + void AddWarning(const std::string& message); + void PrintErrorMessages(std::ostream& stream); + void PrintWarningMessages(std::ostream& stream); + + // data members +private: + // SamHeader being validated + const SamHeader& m_header; + + // error reporting helpers + static const std::string ERROR_PREFIX; + static const std::string WARN_PREFIX; + static const std::string NEWLINE; + + // error reporting messages + std::vector<std::string> m_errorMessages; + std::vector<std::string> m_warningMessages; +}; + +} 
// namespace Internal +} // namespace BamTools + +#endif // SAM_HEADER_VALIDATOR_P_H diff --git a/src/api/internal/sam/SamHeaderVersion_p.h b/src/api/internal/sam/SamHeaderVersion_p.h new file mode 100644 index 0000000..530aa46 --- /dev/null +++ b/src/api/internal/sam/SamHeaderVersion_p.h @@ -0,0 +1,154 @@ +// *************************************************************************** +// SamHeaderVersion.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for comparing SAM header versions +// ************************************************************************* + +#ifndef SAM_HEADERVERSION_P_H +#define SAM_HEADERVERSION_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include <cstddef> +#include <sstream> +#include <string> +#include "api/SamConstants.h" + +namespace BamTools { +namespace Internal { + +class SamHeaderVersion +{ + + // ctors & dtor +public: + SamHeaderVersion() + : m_majorVersion(0) + , m_minorVersion(0) + {} + + explicit SamHeaderVersion(const std::string& version) + : m_majorVersion(0) + , m_minorVersion(0) + { + SetVersion(version); + } + + SamHeaderVersion(const unsigned int& major, const unsigned int& minor) + : m_majorVersion(major) + , m_minorVersion(minor) + {} + + ~SamHeaderVersion() + { + m_majorVersion = 0; + m_minorVersion = 0; + } + + // acess data +public: + unsigned int MajorVersion() const + { + return m_majorVersion; + } + unsigned int MinorVersion() const + { + return m_minorVersion; + } + + void SetVersion(const std::string& version); + std::string ToString() const; + + // data members +private: + unsigned int m_majorVersion; + unsigned int m_minorVersion; +}; + +inline void SamHeaderVersion::SetVersion(const std::string& version) +{ + + // do nothing if version is empty + if (!version.empty()) { + + std::stringstream versionStream; + + // do nothing if period not found + const std::size_t periodFound = version.find(Constants::SAM_PERIOD); + if (periodFound != std::string::npos) { + + // store major version if non-empty and contains only digits + const std::string& majorVersion = version.substr(0, periodFound); + versionStream.str(majorVersion); + if (!majorVersion.empty()) { + const std::size_t nonDigitFound = + majorVersion.find_first_not_of(Constants::SAM_DIGITS); + if (nonDigitFound == std::string::npos) versionStream >> m_majorVersion; + } + + // store minor version if non-empty and contains only digits + const std::string& minorVersion = version.substr(periodFound + 1); + versionStream.str(minorVersion); + if (!minorVersion.empty()) { + const std::size_t nonDigitFound = + minorVersion.find_first_not_of(Constants::SAM_DIGITS); + if (nonDigitFound == std::string::npos) versionStream >> 
m_minorVersion; + } + } + } +} + +// ----------------------------------------------------- +// printing + +inline std::string SamHeaderVersion::ToString() const +{ + std::stringstream version; + version << m_majorVersion << Constants::SAM_PERIOD << m_minorVersion; + return version.str(); +} + +// ----------------------------------------------------- +// comparison operators + +inline bool operator==(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) +{ + return (lhs.MajorVersion() == rhs.MajorVersion()) && (lhs.MinorVersion() == rhs.MinorVersion()); +} + +inline bool operator<(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) +{ + if (lhs.MajorVersion() == rhs.MajorVersion()) + return lhs.MinorVersion() < rhs.MinorVersion(); + else + return lhs.MajorVersion() < rhs.MajorVersion(); +} + +inline bool operator>(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) +{ + return rhs < lhs; +} +inline bool operator<=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) +{ + return !(lhs > rhs); +} +inline bool operator>=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) +{ + return !(lhs < rhs); +} + +} // namespace Internal +} // namespace BamTools + +#endif // SAM_HEADERVERSION_P_H diff --git a/src/api/internal/utils/BamException_p.cpp b/src/api/internal/utils/BamException_p.cpp new file mode 100644 index 0000000..3b38779 --- /dev/null +++ b/src/api/internal/utils/BamException_p.cpp @@ -0,0 +1,14 @@ +// *************************************************************************** +// BamException_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 25 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides a basic exception class for BamTools internals +// *************************************************************************** + +#include 
"api/internal/utils/BamException_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +const std::string BamException::SEPARATOR(": "); diff --git a/src/api/internal/utils/BamException_p.h b/src/api/internal/utils/BamException_p.h new file mode 100644 index 0000000..3a0a175 --- /dev/null +++ b/src/api/internal/utils/BamException_p.h @@ -0,0 +1,53 @@ +// *************************************************************************** +// BamException_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 6 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides a basic exception class for BamTools internals +// *************************************************************************** + +#ifndef BAMEXCEPTION_P_H +#define BAMEXCEPTION_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include <exception> +#include <string> + +namespace BamTools { +namespace Internal { + +class BamException : public std::exception +{ + +public: + inline BamException(const std::string& where, const std::string& message) + : std::exception() + , m_errorString(where + SEPARATOR + message) + {} + + inline ~BamException() throw() {} + + inline const char* what() const throw() + { + return m_errorString.c_str(); + } + +private: + std::string m_errorString; + static const std::string SEPARATOR; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMEXCEPTION_P_H diff --git a/src/api/internal/utils/CMakeLists.txt b/src/api/internal/utils/CMakeLists.txt new file mode 100644 index 0000000..4b1e2c2 --- /dev/null +++ b/src/api/internal/utils/CMakeLists.txt @@ -0,0 +1,15 @@ +# ========================== +# BamTools CMakeLists.txt +# (c) 2011 Derek Barnett +# +# src/api/internal/utils +# ========================== + +set( InternalUtilsDir "${InternalDir}/utils" ) + +set( InternalUtilsSources + ${InternalUtilsDir}/BamException_p.cpp + + PARENT_SCOPE # <-- leave this last +) + diff --git a/src/bamtools.pc.in b/src/bamtools.pc.in new file mode 100644 index 0000000..59c3017 --- /dev/null +++ b/src/bamtools.pc.in @@ -0,0 +1,10 @@ +libdir=@CMAKE_INSTALL_FULL_LIBDIR@ +includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ + +Name: BamTools +Description: BamTools is a C++ library for reading and manipulating BAM files +Version: @BamTools_VERSION@ + +Requires.private: @BAMTOOLS_PRIVATE_DEPS@ +Libs: -L${libdir} -lbamtools +Cflags: -I${includedir} diff --git a/src/shared/bamtools_global.h b/src/shared/bamtools_global.h new file mode 100644 index 0000000..7b128be --- /dev/null +++ b/src/shared/bamtools_global.h @@ -0,0 +1,89 @@ +// *************************************************************************** +// bamtools_global.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// 
--------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides the basic definitions for exporting & importing library symbols. +// Also provides some platform-specific rules for definitions. +// *************************************************************************** + +#ifndef BAMTOOLS_GLOBAL_H +#define BAMTOOLS_GLOBAL_H + +/*! \brief Library export macro + \internal +*/ +#ifndef BAMTOOLS_LIBRARY_EXPORT +#if defined(WIN32) +#define BAMTOOLS_LIBRARY_EXPORT __declspec(dllexport) +#else +#define BAMTOOLS_LIBRARY_EXPORT __attribute__((visibility("default"))) +#endif +#endif // BAMTOOLS_LIBRARY_EXPORT + +/*! \brief Library import macro + \internal +*/ +#ifndef BAMTOOLS_LIBRARY_IMPORT +#if defined(WIN32) +#define BAMTOOLS_LIBRARY_IMPORT __declspec(dllimport) +#else +#define BAMTOOLS_LIBRARY_IMPORT +#endif +#endif // BAMTOOLS_LIBRARY_IMPORT + +/*! \brief Platform-specific type definitions + \internal +*/ +#ifndef BAMTOOLS_LFS +#define BAMTOOLS_LFS +#ifdef WIN32 +#define ftell64(a) _ftelli64(a) +#define fseek64(a, b, c) _fseeki64(a, b, c) +#else +#define ftell64(a) ftello(a) +#define fseek64(a, b, c) fseeko(a, b, c) +#endif +#endif // BAMTOOLS_LFS + +/*! \def ftell64(a) + \brief Platform-independent tell() operation. + \internal +*/ +/*! \def fseek64(a,b,c) + \brief Platform-independent seek() operation. + \internal +*/ + +/*! \brief Platform-specific type definitions + \internal +*/ +#ifndef BAMTOOLS_TYPES +#define BAMTOOLS_TYPES +#include <stdint.h> +#endif // BAMTOOLS_TYPES + +//! \internal +inline void bamtools_noop() {} + +/*! 
\brief Assert definitions + \internal +*/ +#ifndef BAMTOOLS_ASSERTS +#define BAMTOOLS_ASSERTS +#ifdef NDEBUG +#define BT_ASSERT_UNREACHABLE bamtools_noop() +#define BT_ASSERT_X(condition, message) bamtools_noop() +#else +#include <cassert> +#include <stdexcept> +#define BT_ASSERT_UNREACHABLE assert(false) +#define BT_ASSERT_X(condition, message) \ + if (!(condition)) { \ + throw std::runtime_error(message); \ + } +#endif +#endif // BAMTOOLS_ASSERTS + +#endif // BAMTOOLS_GLOBAL_H diff --git a/src/toolkit/CMakeLists.txt b/src/toolkit/CMakeLists.txt new file mode 100644 index 0000000..7a1f676 --- /dev/null +++ b/src/toolkit/CMakeLists.txt @@ -0,0 +1,47 @@ +# ========================== +# BamTools CMakeLists.txt +# (c) 2010 Derek Barnett +# +# src/toolkit +# ========================== + +# set include path +include_directories( ${BamTools_SOURCE_DIR}/src/api + ${BamTools_SOURCE_DIR}/src/utils + ${BamTools_SOURCE_DIR}/src/third_party + ) + +# compile main bamtools application +add_executable( bamtools_cmd + bamtools_convert.cpp + bamtools_count.cpp + bamtools_coverage.cpp + bamtools_filter.cpp + bamtools_header.cpp + bamtools_index.cpp + bamtools_merge.cpp + bamtools_random.cpp + bamtools_resolve.cpp + bamtools_revert.cpp + bamtools_sort.cpp + bamtools_split.cpp + bamtools_stats.cpp + bamtools.cpp + ) + +# set BamTools application properties +set_target_properties( bamtools_cmd PROPERTIES + OUTPUT_NAME "bamtools" + ) +# make version info available in application +configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/bamtools_version.h.in ${CMAKE_CURRENT_BINARY_DIR}/bamtools_version.h ) +include_directories( ${CMAKE_CURRENT_BINARY_DIR} ) + +# set include paths for system JsonCpp +target_include_directories( bamtools_cmd PRIVATE ${JSONCPP_INCLUDE_DIRS} ) + +# define libraries to link +target_link_libraries( bamtools_cmd BamTools BamTools-utils ${JSONCPP_LDFLAGS} ) + +# set application install destinations +install( TARGETS bamtools_cmd DESTINATION "${CMAKE_INSTALL_BINDIR}" ) diff 
--git a/src/toolkit/bamtools.cpp b/src/toolkit/bamtools.cpp new file mode 100644 index 0000000..34a99ae --- /dev/null +++ b/src/toolkit/bamtools.cpp @@ -0,0 +1,174 @@ +// *************************************************************************** +// bamtools.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 12 October 2012 (DB) +// --------------------------------------------------------------------------- +// Integrates a number of BamTools functionalities into a single executable. +// *************************************************************************** + +#include <cstdio> +#include <cstdlib> +#include <iostream> +#include <sstream> +#include <string> +#include "bamtools_convert.h" +#include "bamtools_count.h" +#include "bamtools_coverage.h" +#include "bamtools_filter.h" +#include "bamtools_header.h" +#include "bamtools_index.h" +#include "bamtools_merge.h" +#include "bamtools_random.h" +#include "bamtools_resolve.h" +#include "bamtools_revert.h" +#include "bamtools_sort.h" +#include "bamtools_split.h" +#include "bamtools_stats.h" +#include "bamtools_version.h" +using namespace BamTools; + +// bamtools subtool names +static const std::string CONVERT = "convert"; +static const std::string COUNT = "count"; +static const std::string COVERAGE = "coverage"; +static const std::string FILTER = "filter"; +static const std::string HEADER = "header"; +static const std::string INDEX = "index"; +static const std::string MERGE = "merge"; +static const std::string RANDOM = "random"; +static const std::string RESOLVE = "resolve"; +static const std::string REVERT = "revert"; +static const std::string SORT = "sort"; +static const std::string SPLIT = "split"; +static const std::string STATS = "stats"; + +// bamtools help/version constants +static const std::string HELP = "help"; +static const std::string LONG_HELP = "--help"; 
+static const std::string SHORT_HELP = "-h"; +static const std::string VERSION = "version"; +static const std::string LONG_VERSION = "--version"; +static const std::string SHORT_VERSION = "-v"; + +// determine if string is a help constant +static bool IsHelp(char* str) +{ + return (str == HELP || str == LONG_HELP || str == SHORT_HELP); +} + +// determine if string is a version constant +static bool IsVersion(char* str) +{ + return (str == VERSION || str == LONG_VERSION || str == SHORT_VERSION); +} + +// subtool factory method +AbstractTool* CreateTool(const std::string& arg) +{ + + // determine tool type based on arg + if (arg == CONVERT) return new ConvertTool; + if (arg == COUNT) return new CountTool; + if (arg == COVERAGE) return new CoverageTool; + if (arg == FILTER) return new FilterTool; + if (arg == HEADER) return new HeaderTool; + if (arg == INDEX) return new IndexTool; + if (arg == MERGE) return new MergeTool; + if (arg == RANDOM) return new RandomTool; + if (arg == RESOLVE) return new ResolveTool; + if (arg == REVERT) return new RevertTool; + if (arg == SORT) return new SortTool; + if (arg == SPLIT) return new SplitTool; + if (arg == STATS) return new StatsTool; + + // unknown arg + return 0; +} + +// print help info +int Help(int argc, char* argv[]) +{ + + // check for 'bamtools help COMMAND' to print tool-specific help message + if (argc > 2) { + + // determine desired sub-tool + AbstractTool* tool = CreateTool(argv[2]); + + // if tool known, print its help screen + if (tool) return tool->Help(); + } + + // print general BamTools help message + std::cerr << std::endl; + std::cerr << "usage: bamtools [--help] COMMAND [ARGS]" << std::endl; + std::cerr << std::endl; + std::cerr << "Available bamtools commands:" << std::endl; + std::cerr << "\tconvert Converts between BAM and a number of other formats" + << std::endl; + std::cerr << "\tcount Prints number of alignments in BAM file(s)" << std::endl; + std::cerr << "\tcoverage Prints coverage statistics from 
the input BAM file" + << std::endl; + std::cerr << "\tfilter Filters BAM file(s) by user-specified criteria" << std::endl; + std::cerr << "\theader Prints BAM header information" << std::endl; + std::cerr << "\tindex Generates index for BAM file" << std::endl; + std::cerr << "\tmerge Merge multiple BAM files into single file" << std::endl; + std::cerr << "\trandom Select random alignments from existing BAM file(s), intended " + "more as a testing tool." + << std::endl; + std::cerr + << "\tresolve Resolves paired-end reads (marking the IsProperPair flag as needed)" + << std::endl; + std::cerr << "\trevert Removes duplicate marks and restores original base qualities" + << std::endl; + std::cerr << "\tsort Sorts the BAM file according to some criteria" << std::endl; + std::cerr << "\tsplit Splits a BAM file on user-specified property, creating a new " + "BAM output file for each value found" + << std::endl; + std::cerr << "\tstats Prints some basic statistics from input BAM file(s)" + << std::endl; + std::cerr << std::endl; + std::cerr << "See 'bamtools help COMMAND' for more information on a specific command." + << std::endl; + std::cerr << std::endl; + return EXIT_SUCCESS; +} + +// print version info +int Version() +{ + + std::stringstream versionStream; + versionStream << BAMTOOLS_VERSION_MAJOR << '.' << BAMTOOLS_VERSION_MINOR << '.' 
+ << BAMTOOLS_VERSION_PATCH; + + std::cout << std::endl; + std::cout << "bamtools " << versionStream.str() << std::endl; + std::cout << "Part of BamTools API and toolkit" << std::endl; + std::cout << "Primary authors: Derek Barnett, Erik Garrison, Michael Stromberg" << std::endl; + std::cout << "(c) 2009-2012 Marth Lab, Biology Dept., Boston College" << std::endl; + std::cout << std::endl; + return EXIT_SUCCESS; +} + +// toolkit entry point +int main(int argc, char* argv[]) +{ + + // just 'bamtools' + if (argc == 1) return Help(argc, argv); + + // 'bamtools help', 'bamtools --help', or 'bamtools -h' + if (IsHelp(argv[1])) return Help(argc, argv); + + // 'bamtools version', 'bamtools --version', or 'bamtools -v' + if (IsVersion(argv[1])) return Version(); + + // determine desired sub-tool, run if found + AbstractTool* tool = CreateTool(argv[1]); + if (tool) return tool->Run(argc, argv); + + // no tool matched, show help + return Help(argc, argv); +} diff --git a/src/toolkit/bamtools_convert.cpp b/src/toolkit/bamtools_convert.cpp new file mode 100644 index 0000000..bd32218 --- /dev/null +++ b/src/toolkit/bamtools_convert.cpp @@ -0,0 +1,967 @@ +// *************************************************************************** +// bamtools_convert.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 December 2012 +// --------------------------------------------------------------------------- +// Converts between BAM and a number of other formats +// *************************************************************************** + +#include "bamtools_convert.h" + +#include <api/BamConstants.h> +#include <api/BamMultiReader.h> +#include <utils/bamtools_fasta.h> +#include <utils/bamtools_options.h> +#include <utils/bamtools_pileup_engine.h> +#include <utils/bamtools_utilities.h> +using namespace BamTools; + +#include <cstddef> +#include 
// Pileup visitor used by the 'pileup' output format: Visit() writes one
// tab-delimited text line per visited pileup position to the stream supplied
// at construction.
class ConvertPileupFormatVisitor : public PileupVisitor
{

    // ctor & dtor
public:
    // references             - reference sequence list (copied into m_references)
    // fastaFilename          - optional FASTA reference; empty means "none"
    // isPrintingMapQualities - whether Visit() also collects mapping qualities
    // out                    - destination stream for the pileup lines
    ConvertPileupFormatVisitor(const RefVector& references, const std::string& fastaFilename,
                               const bool isPrintingMapQualities, std::ostream* out);
    ~ConvertPileupFormatVisitor();

    // PileupVisitor interface implementation
public:
    void Visit(const PileupPosition& pileupData);

    // data members
private:
    Fasta m_fasta;                  // FASTA reader, opened by the ctor when a filename is given
    bool m_hasFasta;                // true only if m_fasta was opened successfully
    bool m_isPrintingMapQualities;  // append per-alignment mapping qualities to each line
    std::ostream* m_out;            // output stream; the destructor does not delete it
    RefVector m_references;         // reference names/lengths, indexed by reference ID
};
HasRegion(false) + , HasFastaFilename(false) + , IsOmittingSamHeader(false) + , IsPrintingPileupMapQualities(false) + , OutputFilename(Options::StandardOut()) + {} +}; + +// --------------------------------------------- +// ConvertToolPrivate implementation + +struct ConvertTool::ConvertToolPrivate +{ + + // ctor & dtor +public: + ConvertToolPrivate(ConvertTool::ConvertSettings* settings) + : m_settings(settings) + , m_out(std::cout.rdbuf()) + {} + + ~ConvertToolPrivate() {} + + // interface +public: + bool Run(); + + // internal methods +private: + void PrintBed(const BamAlignment& a); + void PrintFasta(const BamAlignment& a); + void PrintFastq(const BamAlignment& a); + void PrintJson(const BamAlignment& a); + void PrintSam(const BamAlignment& a); + void PrintYaml(const BamAlignment& a); + + // special case - uses the PileupEngine + bool RunPileupConversion(BamMultiReader* reader); + + // data members +private: + ConvertTool::ConvertSettings* m_settings; + RefVector m_references; + std::ostream m_out; +}; + +bool ConvertTool::ConvertToolPrivate::Run() +{ + + // ------------------------------------ + // initialize conversion input/output + + // set to default input if none provided + if (!m_settings->HasInput && !m_settings->HasInputFilelist) + m_settings->InputFiles.push_back(Options::StandardIn()); + + // add files in the filelist to the input file list + if (m_settings->HasInputFilelist) { + + std::ifstream filelist(m_settings->InputFilelist.c_str(), std::ios::in); + if (!filelist.is_open()) { + std::cerr << "bamtools convert ERROR: could not open input BAM file list... Aborting." + << std::endl; + return false; + } + + std::string line; + while (std::getline(filelist, line)) + m_settings->InputFiles.push_back(line); + } + + // open input files + BamMultiReader reader; + if (!reader.Open(m_settings->InputFiles)) { + std::cerr << "bamtools convert ERROR: could not open input BAM file(s)... Aborting." 
+ << std::endl; + return false; + } + + // if input is not stdin & a region is provided, look for index files + if (m_settings->HasInput && m_settings->HasRegion) { + if (!reader.LocateIndexes()) { + std::cerr << "bamtools convert ERROR: could not locate index file(s)... Aborting." + << std::endl; + return false; + } + } + + // retrieve reference data + m_references = reader.GetReferenceData(); + + // set region if specified + BamRegion region; + if (m_settings->HasRegion) { + if (Utilities::ParseRegionString(m_settings->Region, reader, region)) { + + if (reader.HasIndexes()) { + if (!reader.SetRegion(region)) { + std::cerr << "bamtools convert ERROR: set region failed. Check that REGION " + "describes a valid range" + << std::endl; + reader.Close(); + return false; + } + } + + } else { + std::cerr << "bamtools convert ERROR: could not parse REGION: " << m_settings->Region + << std::endl; + std::cerr << "Check that REGION is in valid format (see documentation) and that the " + "coordinates are valid" + << std::endl; + reader.Close(); + return false; + } + } + + // if output file given + std::ofstream outFile; + if (m_settings->HasOutput) { + + // open output file stream + outFile.open(m_settings->OutputFilename.c_str()); + if (!outFile) { + std::cerr << "bamtools convert ERROR: could not open " << m_settings->OutputFilename + << " for output" << std::endl; + return false; + } + + // set m_out to file's streambuf + m_out.rdbuf(outFile.rdbuf()); + } + + // ------------------------------------- + // do conversion based on format + + bool convertedOk = true; + + // pileup is special case + // conversion not done per alignment, like the other formats + if (m_settings->Format == FORMAT_PILEUP) convertedOk = RunPileupConversion(&reader); + + // all other formats + else { + + bool formatError = false; + + // set function pointer to proper conversion method + void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0; + if (m_settings->Format == 
FORMAT_BED) + pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed; + else if (m_settings->Format == FORMAT_FASTA) + pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta; + else if (m_settings->Format == FORMAT_FASTQ) + pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq; + else if (m_settings->Format == FORMAT_JSON) + pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson; + else if (m_settings->Format == FORMAT_SAM) + pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam; + else if (m_settings->Format == FORMAT_YAML) + pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintYaml; + else { + std::cerr << "bamtools convert ERROR: unrecognized format: " << m_settings->Format + << std::endl; + std::cerr << "Please see documentation for list of supported formats " << std::endl; + formatError = true; + convertedOk = false; + } + + // if format selected ok + if (!formatError) { + + // if SAM format & not omitting header, print SAM header first + if ((m_settings->Format == FORMAT_SAM) && !m_settings->IsOmittingSamHeader) + m_out << reader.GetHeaderText(); + + // iterate through file, doing conversion + BamAlignment a; + while (reader.GetNextAlignment(a)) + (this->*pFunction)(a); + + // set flag for successful conversion + convertedOk = true; + } + } + + // ------------------------ + // clean up & exit + reader.Close(); + if (m_settings->HasOutput) outFile.close(); + return convertedOk; +} + +// ---------------------------------------------------------- +// Conversion/output methods +// ---------------------------------------------------------- + +void ConvertTool::ConvertToolPrivate::PrintBed(const BamAlignment& a) +{ + + // tab-delimited, 0-based half-open + // (e.g. 
a 50-base read aligned to pos 10 could have BED coordinates (10, 60) instead of BAM coordinates (10, 59) ) + // <chromName> <chromStart> <chromEnd> <readName> <score> <strand> + + m_out << m_references.at(a.RefID).RefName << '\t' << a.Position << '\t' << a.GetEndPosition() + << '\t' << a.Name << '\t' << a.MapQuality << '\t' << (a.IsReverseStrand() ? '-' : '+') + << std::endl; +} + +// print BamAlignment in FASTA format +// N.B. - uses QueryBases NOT AlignedBases +void ConvertTool::ConvertToolPrivate::PrintFasta(const BamAlignment& a) +{ + + // >BamAlignment.Name + // BamAlignment.QueryBases (up to FASTA_LINE_MAX bases per line) + // ... + // + // N.B. - QueryBases are reverse-complemented if aligned to reverse strand + + // print header + m_out << '>' << a.Name << std::endl; + + // handle reverse strand alignment - bases + std::string sequence = a.QueryBases; + if (a.IsReverseStrand()) Utilities::ReverseComplement(sequence); + + // if sequence fits on single line + if (sequence.length() <= FASTA_LINE_MAX) m_out << sequence << std::endl; + + // else split over multiple lines + else { + + std::size_t position = 0; + std::size_t seqLength = + sequence.length(); // handle reverse strand alignment - bases & qualitiesth(); + + // write subsequences to each line + while (position < (seqLength - FASTA_LINE_MAX)) { + m_out << sequence.substr(position, FASTA_LINE_MAX) << std::endl; + position += FASTA_LINE_MAX; + } + + // write final subsequence + m_out << sequence.substr(position) << std::endl; + } +} + +// print BamAlignment in FASTQ format +// N.B. - uses QueryBases NOT AlignedBases +void ConvertTool::ConvertToolPrivate::PrintFastq(const BamAlignment& a) +{ + + // @BamAlignment.Name + // BamAlignment.QueryBases + // + + // BamAlignment.Qualities + // + // N.B. - QueryBases are reverse-complemented (& Qualities reversed) if aligned to reverse strand . + // Name is appended "/1" or "/2" if paired-end, to reflect which mate this entry is. 
+ + // handle paired-end alignments + std::string name = a.Name; + if (a.IsPaired()) name.append((a.IsFirstMate() ? "/1" : "/2")); + + // handle reverse strand alignment - bases & qualities + std::string qualities = a.Qualities; + std::string sequence = a.QueryBases; + if (a.IsReverseStrand()) { + Utilities::Reverse(qualities); + Utilities::ReverseComplement(sequence); + } + + // write to output stream + m_out << '@' << name << std::endl + << sequence << std::endl + << '+' << std::endl + << qualities << std::endl; +} + +// print BamAlignment in JSON format +void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) +{ + + // write name & alignment flag + m_out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\"" << a.AlignmentFlag << "\","; + + // write reference name + if ((a.RefID >= 0) && (a.RefID < (int)m_references.size())) + m_out << "\"reference\":\"" << m_references[a.RefID].RefName << "\","; + + // write position & map quality + m_out << "\"position\":" << a.Position + 1 << ",\"mapQuality\":" << a.MapQuality << ','; + + // write CIGAR + const std::vector<CigarOp>& cigarData = a.CigarData; + if (!cigarData.empty()) { + m_out << "\"cigar\":["; + std::vector<CigarOp>::const_iterator cigarBegin = cigarData.begin(); + std::vector<CigarOp>::const_iterator cigarIter = cigarBegin; + std::vector<CigarOp>::const_iterator cigarEnd = cigarData.end(); + for (; cigarIter != cigarEnd; ++cigarIter) { + const CigarOp& op = (*cigarIter); + if (cigarIter != cigarBegin) m_out << ','; + m_out << '"' << op.Length << op.Type << '"'; + } + m_out << "],"; + } + + // write mate reference name, mate position, & insert size + if (a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size())) { + m_out << "\"mate\":{" + << "\"reference\":\"" << m_references[a.MateRefID].RefName << "\"," + << "\"position\":" << a.MatePosition + 1 << ",\"insertSize\":" << a.InsertSize + << "},"; + } + + // write sequence + if (!a.QueryBases.empty()) m_out << 
"\"queryBases\":\"" << a.QueryBases << "\","; + + // write qualities + if (!a.Qualities.empty() && a.Qualities.at(0) != (char)0xFF) { + std::string::const_iterator s = a.Qualities.begin(); + m_out << "\"qualities\":[" << static_cast<short>(*s) - 33; + ++s; + for (; s != a.Qualities.end(); ++s) + m_out << ',' << static_cast<short>(*s) - 33; + m_out << "],"; + } + + // write alignment's source BAM file + m_out << "\"filename\":\"" << a.Filename << "\","; + + // write tag data + const char* tagData = a.TagData.c_str(); + const std::size_t tagDataLength = a.TagData.length(); + std::size_t index = 0; + if (index < tagDataLength) { + + m_out << "\"tags\":{"; + + while (index < tagDataLength) { + + if (index > 0) m_out << ','; + + // write tag name + m_out << '"' << a.TagData.substr(index, 2) << "\":"; + index += 2; + + // get data type + char type = a.TagData.at(index); + ++index; + switch (type) { + case (Constants::BAM_TAG_TYPE_ASCII): + m_out << '"' << tagData[index] << '"'; + ++index; + break; + + case (Constants::BAM_TAG_TYPE_INT8): + // force value into integer-type (instead of char value) + m_out << static_cast<int16_t>(tagData[index]); + ++index; + break; + + case (Constants::BAM_TAG_TYPE_UINT8): + // force value into integer-type (instead of char value) + m_out << static_cast<uint16_t>(tagData[index]); + ++index; + break; + + case (Constants::BAM_TAG_TYPE_INT16): + m_out << BamTools::UnpackSignedShort(&tagData[index]); + index += sizeof(int16_t); + break; + + case (Constants::BAM_TAG_TYPE_UINT16): + m_out << BamTools::UnpackUnsignedShort(&tagData[index]); + index += sizeof(uint16_t); + break; + + case (Constants::BAM_TAG_TYPE_INT32): + m_out << BamTools::UnpackSignedInt(&tagData[index]); + index += sizeof(int32_t); + break; + + case (Constants::BAM_TAG_TYPE_UINT32): + m_out << BamTools::UnpackUnsignedInt(&tagData[index]); + index += sizeof(uint32_t); + break; + + case (Constants::BAM_TAG_TYPE_FLOAT): + m_out << BamTools::UnpackFloat(&tagData[index]); + index += 
sizeof(float); + break; + + case (Constants::BAM_TAG_TYPE_HEX): + case (Constants::BAM_TAG_TYPE_STRING): + m_out << '"'; + while (tagData[index]) { + if (tagData[index] == '\"') + m_out << "\\\""; // escape for json + else + m_out << tagData[index]; + ++index; + } + m_out << '"'; + ++index; + break; + } + + if (tagData[index] == '\0') break; + } + + m_out << '}'; + } + + m_out << '}' << std::endl; +} + +// print BamAlignment in SAM format +void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a) +{ + + // tab-delimited + // <QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> [ <TAG>:<VTYPE>:<VALUE> [...] ] + + // write name & alignment flag + m_out << a.Name << '\t' << a.AlignmentFlag << '\t'; + + // write reference name + if ((a.RefID >= 0) && (a.RefID < (int)m_references.size())) + m_out << m_references[a.RefID].RefName << '\t'; + else + m_out << "*\t"; + + // write position & map quality + m_out << a.Position + 1 << '\t' << a.MapQuality << '\t'; + + // write CIGAR + const std::vector<CigarOp>& cigarData = a.CigarData; + if (cigarData.empty()) + m_out << "*\t"; + else { + std::vector<CigarOp>::const_iterator cigarIter = cigarData.begin(); + std::vector<CigarOp>::const_iterator cigarEnd = cigarData.end(); + for (; cigarIter != cigarEnd; ++cigarIter) { + const CigarOp& op = (*cigarIter); + m_out << op.Length << op.Type; + } + m_out << '\t'; + } + + // write mate reference name, mate position, & insert size + if (a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size())) { + if (a.MateRefID == a.RefID) + m_out << "=\t"; + else + m_out << m_references[a.MateRefID].RefName << '\t'; + m_out << a.MatePosition + 1 << '\t' << a.InsertSize << '\t'; + } else + m_out << "*\t0\t0\t"; + + // write sequence + if (a.QueryBases.empty()) + m_out << "*\t"; + else + m_out << a.QueryBases << '\t'; + + // write qualities + if (a.Qualities.empty() || (a.Qualities.at(0) == (char)0xFF)) + m_out << '*'; + else + m_out << 
a.Qualities; + + // write tag data + const char* tagData = a.TagData.c_str(); + const std::size_t tagDataLength = a.TagData.length(); + + std::size_t index = 0; + while (index < tagDataLength) { + + // write tag name + std::string tagName = a.TagData.substr(index, 2); + m_out << '\t' << tagName << ':'; + index += 2; + + // get data type + char type = a.TagData.at(index); + ++index; + switch (type) { + case (Constants::BAM_TAG_TYPE_ASCII): + m_out << "A:" << tagData[index]; + ++index; + break; + + case (Constants::BAM_TAG_TYPE_INT8): + // force value into integer-type (instead of char value) + m_out << "i:" << static_cast<int16_t>(tagData[index]); + ++index; + break; + + case (Constants::BAM_TAG_TYPE_UINT8): + // force value into integer-type (instead of char value) + m_out << "i:" << static_cast<uint16_t>(tagData[index]); + ++index; + break; + + case (Constants::BAM_TAG_TYPE_INT16): + m_out << "i:" << BamTools::UnpackSignedShort(&tagData[index]); + index += sizeof(int16_t); + break; + + case (Constants::BAM_TAG_TYPE_UINT16): + m_out << "i:" << BamTools::UnpackUnsignedShort(&tagData[index]); + index += sizeof(uint16_t); + break; + + case (Constants::BAM_TAG_TYPE_INT32): + m_out << "i:" << BamTools::UnpackSignedInt(&tagData[index]); + index += sizeof(int32_t); + break; + + case (Constants::BAM_TAG_TYPE_UINT32): + m_out << "i:" << BamTools::UnpackUnsignedInt(&tagData[index]); + index += sizeof(uint32_t); + break; + + case (Constants::BAM_TAG_TYPE_FLOAT): + m_out << "f:" << BamTools::UnpackFloat(&tagData[index]); + index += sizeof(float); + break; + + case (Constants::BAM_TAG_TYPE_HEX): // fall-through + case (Constants::BAM_TAG_TYPE_STRING): + m_out << type << ':'; + while (tagData[index]) { + m_out << tagData[index]; + ++index; + } + ++index; + break; + } + + if (tagData[index] == '\0') break; + } + + m_out << std::endl; +} + +// Print BamAlignment in YAML format +void ConvertTool::ConvertToolPrivate::PrintYaml(const BamAlignment& a) +{ + + // write alignment name + 
m_out << "---" << std::endl; + m_out << a.Name << ':' << std::endl; + + // write alignment data + m_out << " " + << "AlndBases: " << a.AlignedBases << std::endl; + m_out << " " + << "Qualities: " << a.Qualities << std::endl; + m_out << " " + << "Name: " << a.Name << std::endl; + m_out << " " + << "Length: " << a.Length << std::endl; + m_out << " " + << "TagData: " << a.TagData << std::endl; + m_out << " " + << "RefID: " << a.RefID << std::endl; + m_out << " " + << "RefName: " << m_references[a.RefID].RefName << std::endl; + m_out << " " + << "Position: " << a.Position << std::endl; + m_out << " " + << "Bin: " << a.Bin << std::endl; + m_out << " " + << "MapQuality: " << a.MapQuality << std::endl; + m_out << " " + << "AlignmentFlag: " << a.AlignmentFlag << std::endl; + m_out << " " + << "MateRefID: " << a.MateRefID << std::endl; + m_out << " " + << "MatePosition: " << a.MatePosition << std::endl; + m_out << " " + << "InsertSize: " << a.InsertSize << std::endl; + m_out << " " + << "Filename: " << a.Filename << std::endl; + + // write Cigar data + const std::vector<CigarOp>& cigarData = a.CigarData; + if (!cigarData.empty()) { + m_out << " " + << "Cigar: "; + std::vector<CigarOp>::const_iterator cigarBegin = cigarData.begin(); + std::vector<CigarOp>::const_iterator cigarIter = cigarBegin; + std::vector<CigarOp>::const_iterator cigarEnd = cigarData.end(); + for (; cigarIter != cigarEnd; ++cigarIter) { + const CigarOp& op = (*cigarIter); + m_out << op.Length << op.Type; + } + m_out << std::endl; + } +} + +bool ConvertTool::ConvertToolPrivate::RunPileupConversion(BamMultiReader* reader) +{ + + // check for valid BamMultiReader + if (reader == 0) return false; + + // set up our pileup format 'visitor' + ConvertPileupFormatVisitor* v = new ConvertPileupFormatVisitor( + m_references, m_settings->FastaFilename, m_settings->IsPrintingPileupMapQualities, &m_out); + + // set up PileupEngine + PileupEngine pileup; + pileup.AddVisitor(v); + + // iterate through data + BamAlignment 
al; + while (reader->GetNextAlignment(al)) + pileup.AddAlignment(al); + pileup.Flush(); + + // clean up + delete v; + v = 0; + + // return success + return true; +} + +// --------------------------------------------- +// ConvertTool implementation + +ConvertTool::ConvertTool() + : AbstractTool() + , m_settings(new ConvertSettings) + , m_impl(0) +{ + // set program details + Options::SetProgramInfo("bamtools convert", "converts BAM to a number of other formats", + "-format <FORMAT> [-in <filename> -in <filename> ... | -list " + "<filelist>] [-out <filename>] [-region <REGION>] [format-specific " + "options]"); + + // set up options + OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); + Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", + m_settings->HasInput, m_settings->InputFiles, IO_Opts, + Options::StandardIn()); + Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "", + m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts); + Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", + m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, + Options::StandardOut()); + Options::AddValueOption("-format", "FORMAT", + "the output file format - see README for recognized formats", "", + m_settings->HasFormat, m_settings->Format, IO_Opts); + Options::AddValueOption("-region", "REGION", + "genomic region. Index file is recommended for better performance, and " + "is used automatically if it exists. 
See \'bamtools help index\' for " + "more details on creating one", + "", m_settings->HasRegion, m_settings->Region, IO_Opts); + + OptionGroup* PileupOpts = Options::CreateOptionGroup("Pileup Options"); + Options::AddValueOption("-fasta", "FASTA filename", "FASTA reference file", "", + m_settings->HasFastaFilename, m_settings->FastaFilename, PileupOpts); + Options::AddOption("-mapqual", "print the mapping qualities", + m_settings->IsPrintingPileupMapQualities, PileupOpts); + + OptionGroup* SamOpts = Options::CreateOptionGroup("SAM Options"); + Options::AddOption("-noheader", "omit the SAM header from output", + m_settings->IsOmittingSamHeader, SamOpts); +} + +ConvertTool::~ConvertTool() +{ + + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int ConvertTool::Help() +{ + Options::DisplayHelp(); + return 0; +} + +int ConvertTool::Run(int argc, char* argv[]) +{ + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // initialize ConvertTool with settings + m_impl = new ConvertToolPrivate(m_settings); + + // run ConvertTool, return success/fail + if (m_impl->Run()) + return 0; + else + return 1; +} + +// --------------------------------------------- +// ConvertPileupFormatVisitor implementation + +ConvertPileupFormatVisitor::ConvertPileupFormatVisitor(const RefVector& references, + const std::string& fastaFilename, + const bool isPrintingMapQualities, + std::ostream* out) + : PileupVisitor() + , m_hasFasta(false) + , m_isPrintingMapQualities(isPrintingMapQualities) + , m_out(out) + , m_references(references) +{ + // set up Fasta reader if file is provided + if (!fastaFilename.empty()) { + + // check for FASTA index + std::string indexFilename; + if (Utilities::FileExists(fastaFilename + ".fai")) indexFilename = fastaFilename + ".fai"; + + // open FASTA file + if (m_fasta.Open(fastaFilename, indexFilename)) m_hasFasta = true; + } +} + +ConvertPileupFormatVisitor::~ConvertPileupFormatVisitor() +{ + // be sure to close Fasta 
reader + if (m_hasFasta) { + m_fasta.Close(); + m_hasFasta = false; + } +} + +void ConvertPileupFormatVisitor::Visit(const PileupPosition& pileupData) +{ + + // skip if no alignments at this position + if (pileupData.PileupAlignments.empty()) return; + + // retrieve reference name + const std::string& referenceName = m_references[pileupData.RefId].RefName; + const int& position = pileupData.Position; + + // retrieve reference base from FASTA file, if one provided; otherwise default to 'N' + char referenceBase('N'); + if (m_hasFasta && (pileupData.Position < m_references[pileupData.RefId].RefLength)) { + if (!m_fasta.GetBase(pileupData.RefId, pileupData.Position, referenceBase)) { + std::cerr << "bamtools convert ERROR: pileup conversion - could not read reference " + "base from FASTA file" + << std::endl; + return; + } + } + + // get count of alleles at this position + const int numberAlleles = pileupData.PileupAlignments.size(); + + // ----------------------------------------------------------- + // build strings based on alleles at this positionInAlignment + + std::stringstream bases; + std::stringstream baseQualities; + std::stringstream mapQualities; + + // iterate over alignments at this pileup position + std::vector<PileupAlignment>::const_iterator pileupIter = pileupData.PileupAlignments.begin(); + std::vector<PileupAlignment>::const_iterator pileupEnd = pileupData.PileupAlignments.end(); + for (; pileupIter != pileupEnd; ++pileupIter) { + const PileupAlignment pa = (*pileupIter); + const BamAlignment& ba = pa.Alignment; + + // if beginning of read segment + if (pa.IsSegmentBegin) + bases << '^' + << (((int)ba.MapQuality > 93) ? 
(char)126 : (char)((int)ba.MapQuality + 33)); + + // if current base is not a DELETION + if (!pa.IsCurrentDeletion) { + + // get base at current position + char base = ba.QueryBases.at(pa.PositionInAlignment); + + // if base matches reference + if (base == '=' || toupper(base) == toupper(referenceBase) || + tolower(base) == tolower(referenceBase)) { + base = (ba.IsReverseStrand() ? ',' : '.'); + } + + // mismatches reference + else + base = (ba.IsReverseStrand() ? tolower(base) : toupper(base)); + + // store base + bases << base; + + // if next position contains insertion + if (pa.IsNextInsertion) { + bases << '+' << pa.InsertionLength; + for (int i = 1; i <= pa.InsertionLength; ++i) { + char insertedBase = (char)ba.QueryBases.at(pa.PositionInAlignment + i); + bases << (ba.IsReverseStrand() ? (char)tolower(insertedBase) + : (char)toupper(insertedBase)); + } + } + + // if next position contains DELETION + else if (pa.IsNextDeletion) { + bases << '-' << pa.DeletionLength; + for (int i = 1; i <= pa.DeletionLength; ++i) { + char deletedBase('N'); + if (m_hasFasta && + (pileupData.Position + i < m_references[pileupData.RefId].RefLength)) { + if (!m_fasta.GetBase(pileupData.RefId, pileupData.Position + i, + deletedBase)) { + std::cerr << "bamtools convert ERROR: pileup conversion - could not " + "read reference base from FASTA file" + << std::endl; + return; + } + } + bases << (ba.IsReverseStrand() ? (char)tolower(deletedBase) + : (char)toupper(deletedBase)); + } + } + } + + // otherwise, DELETION + else + bases << '*'; + + // if end of read segment + if (pa.IsSegmentEnd) bases << '$'; + + // store current base quality + baseQualities << ba.Qualities.at(pa.PositionInAlignment); + + // save alignment map quality if desired + if (m_isPrintingMapQualities) + mapQualities << (((int)ba.MapQuality > 93) ? 
(char)126 + : (char)((int)ba.MapQuality + 33)); + } + + // ---------------------- + // print results + + // tab-delimited + // <refName> <1-based pos> <refBase> <numberAlleles> <bases> <qualities> [mapQuals] + + const std::string TAB(1, '\t'); + *m_out << referenceName << TAB << position + 1 << TAB << referenceBase << TAB << numberAlleles + << TAB << bases.str() << TAB << baseQualities.str() << TAB << mapQualities.str() + << std::endl; +} diff --git a/src/toolkit/bamtools_convert.h b/src/toolkit/bamtools_convert.h new file mode 100644 index 0000000..d981963 --- /dev/null +++ b/src/toolkit/bamtools_convert.h @@ -0,0 +1,38 @@ +// *************************************************************************** +// bamtools_convert.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 9 July 2010 +// --------------------------------------------------------------------------- +// Converts between BAM and a number of other formats +// *************************************************************************** + +#ifndef BAMTOOLS_CONVERT_H +#define BAMTOOLS_CONVERT_H + +#include "bamtools_tool.h" + +namespace BamTools { + +class ConvertTool : public AbstractTool +{ + +public: + ConvertTool(); + ~ConvertTool(); + +public: + int Help(); + int Run(int argc, char* argv[]); + +private: + struct ConvertSettings; + ConvertSettings* m_settings; + + struct ConvertToolPrivate; + ConvertToolPrivate* m_impl; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_CONVERT_H diff --git a/src/toolkit/bamtools_count.cpp b/src/toolkit/bamtools_count.cpp new file mode 100644 index 0000000..95c5edc --- /dev/null +++ b/src/toolkit/bamtools_count.cpp @@ -0,0 +1,228 @@ +// *************************************************************************** +// bamtools_count.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// 
--------------------------------------------------------------------------- +// Last modified: 10 December 2012 +// --------------------------------------------------------------------------- +// Prints alignment count for BAM file(s) +// *************************************************************************** + +#include "bamtools_count.h" + +#include <api/BamAlgorithms.h> +#include <api/BamMultiReader.h> +#include <utils/bamtools_options.h> +#include <utils/bamtools_utilities.h> +using namespace BamTools; + +#include <fstream> +#include <iostream> +#include <string> +#include <vector> + +// --------------------------------------------- +// CountSettings implementation + +struct CountTool::CountSettings +{ + + // flags + bool HasInput; + bool HasInputFilelist; + bool HasRegion; + + // filenames + std::vector<std::string> InputFiles; + std::string InputFilelist; + std::string Region; + + // constructor + CountSettings() + : HasInput(false) + , HasInputFilelist(false) + , HasRegion(false) + {} +}; + +// --------------------------------------------- +// CountToolPrivate implementation + +struct CountTool::CountToolPrivate +{ + + // ctor & dtro +public: + CountToolPrivate(CountTool::CountSettings* settings) + : m_settings(settings) + {} + + ~CountToolPrivate() {} + + // interface +public: + bool Run(); + + // data members +private: + CountTool::CountSettings* m_settings; +}; + +bool CountTool::CountToolPrivate::Run() +{ + + // set to default input if none provided + if (!m_settings->HasInput && !m_settings->HasInputFilelist) + m_settings->InputFiles.push_back(Options::StandardIn()); + + // add files in the filelist to the input file list + if (m_settings->HasInputFilelist) { + + std::ifstream filelist(m_settings->InputFilelist.c_str(), std::ios::in); + if (!filelist.is_open()) { + std::cerr << "bamtools count ERROR: could not open input BAM file list... Aborting." 
+ << std::endl; + return false; + } + + std::string line; + while (std::getline(filelist, line)) + m_settings->InputFiles.push_back(line); + } + + // open reader without index + BamMultiReader reader; + if (!reader.Open(m_settings->InputFiles)) { + std::cerr << "bamtools count ERROR: could not open input BAM file(s)... Aborting." + << std::endl; + return false; + } + + // alignment counter + BamAlignment al; + int alignmentCount(0); + + // if no region specified, count entire file + if (!m_settings->HasRegion) { + while (reader.GetNextAlignmentCore(al)) + ++alignmentCount; + } + + // otherwise attempt to use region as constraint + else { + + // if region string parses OK + BamRegion region; + if (Utilities::ParseRegionString(m_settings->Region, reader, region)) { + + // attempt to find index files + reader.LocateIndexes(); + + // if index data available for all BAM files, we can use SetRegion + if (reader.HasIndexes()) { + + // attempt to set region on reader + if (!reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, + region.RightPosition)) { + std::cerr << "bamtools count ERROR: set region failed. 
Check that REGION " + "describes a valid range" + << std::endl; + reader.Close(); + return false; + } + + // everything checks out, just iterate through specified region, counting alignments + while (reader.GetNextAlignmentCore(al)) + ++alignmentCount; + } + + // no index data available, we have to iterate through until we + // find overlapping alignments + else { + while (reader.GetNextAlignmentCore(al)) { + if ((al.RefID >= region.LeftRefID) && + ((al.Position + al.Length) >= region.LeftPosition) && + (al.RefID <= region.RightRefID) && (al.Position <= region.RightPosition)) { + ++alignmentCount; + } + } + } + } + + // error parsing REGION string + else { + std::cerr << "bamtools count ERROR: could not parse REGION - " << m_settings->Region + << std::endl; + std::cerr << "Check that REGION is in valid format (see documentation) and that the " + "coordinates are valid" + << std::endl; + reader.Close(); + return false; + } + } + + // print results + std::cout << alignmentCount << std::endl; + + // clean up & exit + reader.Close(); + return true; +} + +// --------------------------------------------- +// CountTool implementation + +CountTool::CountTool() + : AbstractTool() + , m_settings(new CountSettings) + , m_impl(0) +{ + // set program details + Options::SetProgramInfo( + "bamtools count", "prints number of alignments in BAM file(s)", + "[-in <filename> -in <filename> ... | -list <filelist>] [-region <REGION>]"); + + // set up options + OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); + Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", + m_settings->HasInput, m_settings->InputFiles, IO_Opts, + Options::StandardIn()); + Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "", + m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts); + Options::AddValueOption("-region", "REGION", + "genomic region. 
Index file is recommended for better performance, and " + "is used automatically if it exists. See \'bamtools help index\' for " + "more details on creating one", + "", m_settings->HasRegion, m_settings->Region, IO_Opts); +} + +CountTool::~CountTool() +{ + + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int CountTool::Help() +{ + Options::DisplayHelp(); + return 0; +} + +int CountTool::Run(int argc, char* argv[]) +{ + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // initialize CountTool with settings + m_impl = new CountToolPrivate(m_settings); + + // run CountTool, return success/fail + if (m_impl->Run()) + return 0; + else + return 1; +} diff --git a/src/toolkit/bamtools_count.h b/src/toolkit/bamtools_count.h new file mode 100644 index 0000000..57de0f9 --- /dev/null +++ b/src/toolkit/bamtools_count.h @@ -0,0 +1,38 @@ +// *************************************************************************** +// bamtools_count.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 7 April 2011 +// --------------------------------------------------------------------------- +// Prints alignment count for BAM file(s) +// *************************************************************************** + +#ifndef BAMTOOLS_COUNT_H +#define BAMTOOLS_COUNT_H + +#include "bamtools_tool.h" + +namespace BamTools { + +class CountTool : public AbstractTool +{ + +public: + CountTool(); + ~CountTool(); + +public: + int Help(); + int Run(int argc, char* argv[]); + +private: + struct CountSettings; + CountSettings* m_settings; + + struct CountToolPrivate; + CountToolPrivate* m_impl; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_COUNT_H diff --git a/src/toolkit/bamtools_coverage.cpp b/src/toolkit/bamtools_coverage.cpp new file mode 100644 index 0000000..aaf1de4 --- /dev/null +++ 
b/src/toolkit/bamtools_coverage.cpp @@ -0,0 +1,207 @@ +// *************************************************************************** +// bamtools_coverage.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 24 July 2013 +// --------------------------------------------------------------------------- +// Prints coverage data for a single BAM file +// *************************************************************************** + +#include "bamtools_coverage.h" + +#include <api/BamReader.h> +#include <utils/bamtools_options.h> +#include <utils/bamtools_pileup_engine.h> +using namespace BamTools; + +#include <fstream> +#include <iostream> +#include <string> +#include <vector> + +namespace BamTools { + +// --------------------------------------------- +// CoverageVisitor implementation + +class CoverageVisitor : public PileupVisitor +{ + +public: + CoverageVisitor(const RefVector& references, std::ostream* out) + : PileupVisitor() + , m_references(references) + , m_out(out) + {} + ~CoverageVisitor() {} + + // PileupVisitor interface implementation +public: + // prints coverage results ( tab-delimited ) + void Visit(const PileupPosition& pileupData) + { + *m_out << m_references[pileupData.RefId].RefName << '\t' << pileupData.Position << '\t' + << pileupData.PileupAlignments.size() << std::endl; + } + +private: + RefVector m_references; + std::ostream* m_out; +}; + +} // namespace BamTools + +// --------------------------------------------- +// CoverageSettings implementation + +struct CoverageTool::CoverageSettings +{ + + // flags + bool HasInputFile; + bool HasOutputFile; + + // filenames + std::string InputBamFilename; + std::string OutputFilename; + + // constructor + CoverageSettings() + : HasInputFile(false) + , HasOutputFile(false) + , InputBamFilename(Options::StandardIn()) + , OutputFilename(Options::StandardOut()) + {} +}; 
+ +// --------------------------------------------- +// CoverageToolPrivate implementation + +struct CoverageTool::CoverageToolPrivate +{ + + // ctor & dtor +public: + CoverageToolPrivate(CoverageTool::CoverageSettings* settings) + : m_settings(settings) + , m_out(std::cout.rdbuf()) + {} + + ~CoverageToolPrivate() {} + + // interface +public: + bool Run(); + + // data members +private: + CoverageTool::CoverageSettings* m_settings; + std::ostream m_out; + RefVector m_references; +}; + +bool CoverageTool::CoverageToolPrivate::Run() +{ + + // if output filename given + std::ofstream outFile; + if (m_settings->HasOutputFile) { + + // open output file stream + outFile.open(m_settings->OutputFilename.c_str()); + if (!outFile) { + std::cerr << "bamtools coverage ERROR: could not open " << m_settings->OutputFilename + << " for output" << std::endl; + return false; + } + + // set m_out to file's streambuf + m_out.rdbuf(outFile.rdbuf()); + } + + //open our BAM reader + BamReader reader; + if (!reader.Open(m_settings->InputBamFilename)) { + std::cerr << "bamtools coverage ERROR: could not open input BAM file: " + << m_settings->InputBamFilename << std::endl; + return false; + } + + // retrieve references + m_references = reader.GetReferenceData(); + + // set up our output 'visitor' + CoverageVisitor* cv = new CoverageVisitor(m_references, &m_out); + + // set up pileup engine with 'visitor' + PileupEngine pileup; + pileup.AddVisitor(cv); + + // process input data + BamAlignment al; + while (reader.GetNextAlignment(al)) + pileup.AddAlignment(al); + pileup.Flush(); + + // clean up + reader.Close(); + if (m_settings->HasOutputFile) outFile.close(); + delete cv; + cv = 0; + + // return success + return true; +} + +// --------------------------------------------- +// CoverageTool implementation + +CoverageTool::CoverageTool() + : AbstractTool() + , m_settings(new CoverageSettings) + , m_impl(0) +{ + // set program details + Options::SetProgramInfo("bamtools coverage", "prints 
coverage data for a single BAM file", + "[-in <filename>] [-out <filename>]"); + + // set up options + OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); + Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", + m_settings->HasInputFile, m_settings->InputBamFilename, IO_Opts, + Options::StandardIn()); + Options::AddValueOption("-out", "filename", "the output file", "", m_settings->HasOutputFile, + m_settings->OutputFilename, IO_Opts, Options::StandardOut()); +} + +CoverageTool::~CoverageTool() +{ + + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int CoverageTool::Help() +{ + Options::DisplayHelp(); + return 0; +} + +int CoverageTool::Run(int argc, char* argv[]) +{ + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // initialize CoverageTool with settings + m_impl = new CoverageToolPrivate(m_settings); + + // run CoverageTool, return success/fail + if (m_impl->Run()) + return 0; + else + return 1; +} diff --git a/src/toolkit/bamtools_coverage.h b/src/toolkit/bamtools_coverage.h new file mode 100644 index 0000000..df5cadc --- /dev/null +++ b/src/toolkit/bamtools_coverage.h @@ -0,0 +1,38 @@ +// *************************************************************************** +// bamtools_coverage.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 1 August 2010 +// --------------------------------------------------------------------------- +// Prints coverage data for a single BAM file +// *************************************************************************** + +#ifndef BAMTOOLS_COVERAGE_H +#define BAMTOOLS_COVERAGE_H + +#include "bamtools_tool.h" + +namespace BamTools { + +class CoverageTool : public AbstractTool +{ + +public: + CoverageTool(); + ~CoverageTool(); + +public: + int Help(); + int Run(int argc, char* argv[]); + +private: + struct 
CoverageSettings; + CoverageSettings* m_settings; + + struct CoverageToolPrivate; + CoverageToolPrivate* m_impl; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_COVERAGE_H diff --git a/src/toolkit/bamtools_filter.cpp b/src/toolkit/bamtools_filter.cpp new file mode 100644 index 0000000..46b6402 --- /dev/null +++ b/src/toolkit/bamtools_filter.cpp @@ -0,0 +1,1048 @@ +// *************************************************************************** +// bamtools_filter.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 3 May 2013 +// --------------------------------------------------------------------------- +// Filters BAM file(s) according to some user-specified criteria +// *************************************************************************** + +#include "bamtools_filter.h" + +#include <api/BamMultiReader.h> +#include <api/BamWriter.h> +#include <utils/bamtools_filter_engine.h> +#include <utils/bamtools_options.h> +#include <utils/bamtools_utilities.h> +using namespace BamTools; + +#include <json/json.h> +using namespace Json; + +#include <cstdio> +#include <fstream> +#include <iostream> +#include <sstream> +#include <string> +#include <vector> + +namespace BamTools { + +// ------------------------------- +// string literal constants + +// property names +const std::string ALIGNMENTFLAG_PROPERTY = "alignmentFlag"; +const std::string CIGAR_PROPERTY = "cigar"; +const std::string INSERTSIZE_PROPERTY = "insertSize"; +const std::string ISDUPLICATE_PROPERTY = "isDuplicate"; +const std::string ISFAILEDQC_PROPERTY = "isFailedQC"; +const std::string ISFIRSTMATE_PROPERTY = "isFirstMate"; +const std::string ISMAPPED_PROPERTY = "isMapped"; +const std::string ISMATEMAPPED_PROPERTY = "isMateMapped"; +const std::string ISMATEREVERSESTRAND_PROPERTY = "isMateReverseStrand"; +const std::string ISPAIRED_PROPERTY = "isPaired"; +const 
std::string ISPRIMARYALIGNMENT_PROPERTY = "isPrimaryAlignment"; +const std::string ISPROPERPAIR_PROPERTY = "isProperPair"; +const std::string ISREVERSESTRAND_PROPERTY = "isReverseStrand"; +const std::string ISSECONDMATE_PROPERTY = "isSecondMate"; +const std::string ISSINGLETON_PROPERTY = "isSingleton"; +const std::string LENGTH_PROPERTY = "length"; +const std::string MAPQUALITY_PROPERTY = "mapQuality"; +const std::string MATEPOSITION_PROPERTY = "matePosition"; +const std::string MATEREFERENCE_PROPERTY = "mateReference"; +const std::string NAME_PROPERTY = "name"; +const std::string POSITION_PROPERTY = "position"; +const std::string QUERYBASES_PROPERTY = "queryBases"; +const std::string REFERENCE_PROPERTY = "reference"; +const std::string TAG_PROPERTY = "tag"; + +// boolalpha +const std::string TRUE_STR = "true"; +const std::string FALSE_STR = "false"; + +RefVector filterToolReferences; + +struct BamAlignmentChecker +{ + bool check(const PropertyFilter& filter, const BamAlignment& al) + { + + bool keepAlignment = true; + const PropertyMap& properties = filter.Properties; + PropertyMap::const_iterator propertyIter = properties.begin(); + PropertyMap::const_iterator propertyEnd = properties.end(); + for (; propertyIter != propertyEnd; ++propertyIter) { + + // check alignment data field depending on propertyName + const std::string& propertyName = (*propertyIter).first; + const PropertyFilterValue& valueFilter = (*propertyIter).second; + + if (propertyName == ALIGNMENTFLAG_PROPERTY) + keepAlignment &= valueFilter.check(al.AlignmentFlag); + else if (propertyName == CIGAR_PROPERTY) { + std::stringstream cigarSs; + const std::vector<CigarOp>& cigarData = al.CigarData; + if (!cigarData.empty()) { + std::vector<CigarOp>::const_iterator cigarBegin = cigarData.begin(); + std::vector<CigarOp>::const_iterator cigarIter = cigarBegin; + std::vector<CigarOp>::const_iterator cigarEnd = cigarData.end(); + for (; cigarIter != cigarEnd; ++cigarIter) { + const CigarOp& op = 
(*cigarIter); + cigarSs << op.Length << op.Type; + } + keepAlignment &= valueFilter.check(cigarSs.str()); + } + } else if (propertyName == INSERTSIZE_PROPERTY) + keepAlignment &= valueFilter.check(al.InsertSize); + else if (propertyName == ISDUPLICATE_PROPERTY) + keepAlignment &= valueFilter.check(al.IsDuplicate()); + else if (propertyName == ISFAILEDQC_PROPERTY) + keepAlignment &= valueFilter.check(al.IsFailedQC()); + else if (propertyName == ISFIRSTMATE_PROPERTY) + keepAlignment &= valueFilter.check(al.IsFirstMate()); + else if (propertyName == ISMAPPED_PROPERTY) + keepAlignment &= valueFilter.check(al.IsMapped()); + else if (propertyName == ISMATEMAPPED_PROPERTY) + keepAlignment &= valueFilter.check(al.IsMateMapped()); + else if (propertyName == ISMATEREVERSESTRAND_PROPERTY) + keepAlignment &= valueFilter.check(al.IsMateReverseStrand()); + else if (propertyName == ISPAIRED_PROPERTY) + keepAlignment &= valueFilter.check(al.IsPaired()); + else if (propertyName == ISPRIMARYALIGNMENT_PROPERTY) + keepAlignment &= valueFilter.check(al.IsPrimaryAlignment()); + else if (propertyName == ISPROPERPAIR_PROPERTY) + keepAlignment &= valueFilter.check(al.IsProperPair()); + else if (propertyName == ISREVERSESTRAND_PROPERTY) + keepAlignment &= valueFilter.check(al.IsReverseStrand()); + else if (propertyName == ISSECONDMATE_PROPERTY) + keepAlignment &= valueFilter.check(al.IsSecondMate()); + else if (propertyName == ISSINGLETON_PROPERTY) { + const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped(); + keepAlignment &= valueFilter.check(isSingleton); + } else if (propertyName == LENGTH_PROPERTY) + keepAlignment &= valueFilter.check(al.Length); + else if (propertyName == MAPQUALITY_PROPERTY) + keepAlignment &= valueFilter.check(al.MapQuality); + else if (propertyName == MATEPOSITION_PROPERTY) + keepAlignment &= + (al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID)); + else if (propertyName == MATEREFERENCE_PROPERTY) { + if (!al.IsPaired() || 
!al.IsMateMapped()) return false; + BAMTOOLS_ASSERT_MESSAGE( + (al.MateRefID >= 0 && (al.MateRefID < (int)filterToolReferences.size())), + "Invalid MateRefID"); + const std::string& refName = filterToolReferences.at(al.MateRefID).RefName; + keepAlignment &= valueFilter.check(refName); + } else if (propertyName == NAME_PROPERTY) + keepAlignment &= valueFilter.check(al.Name); + else if (propertyName == POSITION_PROPERTY) + keepAlignment &= valueFilter.check(al.Position); + else if (propertyName == QUERYBASES_PROPERTY) + keepAlignment &= valueFilter.check(al.QueryBases); + else if (propertyName == REFERENCE_PROPERTY) { + BAMTOOLS_ASSERT_MESSAGE( + (al.RefID >= 0 && (al.RefID < (int)filterToolReferences.size())), + "Invalid RefID"); + const std::string& refName = filterToolReferences.at(al.RefID).RefName; + keepAlignment &= valueFilter.check(refName); + } else if (propertyName == TAG_PROPERTY) + keepAlignment &= checkAlignmentTag(valueFilter, al); + else + BAMTOOLS_ASSERT_UNREACHABLE; + + // if alignment fails at ANY point, just quit and return false + if (!keepAlignment) return false; + } + + BAMTOOLS_ASSERT_MESSAGE( + keepAlignment, "Error in BamAlignmentChecker... 
keepAlignment should be true here"); + return keepAlignment; + } + + bool checkAlignmentTag(const PropertyFilterValue& valueFilter, const BamAlignment& al) + { + + // ensure filter contains string data + Variant entireTagFilter = valueFilter.Value; + if (!entireTagFilter.is_type<std::string>()) return false; + + // localize string from variant + const std::string& entireTagFilterString = entireTagFilter.get<std::string>(); + + // ensure we have at least "XX:x" + if (entireTagFilterString.length() < 4) return false; + + // get tagName & lookup in alignment + // if found, set tagType to tag type character + // if not found, return false + const std::string& tagName = entireTagFilterString.substr(0, 2); + char tagType = '\0'; + if (!al.GetTagType(tagName, tagType)) return false; + + // remove tagName & ':' from beginning tagFilter + std::string tagFilterString = entireTagFilterString.substr(3); + + // switch on tag type to set tag query value & parse filter token + int8_t asciiFilterValue, asciiQueryValue; + int32_t intFilterValue, intQueryValue; + uint32_t uintFilterValue, uintQueryValue; + float realFilterValue, realQueryValue; + std::string stringFilterValue, stringQueryValue; + + PropertyFilterValue tagFilter; + PropertyFilterValue::ValueCompareType compareType; + bool keepAlignment = false; + switch (tagType) { + + // ASCII tag type + case 'A': + if (al.GetTag(tagName, asciiQueryValue)) { + if (FilterEngine<BamAlignmentChecker>::parseToken( + tagFilterString, asciiFilterValue, compareType)) { + tagFilter.Value = asciiFilterValue; + tagFilter.Type = compareType; + keepAlignment = tagFilter.check(asciiQueryValue); + } + } + break; + + // signed int tag type + case 'c': + case 's': + case 'i': + if (al.GetTag(tagName, intQueryValue)) { + if (FilterEngine<BamAlignmentChecker>::parseToken( + tagFilterString, intFilterValue, compareType)) { + tagFilter.Value = intFilterValue; + tagFilter.Type = compareType; + keepAlignment = tagFilter.check(intQueryValue); + } + } + 
break; + + // unsigned int tag type + case 'C': + case 'S': + case 'I': + if (al.GetTag(tagName, uintQueryValue)) { + if (FilterEngine<BamAlignmentChecker>::parseToken( + tagFilterString, uintFilterValue, compareType)) { + tagFilter.Value = uintFilterValue; + tagFilter.Type = compareType; + keepAlignment = tagFilter.check(uintQueryValue); + } + } + break; + + // 'real' tag type + case 'f': + if (al.GetTag(tagName, realQueryValue)) { + if (FilterEngine<BamAlignmentChecker>::parseToken( + tagFilterString, realFilterValue, compareType)) { + tagFilter.Value = realFilterValue; + tagFilter.Type = compareType; + keepAlignment = tagFilter.check(realQueryValue); + } + } + break; + + // string tag type + + case 'Z': + case 'H': + if (al.GetTag(tagName, stringQueryValue)) { + if (FilterEngine<BamAlignmentChecker>::parseToken( + tagFilterString, stringFilterValue, compareType)) { + tagFilter.Value = stringFilterValue; + tagFilter.Type = compareType; + keepAlignment = tagFilter.check(stringQueryValue); + } + } + break; + + // unknown tag type + default: + keepAlignment = false; + } + + return keepAlignment; + } +}; + +} // namespace BamTools + +// --------------------------------------------- +// FilterSettings implementation + +struct FilterTool::FilterSettings +{ + + // ---------------------------------- + // IO opts + + // flags + bool HasInput; + bool HasInputFilelist; + bool HasOutput; + bool HasRegion; + bool HasScript; + bool IsForceCompression; + + // filenames + std::vector<std::string> InputFiles; + std::string InputFilelist; + std::string OutputFilename; + std::string Region; + std::string ScriptFilename; + + // ----------------------------------- + // General filter opts + + // flags + bool HasAlignmentFlagFilter; + bool HasInsertSizeFilter; + bool HasLengthFilter; + bool HasMapQualityFilter; + bool HasNameFilter; + bool HasQueryBasesFilter; + bool HasTagFilter; //(s) + + // filters + std::string AlignmentFlagFilter; + std::string InsertSizeFilter; + std::string 
LengthFilter; + std::string MapQualityFilter; + std::string NameFilter; + std::string QueryBasesFilter; + std::string TagFilter; // support multiple ? + + // ----------------------------------- + // AlignmentFlag filter opts + + // flags + bool HasIsDuplicateFilter; + bool HasIsFailedQCFilter; + bool HasIsFirstMateFilter; + bool HasIsMappedFilter; + bool HasIsMateMappedFilter; + bool HasIsMateReverseStrandFilter; + bool HasIsPairedFilter; + bool HasIsPrimaryAlignmentFilter; + bool HasIsProperPairFilter; + bool HasIsReverseStrandFilter; + bool HasIsSecondMateFilter; + bool HasIsSingletonFilter; + + // filters + std::string IsDuplicateFilter; + std::string IsFailedQCFilter; + std::string IsFirstMateFilter; + std::string IsMappedFilter; + std::string IsMateMappedFilter; + std::string IsMateReverseStrandFilter; + std::string IsPairedFilter; + std::string IsPrimaryAlignmentFilter; + std::string IsProperPairFilter; + std::string IsReverseStrandFilter; + std::string IsSecondMateFilter; + std::string IsSingletonFilter; + + // --------------------------------- + // constructor + + FilterSettings() + : HasInput(false) + , HasInputFilelist(false) + , HasOutput(false) + , HasRegion(false) + , HasScript(false) + , IsForceCompression(false) + , OutputFilename(Options::StandardOut()) + , HasAlignmentFlagFilter(false) + , HasInsertSizeFilter(false) + , HasLengthFilter(false) + , HasMapQualityFilter(false) + , HasNameFilter(false) + , HasQueryBasesFilter(false) + , HasTagFilter(false) + , HasIsDuplicateFilter(false) + , HasIsFailedQCFilter(false) + , HasIsFirstMateFilter(false) + , HasIsMappedFilter(false) + , HasIsMateMappedFilter(false) + , HasIsMateReverseStrandFilter(false) + , HasIsPairedFilter(false) + , HasIsPrimaryAlignmentFilter(false) + , HasIsProperPairFilter(false) + , HasIsReverseStrandFilter(false) + , HasIsSecondMateFilter(false) + , HasIsSingletonFilter(false) + , IsDuplicateFilter(TRUE_STR) + , IsFailedQCFilter(TRUE_STR) + , IsFirstMateFilter(TRUE_STR) + , 
IsMappedFilter(TRUE_STR) + , IsMateMappedFilter(TRUE_STR) + , IsMateReverseStrandFilter(TRUE_STR) + , IsPairedFilter(TRUE_STR) + , IsPrimaryAlignmentFilter(TRUE_STR) + , IsProperPairFilter(TRUE_STR) + , IsReverseStrandFilter(TRUE_STR) + , IsSecondMateFilter(TRUE_STR) + , IsSingletonFilter(TRUE_STR) + {} +}; + +// --------------------------------------------- +// FilterToolPrivate declaration + +class FilterTool::FilterToolPrivate +{ + + // ctor & dtor +public: + FilterToolPrivate(FilterTool::FilterSettings* settings); + ~FilterToolPrivate(); + + // 'public' interface +public: + bool Run(); + + // internal methods +private: + bool AddPropertyTokensToFilter(const std::string& filterName, + const std::map<std::string, std::string>& propertyTokens); + bool CheckAlignment(const BamAlignment& al); + const std::string GetScriptContents(); + void InitProperties(); + bool ParseCommandLine(); + bool ParseFilterObject(const std::string& filterName, const Json::Value& filterObject); + bool ParseScript(); + bool SetupFilters(); + + // data members +private: + std::vector<std::string> m_propertyNames; + FilterTool::FilterSettings* m_settings; + FilterEngine<BamAlignmentChecker> m_filterEngine; +}; + +// --------------------------------------------- +// FilterToolPrivate implementation + +// constructor +FilterTool::FilterToolPrivate::FilterToolPrivate(FilterTool::FilterSettings* settings) + : m_settings(settings) +{} + +// destructor +FilterTool::FilterToolPrivate::~FilterToolPrivate() {} + +bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter( + const std::string& filterName, const std::map<std::string, std::string>& propertyTokens) +{ + // dummy temp values for token parsing + bool boolValue; + int32_t int32Value; + uint16_t uint16Value; + uint32_t uint32Value; + std::string stringValue; + PropertyFilterValue::ValueCompareType type; + + // iterate over property token map + std::map<std::string, std::string>::const_iterator mapIter = propertyTokens.begin(); + 
std::map<std::string, std::string>::const_iterator mapEnd = propertyTokens.end(); + for (; mapIter != mapEnd; ++mapIter) { + + const std::string& propertyName = (*mapIter).first; + const std::string& token = (*mapIter).second; + + // ------------------------------ + // convert token to value & compare type + // then add to filter engine + + // bool conversion + if (propertyName == ISDUPLICATE_PROPERTY || propertyName == ISFAILEDQC_PROPERTY || + propertyName == ISFIRSTMATE_PROPERTY || propertyName == ISMAPPED_PROPERTY || + propertyName == ISMATEMAPPED_PROPERTY || propertyName == ISMATEREVERSESTRAND_PROPERTY || + propertyName == ISPAIRED_PROPERTY || propertyName == ISPRIMARYALIGNMENT_PROPERTY || + propertyName == ISPROPERPAIR_PROPERTY || propertyName == ISREVERSESTRAND_PROPERTY || + propertyName == ISSECONDMATE_PROPERTY || propertyName == ISSINGLETON_PROPERTY) { + FilterEngine<BamAlignmentChecker>::parseToken(token, boolValue, type); + m_filterEngine.setProperty(filterName, propertyName, boolValue, type); + } + + // int32_t conversion + else if (propertyName == INSERTSIZE_PROPERTY || propertyName == LENGTH_PROPERTY || + propertyName == MATEPOSITION_PROPERTY || propertyName == POSITION_PROPERTY) { + FilterEngine<BamAlignmentChecker>::parseToken(token, int32Value, type); + m_filterEngine.setProperty(filterName, propertyName, int32Value, type); + } + + // uint16_t conversion + else if (propertyName == MAPQUALITY_PROPERTY) { + FilterEngine<BamAlignmentChecker>::parseToken(token, uint16Value, type); + m_filterEngine.setProperty(filterName, propertyName, uint16Value, type); + } + + // uint32_t conversion + else if (propertyName == ALIGNMENTFLAG_PROPERTY) { + FilterEngine<BamAlignmentChecker>::parseToken(token, uint32Value, type); + m_filterEngine.setProperty(filterName, propertyName, uint32Value, type); + } + + // string conversion + else if (propertyName == CIGAR_PROPERTY || propertyName == MATEREFERENCE_PROPERTY || + propertyName == NAME_PROPERTY || propertyName == 
QUERYBASES_PROPERTY || + propertyName == REFERENCE_PROPERTY) { + FilterEngine<BamAlignmentChecker>::parseToken(token, stringValue, type); + m_filterEngine.setProperty(filterName, propertyName, stringValue, type); + } + + else if (propertyName == TAG_PROPERTY) { + // this will be stored directly as the TAG:VALUE token + // (VALUE may contain compare ops, will be parsed out later) + m_filterEngine.setProperty(filterName, propertyName, token, PropertyFilterValue::EXACT); + } + + // else unknown property + else { + std::cerr << "bamtools filter ERROR: unknown property - " << propertyName << std::endl; + return false; + } + } + return true; +} + +bool FilterTool::FilterToolPrivate::CheckAlignment(const BamAlignment& al) +{ + return m_filterEngine.check(al); +} + +const std::string FilterTool::FilterToolPrivate::GetScriptContents() +{ + + // open script for reading + FILE* inFile = fopen(m_settings->ScriptFilename.c_str(), "rb"); + if (!inFile) { + std::cerr << "bamtools filter ERROR: could not open script: " << m_settings->ScriptFilename + << " for reading" << std::endl; + return std::string(); + } + + // read in entire script contents + char buffer[1024]; + std::ostringstream docStream; + while (true) { + + // peek ahead, make sure there is data available + char ch = fgetc(inFile); + ungetc(ch, inFile); + if (feof(inFile)) break; + + // read next block of data + if (fgets(buffer, 1024, inFile) == 0) { + std::cerr << "bamtools filter ERROR: could not read script contents" << std::endl; + return std::string(); + } + + docStream << buffer; + } + + // close script file + fclose(inFile); + + // import buffer contents to document, return + return docStream.str(); +} + +void FilterTool::FilterToolPrivate::InitProperties() +{ + + // store property names in vector + m_propertyNames.push_back(ALIGNMENTFLAG_PROPERTY); + m_propertyNames.push_back(CIGAR_PROPERTY); + m_propertyNames.push_back(INSERTSIZE_PROPERTY); + m_propertyNames.push_back(ISDUPLICATE_PROPERTY); + 
m_propertyNames.push_back(ISFAILEDQC_PROPERTY); + m_propertyNames.push_back(ISFIRSTMATE_PROPERTY); + m_propertyNames.push_back(ISMAPPED_PROPERTY); + m_propertyNames.push_back(ISMATEMAPPED_PROPERTY); + m_propertyNames.push_back(ISMATEREVERSESTRAND_PROPERTY); + m_propertyNames.push_back(ISPAIRED_PROPERTY); + m_propertyNames.push_back(ISPRIMARYALIGNMENT_PROPERTY); + m_propertyNames.push_back(ISPROPERPAIR_PROPERTY); + m_propertyNames.push_back(ISREVERSESTRAND_PROPERTY); + m_propertyNames.push_back(ISSECONDMATE_PROPERTY); + m_propertyNames.push_back(ISSINGLETON_PROPERTY); + m_propertyNames.push_back(LENGTH_PROPERTY); + m_propertyNames.push_back(MAPQUALITY_PROPERTY); + m_propertyNames.push_back(MATEPOSITION_PROPERTY); + m_propertyNames.push_back(MATEREFERENCE_PROPERTY); + m_propertyNames.push_back(NAME_PROPERTY); + m_propertyNames.push_back(POSITION_PROPERTY); + m_propertyNames.push_back(QUERYBASES_PROPERTY); + m_propertyNames.push_back(REFERENCE_PROPERTY); + m_propertyNames.push_back(TAG_PROPERTY); + + // add vector contents to FilterEngine<BamAlignmentChecker> + std::vector<std::string>::const_iterator propertyNameIter = m_propertyNames.begin(); + std::vector<std::string>::const_iterator propertyNameEnd = m_propertyNames.end(); + for (; propertyNameIter != propertyNameEnd; ++propertyNameIter) + m_filterEngine.addProperty((*propertyNameIter)); +} + +bool FilterTool::FilterToolPrivate::ParseCommandLine() +{ + + // add a rule set to filter engine + const std::string CMD = "COMMAND_LINE"; + m_filterEngine.addFilter(CMD); + + // map property names to command line args + std::map<std::string, std::string> propertyTokens; + if (m_settings->HasAlignmentFlagFilter) + propertyTokens.insert(make_pair(ALIGNMENTFLAG_PROPERTY, m_settings->AlignmentFlagFilter)); + if (m_settings->HasInsertSizeFilter) + propertyTokens.insert(make_pair(INSERTSIZE_PROPERTY, m_settings->InsertSizeFilter)); + if (m_settings->HasIsDuplicateFilter) + propertyTokens.insert(make_pair(ISDUPLICATE_PROPERTY, 
m_settings->IsDuplicateFilter)); + if (m_settings->HasIsFailedQCFilter) + propertyTokens.insert(make_pair(ISFAILEDQC_PROPERTY, m_settings->IsFailedQCFilter)); + if (m_settings->HasIsFirstMateFilter) + propertyTokens.insert(make_pair(ISFIRSTMATE_PROPERTY, m_settings->IsFirstMateFilter)); + if (m_settings->HasIsMappedFilter) + propertyTokens.insert(make_pair(ISMAPPED_PROPERTY, m_settings->IsMappedFilter)); + if (m_settings->HasIsMateMappedFilter) + propertyTokens.insert(make_pair(ISMATEMAPPED_PROPERTY, m_settings->IsMateMappedFilter)); + if (m_settings->HasIsMateReverseStrandFilter) + propertyTokens.insert( + make_pair(ISMATEREVERSESTRAND_PROPERTY, m_settings->IsMateReverseStrandFilter)); + if (m_settings->HasIsPairedFilter) + propertyTokens.insert(make_pair(ISPAIRED_PROPERTY, m_settings->IsPairedFilter)); + if (m_settings->HasIsPrimaryAlignmentFilter) + propertyTokens.insert( + make_pair(ISPRIMARYALIGNMENT_PROPERTY, m_settings->IsPrimaryAlignmentFilter)); + if (m_settings->HasIsProperPairFilter) + propertyTokens.insert(make_pair(ISPROPERPAIR_PROPERTY, m_settings->IsProperPairFilter)); + if (m_settings->HasIsReverseStrandFilter) + propertyTokens.insert( + make_pair(ISREVERSESTRAND_PROPERTY, m_settings->IsReverseStrandFilter)); + if (m_settings->HasIsSecondMateFilter) + propertyTokens.insert(make_pair(ISSECONDMATE_PROPERTY, m_settings->IsSecondMateFilter)); + if (m_settings->HasIsSingletonFilter) + propertyTokens.insert(make_pair(ISSINGLETON_PROPERTY, m_settings->IsSingletonFilter)); + if (m_settings->HasLengthFilter) + propertyTokens.insert(make_pair(LENGTH_PROPERTY, m_settings->LengthFilter)); + if (m_settings->HasMapQualityFilter) + propertyTokens.insert(make_pair(MAPQUALITY_PROPERTY, m_settings->MapQualityFilter)); + if (m_settings->HasNameFilter) + propertyTokens.insert(make_pair(NAME_PROPERTY, m_settings->NameFilter)); + if (m_settings->HasQueryBasesFilter) + propertyTokens.insert(make_pair(QUERYBASES_PROPERTY, m_settings->QueryBasesFilter)); + if 
(m_settings->HasTagFilter) + propertyTokens.insert(make_pair(TAG_PROPERTY, m_settings->TagFilter)); + + // send add these properties to filter set "COMMAND_LINE" + return AddPropertyTokensToFilter(CMD, propertyTokens); +} + +bool FilterTool::FilterToolPrivate::ParseFilterObject(const std::string& filterName, + const Json::Value& filterObject) +{ + + // filter object parsing variables + Json::Value null(Json::nullValue); + Json::Value propertyValue; + + // store results + std::map<std::string, std::string> propertyTokens; + + // iterate over known properties + std::vector<std::string>::const_iterator propertyNameIter = m_propertyNames.begin(); + std::vector<std::string>::const_iterator propertyNameEnd = m_propertyNames.end(); + for (; propertyNameIter != propertyNameEnd; ++propertyNameIter) { + const std::string& propertyName = (*propertyNameIter); + + // if property defined in filter, add to token list + propertyValue = filterObject.get(propertyName, null); + if (propertyValue != null) + propertyTokens.insert(make_pair(propertyName, propertyValue.asString())); + } + + // add this filter to engin + m_filterEngine.addFilter(filterName); + + // add token list to this filter + return AddPropertyTokensToFilter(filterName, propertyTokens); +} + +bool FilterTool::FilterToolPrivate::ParseScript() +{ + + // read in script contents from file + const std::string document = GetScriptContents(); + std::istringstream sin(document); + + // set up JsonCPP reader and attempt to parse script + Json::Value root; + Json::CharReaderBuilder rbuilder; + std::string errs; + const bool ok = Json::parseFromStream(rbuilder, sin, &root, &errs); + if (!ok) { + // use built-in error reporting mechanism to alert user what was wrong with the script + std::cerr << "bamtools filter ERROR: failed to parse script - see error message(s) below" + << std::endl + << errs; + return false; + } + + // initialize return status + bool success = true; + + // see if root object contains multiple filters + const 
Json::Value filters = root["filters"]; + if (!filters.isNull()) { + + // iterate over any filters found + int filterIndex = 0; + Json::Value::const_iterator filtersIter = filters.begin(); + Json::Value::const_iterator filtersEnd = filters.end(); + for (; filtersIter != filtersEnd; ++filtersIter, ++filterIndex) { + Json::Value filter = (*filtersIter); + + // convert filter index to string + std::string filterName; + + // if id tag supplied + const Json::Value id = filter["id"]; + if (!id.isNull()) filterName = id.asString(); + + // use array index + else { + std::stringstream convert; + convert << filterIndex; + filterName = convert.str(); + } + + // create & parse filter + success &= ParseFilterObject(filterName, filter); + } + + // see if user defined a "rule" for these filters + // otherwise, use filter engine's default rule behavior + std::string ruleString; + const Json::Value rule = root["rule"]; + if (rule.isString()) ruleString = rule.asString(); + m_filterEngine.setRule(ruleString); + + // return success/fail + return success; + } + + // otherwise, root is the only filter (just contains properties) + // create & parse filter named "ROOT" + else + success = ParseFilterObject("ROOT", root); + + // return success/failure + return success; +} + +bool FilterTool::FilterToolPrivate::Run() +{ + + // set to default input if none provided + if (!m_settings->HasInput && !m_settings->HasInputFilelist) + m_settings->InputFiles.push_back(Options::StandardIn()); + + // add files in the filelist to the input file list + if (m_settings->HasInputFilelist) { + + std::ifstream filelist(m_settings->InputFilelist.c_str(), std::ios::in); + if (!filelist.is_open()) { + std::cerr << "bamtools filter ERROR: could not open input BAM file list... Aborting." 
+ << std::endl; + return false; + } + + std::string line; + while (std::getline(filelist, line)) + m_settings->InputFiles.push_back(line); + } + + // initialize defined properties & user-specified filters + // quit if failed + if (!SetupFilters()) return false; + + // open reader without index + BamMultiReader reader; + if (!reader.Open(m_settings->InputFiles)) { + std::cerr << "bamtools filter ERROR: could not open input files for reading." << std::endl; + return false; + } + + // retrieve reader header & reference data + const std::string headerText = reader.GetHeaderText(); + filterToolReferences = reader.GetReferenceData(); + + // determine compression mode for BamWriter + bool writeUncompressed = + (m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression); + BamWriter::CompressionMode compressionMode = BamWriter::Compressed; + if (writeUncompressed) compressionMode = BamWriter::Uncompressed; + + // open BamWriter + BamWriter writer; + writer.SetCompressionMode(compressionMode); + if (!writer.Open(m_settings->OutputFilename, headerText, filterToolReferences)) { + std::cerr << "bamtools filter ERROR: could not open " << m_settings->OutputFilename + << " for writing." 
<< std::endl; + reader.Close(); + return false; + } + + // if no region specified, filter entire file + BamAlignment al; + if (!m_settings->HasRegion) { + while (reader.GetNextAlignment(al)) { + if (CheckAlignment(al)) writer.SaveAlignment(al); + } + } + + // otherwise attempt to use region as constraint + else { + + // if region string parses OK + BamRegion region; + if (Utilities::ParseRegionString(m_settings->Region, reader, region)) { + + // attempt to find index files + reader.LocateIndexes(); + + // if index data available for all BAM files, we can use SetRegion + if (reader.HasIndexes()) { + + // attempt to use SetRegion(), if failed report error + if (!reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, + region.RightPosition)) { + std::cerr << "bamtools filter ERROR: set region failed. Check that REGION " + "describes a valid range" + << std::endl; + reader.Close(); + return false; + } + + // everything checks out, just iterate through specified region, filtering alignments + while (reader.GetNextAlignment(al)) + if (CheckAlignment(al)) writer.SaveAlignment(al); + } + + // no index data available, we have to iterate through until we + // find overlapping alignments + else { + while (reader.GetNextAlignment(al)) { + if ((al.RefID >= region.LeftRefID) && + ((al.Position + al.Length) >= region.LeftPosition) && + (al.RefID <= region.RightRefID) && (al.Position <= region.RightPosition)) { + if (CheckAlignment(al)) writer.SaveAlignment(al); + } + } + } + } + + // error parsing REGION string + else { + std::cerr << "bamtools filter ERROR: could not parse REGION: " << m_settings->Region + << std::endl; + std::cerr << "Check that REGION is in valid format (see documentation) and that the " + "coordinates are valid" + << std::endl; + reader.Close(); + return false; + } + } + + // clean up & exit + reader.Close(); + writer.Close(); + return true; +} + +bool FilterTool::FilterToolPrivate::SetupFilters() +{ + + // set up filter engine with 
supported properties + InitProperties(); + + // parse script for filter rules, if given + if (m_settings->HasScript) return ParseScript(); + + // otherwise check command line for filters + else + return ParseCommandLine(); +} + +// --------------------------------------------- +// FilterTool implementation + +FilterTool::FilterTool() + : AbstractTool() + , m_settings(new FilterSettings) + , m_impl(0) +{ + // ---------------------------------- + // set program details + + const std::string usage = + "[-in <filename> -in <filename> ... | -list <filelist>] " + "[-out <filename> | [-forceCompression]] [-region <REGION>] " + "[ [-script <filename] | [filterOptions] ]"; + + Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", usage); + + // ---------------------------------- + // I/O options + + OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); + + const std::string inDesc = "the input BAM file(s)"; + const std::string listDesc = "the input BAM file list, one line per file"; + const std::string outDesc = "the output BAM file"; + const std::string regionDesc = + "only read data from this genomic region (see documentation for more details)"; + const std::string scriptDesc = "the filter script file (see documentation for more details)"; + const std::string forceDesc = + "if results are sent to stdout (like when piping to another tool), " + "default behavior is to leave output uncompressed. 
Use this flag to " + "override and force compression"; + + Options::AddValueOption("-in", "BAM filename", inDesc, "", m_settings->HasInput, + m_settings->InputFiles, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-list", "filename", listDesc, "", m_settings->HasInputFilelist, + m_settings->InputFilelist, IO_Opts); + Options::AddValueOption("-out", "BAM filename", outDesc, "", m_settings->HasOutput, + m_settings->OutputFilename, IO_Opts, Options::StandardOut()); + Options::AddValueOption("-region", "REGION", regionDesc, "", m_settings->HasRegion, + m_settings->Region, IO_Opts); + Options::AddValueOption("-script", "filename", scriptDesc, "", m_settings->HasScript, + m_settings->ScriptFilename, IO_Opts); + Options::AddOption("-forceCompression", forceDesc, m_settings->IsForceCompression, IO_Opts); + + // ---------------------------------- + // general filter options + + OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters"); + + const std::string flagDesc = + "keep reads with this *exact* alignment flag (for more detailed queries, see below)"; + const std::string insertDesc = "keep reads with insert size that matches pattern"; + const std::string lengthDesc = "keep reads with length that matches pattern"; + const std::string mapQualDesc = "keep reads with map quality that matches pattern"; + const std::string nameDesc = "keep reads with name that matches pattern"; + const std::string queryDesc = "keep reads with motif that matches pattern"; + const std::string tagDesc = "keep reads with this key=>value pair"; + + Options::AddValueOption("-alignmentFlag", "int", flagDesc, "", + m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter, + FilterOpts); + Options::AddValueOption("-insertSize", "int", insertDesc, "", m_settings->HasInsertSizeFilter, + m_settings->InsertSizeFilter, FilterOpts); + Options::AddValueOption("-length", "int", lengthDesc, "", m_settings->HasLengthFilter, + m_settings->LengthFilter, FilterOpts); + 
Options::AddValueOption("-mapQuality", "[0-255]", mapQualDesc, "", + m_settings->HasMapQualityFilter, m_settings->MapQualityFilter, + FilterOpts); + Options::AddValueOption("-name", "string", nameDesc, "", m_settings->HasNameFilter, + m_settings->NameFilter, FilterOpts); + Options::AddValueOption("-queryBases", "string", queryDesc, "", m_settings->HasQueryBasesFilter, + m_settings->QueryBasesFilter, FilterOpts); + Options::AddValueOption("-tag", "TAG:VALUE", tagDesc, "", m_settings->HasTagFilter, + m_settings->TagFilter, FilterOpts); + + // ---------------------------------- + // alignment flag filter options + + OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters"); + + const std::string boolArg = "true/false"; + const std::string isDupDesc = "keep only alignments that are marked as duplicate?"; + const std::string isFailQcDesc = "keep only alignments that failed QC?"; + const std::string isFirstMateDesc = "keep only alignments marked as first mate?"; + const std::string isMappedDesc = "keep only alignments that were mapped?"; + const std::string isMateMappedDesc = "keep only alignments with mates that mapped"; + const std::string isMateReverseDesc = "keep only alignments with mate on reverese strand?"; + const std::string isPairedDesc = "keep only alignments that were sequenced as paired?"; + const std::string isPrimaryDesc = "keep only alignments marked as primary?"; + const std::string isProperPairDesc = "keep only alignments that passed PE resolution?"; + const std::string isReverseDesc = "keep only alignments on reverse strand?"; + const std::string isSecondMateDesc = "keep only alignments marked as second mate?"; + const std::string isSingletonDesc = "keep only singletons"; + + Options::AddValueOption("-isDuplicate", boolArg, isDupDesc, "", + m_settings->HasIsDuplicateFilter, m_settings->IsDuplicateFilter, + AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isFailedQC", boolArg, isFailQcDesc, "", + 
m_settings->HasIsFailedQCFilter, m_settings->IsFailedQCFilter, + AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isFirstMate", boolArg, isFirstMateDesc, "", + m_settings->HasIsFirstMateFilter, m_settings->IsFirstMateFilter, + AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isMapped", boolArg, isMappedDesc, "", m_settings->HasIsMappedFilter, + m_settings->IsMappedFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isMateMapped", boolArg, isMateMappedDesc, "", + m_settings->HasIsMateMappedFilter, m_settings->IsMateMappedFilter, + AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isMateReverseStrand", boolArg, isMateReverseDesc, "", + m_settings->HasIsMateReverseStrandFilter, + m_settings->IsMateReverseStrandFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isPaired", boolArg, isPairedDesc, "", m_settings->HasIsPairedFilter, + m_settings->IsPairedFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isPrimaryAlignment", boolArg, isPrimaryDesc, "", + m_settings->HasIsPrimaryAlignmentFilter, + m_settings->IsPrimaryAlignmentFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isProperPair", boolArg, isProperPairDesc, "", + m_settings->HasIsProperPairFilter, m_settings->IsProperPairFilter, + AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isReverseStrand", boolArg, isReverseDesc, "", + m_settings->HasIsReverseStrandFilter, m_settings->IsReverseStrandFilter, + AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isSecondMate", boolArg, isSecondMateDesc, "", + m_settings->HasIsSecondMateFilter, m_settings->IsSecondMateFilter, + AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isSingleton", boolArg, isSingletonDesc, "", + m_settings->HasIsSingletonFilter, m_settings->IsSingletonFilter, + AlignmentFlagOpts, TRUE_STR); +} + +FilterTool::~FilterTool() +{ + + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int FilterTool::Help() +{ + 
Options::DisplayHelp(); + return 0; +} + +int FilterTool::Run(int argc, char* argv[]) +{ + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // initialize FilterTool with settings + m_impl = new FilterToolPrivate(m_settings); + + // run FilterTool, return success/fail + if (m_impl->Run()) + return 0; + else + return 1; +} diff --git a/src/toolkit/bamtools_filter.h b/src/toolkit/bamtools_filter.h new file mode 100644 index 0000000..8f4247e --- /dev/null +++ b/src/toolkit/bamtools_filter.h @@ -0,0 +1,38 @@ +// *************************************************************************** +// bamtools_filter.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 28 August 2010 +// --------------------------------------------------------------------------- +// Filters BAM file(s) according to some user-specified criteria +// *************************************************************************** + +#ifndef BAMTOOLS_FILTER_H +#define BAMTOOLS_FILTER_H + +#include "bamtools_tool.h" + +namespace BamTools { + +class FilterTool : public AbstractTool +{ + +public: + FilterTool(); + ~FilterTool(); + +public: + int Help(); + int Run(int argc, char* argv[]); + +private: + struct FilterSettings; + FilterSettings* m_settings; + + class FilterToolPrivate; + FilterToolPrivate* m_impl; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_FILTER_H diff --git a/src/toolkit/bamtools_header.cpp b/src/toolkit/bamtools_header.cpp new file mode 100644 index 0000000..db4cbeb --- /dev/null +++ b/src/toolkit/bamtools_header.cpp @@ -0,0 +1,152 @@ +// *************************************************************************** +// bamtools_header.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 
December 2012 +// --------------------------------------------------------------------------- +// Prints the SAM-style header from a single BAM file ( or merged header from +// multiple BAM files) to stdout +// *************************************************************************** + +#include "bamtools_header.h" + +#include <api/BamMultiReader.h> +#include <utils/bamtools_options.h> +using namespace BamTools; + +#include <fstream> +#include <iostream> +#include <string> +#include <vector> + +// --------------------------------------------- +// HeaderSettings implementation + +struct HeaderTool::HeaderSettings +{ + + // flags + bool HasInput; + bool HasInputFilelist; + + // filenames + std::vector<std::string> InputFiles; + std::string InputFilelist; + + // constructor + HeaderSettings() + : HasInput(false) + , HasInputFilelist(false) + {} +}; + +struct HeaderTool::HeaderToolPrivate +{ + + // ctor & dtor +public: + HeaderToolPrivate(HeaderTool::HeaderSettings* settings) + : m_settings(settings) + {} + + ~HeaderToolPrivate() {} + + // interface +public: + bool Run(); + + // data members +private: + HeaderTool::HeaderSettings* m_settings; +}; + +bool HeaderTool::HeaderToolPrivate::Run() +{ + + // set to default input if none provided + if (!m_settings->HasInput && !m_settings->HasInputFilelist) + m_settings->InputFiles.push_back(Options::StandardIn()); + + // add files in the filelist to the input file list + if (m_settings->HasInputFilelist) { + + std::ifstream filelist(m_settings->InputFilelist.c_str(), std::ios::in); + if (!filelist.is_open()) { + std::cerr << "bamtools header ERROR: could not open input BAM file list... Aborting." + << std::endl; + return false; + } + + std::string line; + while (std::getline(filelist, line)) + m_settings->InputFiles.push_back(line); + } + + // attemp to open BAM files + BamMultiReader reader; + if (!reader.Open(m_settings->InputFiles)) { + std::cerr << "bamtools header ERROR: could not open BAM file(s) for reading... 
Aborting." + << std::endl; + return false; + } + + // dump (merged) header contents to stdout + std::cout << reader.GetHeaderText() << std::endl; + + // clean up & exit + reader.Close(); + return true; +} + +// --------------------------------------------- +// HeaderTool implementation + +HeaderTool::HeaderTool() + : AbstractTool() + , m_settings(new HeaderSettings) + , m_impl(0) +{ + // set program details + Options::SetProgramInfo("bamtools header", "prints header from BAM file(s)", + "[-in <filename> -in <filename> ... | -list <filelist>]"); + + // set up options + OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); + Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", + m_settings->HasInput, m_settings->InputFiles, IO_Opts, + Options::StandardIn()); + Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "", + m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts); +} + +HeaderTool::~HeaderTool() +{ + + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int HeaderTool::Help() +{ + Options::DisplayHelp(); + return 0; +} + +int HeaderTool::Run(int argc, char* argv[]) +{ + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // initialize HeaderTool with settings + m_impl = new HeaderToolPrivate(m_settings); + + // run HeaderTool, return success/fail + if (m_impl->Run()) + return 0; + else + return 1; +} diff --git a/src/toolkit/bamtools_header.h b/src/toolkit/bamtools_header.h new file mode 100644 index 0000000..fe59238 --- /dev/null +++ b/src/toolkit/bamtools_header.h @@ -0,0 +1,39 @@ +// *************************************************************************** +// bamtools_header.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 7 April 2011 +// 
--------------------------------------------------------------------------- +// Prints the SAM-style header from a single BAM file ( or merged header from +// multiple BAM files) to stdout +// *************************************************************************** + +#ifndef BAMTOOLS_HEADER_H +#define BAMTOOLS_HEADER_H + +#include "bamtools_tool.h" + +namespace BamTools { + +class HeaderTool : public AbstractTool +{ + +public: + HeaderTool(); + ~HeaderTool(); + +public: + int Help(); + int Run(int argc, char* argv[]); + +private: + struct HeaderSettings; + HeaderSettings* m_settings; + + struct HeaderToolPrivate; + HeaderToolPrivate* m_impl; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_HEADER_H diff --git a/src/toolkit/bamtools_index.cpp b/src/toolkit/bamtools_index.cpp new file mode 100644 index 0000000..57ca5b2 --- /dev/null +++ b/src/toolkit/bamtools_index.cpp @@ -0,0 +1,137 @@ +// *************************************************************************** +// bamtools_index.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 7 April 2011 +// --------------------------------------------------------------------------- +// Creates a BAM index file +// *************************************************************************** + +#include "bamtools_index.h" + +#include <api/BamReader.h> +#include <utils/bamtools_options.h> +using namespace BamTools; + +#include <iostream> +#include <string> + +// --------------------------------------------- +// IndexSettings implementation + +struct IndexTool::IndexSettings +{ + + // flags + bool HasInputBamFilename; + bool IsUsingBamtoolsIndex; + + // filenames + std::string InputBamFilename; + + // constructor + IndexSettings() + : HasInputBamFilename(false) + , IsUsingBamtoolsIndex(false) + , InputBamFilename(Options::StandardIn()) + {} +}; + +// 
--------------------------------------------- +// IndexToolPrivate implementation + +struct IndexTool::IndexToolPrivate +{ + + // ctor & dtor +public: + IndexToolPrivate(IndexTool::IndexSettings* settings) + : m_settings(settings) + {} + + ~IndexToolPrivate() {} + + // interface +public: + bool Run(); + + // data members +private: + IndexTool::IndexSettings* m_settings; +}; + +bool IndexTool::IndexToolPrivate::Run() +{ + + // open our BAM reader + BamReader reader; + if (!reader.Open(m_settings->InputBamFilename)) { + std::cerr << "bamtools index ERROR: could not open BAM file: " + << m_settings->InputBamFilename << std::endl; + return false; + } + + // create index for BAM file + const BamIndex::IndexType type = + (m_settings->IsUsingBamtoolsIndex ? BamIndex::BAMTOOLS : BamIndex::STANDARD); + reader.CreateIndex(type); + + // clean & exit + reader.Close(); + return true; +} + +// --------------------------------------------- +// IndexTool implementation + +IndexTool::IndexTool() + : AbstractTool() + , m_settings(new IndexSettings) + , m_impl(0) +{ + // set program details + Options::SetProgramInfo("bamtools index", "creates index for BAM file", + "[-in <filename>] [-bti]"); + + // set up options + OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); + Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", + m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, + Options::StandardIn()); + Options::AddOption("-bti", + "create (non-standard) BamTools index file (*.bti). 
Default behavior is to " + "create standard BAM index (*.bai)", + m_settings->IsUsingBamtoolsIndex, IO_Opts); +} + +IndexTool::~IndexTool() +{ + + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int IndexTool::Help() +{ + Options::DisplayHelp(); + return 0; +} + +int IndexTool::Run(int argc, char* argv[]) +{ + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // initialize IndexTool with settings + m_impl = new IndexToolPrivate(m_settings); + + // run IndexTool, return success/fail + if (m_impl->Run()) + return 0; + else + return 1; +} diff --git a/src/toolkit/bamtools_index.h b/src/toolkit/bamtools_index.h new file mode 100644 index 0000000..c378832 --- /dev/null +++ b/src/toolkit/bamtools_index.h @@ -0,0 +1,38 @@ +// *************************************************************************** +// bamtools_index.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 7 April 2011 +// --------------------------------------------------------------------------- +// Creates a BAM index file +// *************************************************************************** + +#ifndef BAMTOOLS_INDEX_H +#define BAMTOOLS_INDEX_H + +#include "bamtools_tool.h" + +namespace BamTools { + +class IndexTool : public AbstractTool +{ + +public: + IndexTool(); + ~IndexTool(); + +public: + int Help(); + int Run(int argc, char* argv[]); + +private: + struct IndexSettings; + IndexSettings* m_settings; + + struct IndexToolPrivate; + IndexToolPrivate* m_impl; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_INDEX_H diff --git a/src/toolkit/bamtools_merge.cpp b/src/toolkit/bamtools_merge.cpp new file mode 100644 index 0000000..2bac936 --- /dev/null +++ b/src/toolkit/bamtools_merge.cpp @@ -0,0 +1,257 @@ +// *************************************************************************** +// bamtools_merge.cpp 
(c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 December 2012 +// --------------------------------------------------------------------------- +// Merges multiple BAM files into one +// *************************************************************************** + +#include "bamtools_merge.h" + +#include <api/BamMultiReader.h> +#include <api/BamWriter.h> +#include <utils/bamtools_options.h> +#include <utils/bamtools_utilities.h> +using namespace BamTools; + +#include <fstream> +#include <iostream> +#include <string> +#include <vector> + +// --------------------------------------------- +// MergeSettings implementation + +struct MergeTool::MergeSettings +{ + + // flags + bool HasInput; + bool HasInputFilelist; + bool HasOutput; + bool IsForceCompression; + bool HasRegion; + + // filenames + std::vector<std::string> InputFiles; + std::string InputFilelist; + + // other parameters + std::string OutputFilename; + std::string Region; + + // constructor + MergeSettings() + : HasInput(false) + , HasInputFilelist(false) + , HasOutput(false) + , IsForceCompression(false) + , HasRegion(false) + , OutputFilename(Options::StandardOut()) + {} +}; + +// --------------------------------------------- +// MergeToolPrivate implementation + +struct MergeTool::MergeToolPrivate +{ + + // ctor & dtor +public: + MergeToolPrivate(MergeTool::MergeSettings* settings) + : m_settings(settings) + {} + + ~MergeToolPrivate() {} + + // interface +public: + bool Run(); + + // data members +private: + MergeTool::MergeSettings* m_settings; +}; + +bool MergeTool::MergeToolPrivate::Run() +{ + + // set to default input if none provided + if (!m_settings->HasInput && !m_settings->HasInputFilelist) + m_settings->InputFiles.push_back(Options::StandardIn()); + + // add files in the filelist to the input file list + if (m_settings->HasInputFilelist) { + + 
std::ifstream filelist(m_settings->InputFilelist.c_str(), std::ios::in); + if (!filelist.is_open()) { + std::cerr << "bamtools merge ERROR: could not open input BAM file list... Aborting." + << std::endl; + return false; + } + + std::string line; + while (std::getline(filelist, line)) + m_settings->InputFiles.push_back(line); + } + + // opens the BAM files (by default without checking for indexes) + BamMultiReader reader; + if (!reader.Open(m_settings->InputFiles)) { + std::cerr << "bamtools merge ERROR: could not open input BAM file(s)... Aborting." + << std::endl; + return false; + } + + // retrieve header & reference dictionary info + std::string mergedHeader = reader.GetHeaderText(); + RefVector references = reader.GetReferenceData(); + + // determine compression mode for BamWriter + bool writeUncompressed = + (m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression); + BamWriter::CompressionMode compressionMode = BamWriter::Compressed; + if (writeUncompressed) compressionMode = BamWriter::Uncompressed; + + // open BamWriter + BamWriter writer; + writer.SetCompressionMode(compressionMode); + if (!writer.Open(m_settings->OutputFilename, mergedHeader, references)) { + std::cerr << "bamtools merge ERROR: could not open " << m_settings->OutputFilename + << " for writing." 
<< std::endl; + reader.Close(); + return false; + } + + // if no region specified, store entire contents of file(s) + if (!m_settings->HasRegion) { + BamAlignment al; + while (reader.GetNextAlignmentCore(al)) + writer.SaveAlignment(al); + } + + // otherwise attempt to use region as constraint + else { + + // if region string parses OK + BamRegion region; + if (Utilities::ParseRegionString(m_settings->Region, reader, region)) { + + // attempt to find index files + reader.LocateIndexes(); + + // if index data available for all BAM files, we can use SetRegion + if (reader.HasIndexes()) { + + // attempt to use SetRegion(), if failed report error + if (!reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, + region.RightPosition)) { + std::cerr << "bamtools merge ERROR: set region failed. Check that REGION " + "describes a valid range" + << std::endl; + reader.Close(); + return false; + } + + // everything checks out, just iterate through specified region, storing alignments + BamAlignment al; + while (reader.GetNextAlignmentCore(al)) + writer.SaveAlignment(al); + } + + // no index data available, we have to iterate through until we + // find overlapping alignments + else { + BamAlignment al; + while (reader.GetNextAlignmentCore(al)) { + if ((al.RefID >= region.LeftRefID) && + ((al.Position + al.Length) >= region.LeftPosition) && + (al.RefID <= region.RightRefID) && (al.Position <= region.RightPosition)) { + writer.SaveAlignment(al); + } + } + } + } + + // error parsing REGION string + else { + std::cerr << "bamtools merge ERROR: could not parse REGION - " << m_settings->Region + << std::endl; + std::cerr << "Check that REGION is in valid format (see documentation) and that the " + "coordinates are valid" + << std::endl; + reader.Close(); + writer.Close(); + return false; + } + } + + // clean & exit + reader.Close(); + writer.Close(); + return true; +} + +// --------------------------------------------- +// MergeTool implementation + 
+MergeTool::MergeTool() + : AbstractTool() + , m_settings(new MergeSettings) + , m_impl(0) +{ + // set program details + Options::SetProgramInfo("bamtools merge", "merges multiple BAM files into one", + "[-in <filename> -in <filename> ... | -list <filelist>] [-out " + "<filename> | [-forceCompression]] [-region <REGION>]"); + + // set up options + OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); + Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", + m_settings->HasInput, m_settings->InputFiles, IO_Opts); + Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "", + m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts); + Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", + m_settings->HasOutput, m_settings->OutputFilename, IO_Opts); + Options::AddOption("-forceCompression", + "if results are sent to stdout (like when piping to another tool), default " + "behavior is to leave output uncompressed. Use this flag to override and " + "force compression", + m_settings->IsForceCompression, IO_Opts); + Options::AddValueOption("-region", "REGION", "genomic region. 
See README for more details", "", + m_settings->HasRegion, m_settings->Region, IO_Opts); +} + +MergeTool::~MergeTool() +{ + + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int MergeTool::Help() +{ + Options::DisplayHelp(); + return 0; +} + +int MergeTool::Run(int argc, char* argv[]) +{ + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // initialize MergeTool with settings + m_impl = new MergeToolPrivate(m_settings); + + // run MergeTool, return success/fail + if (m_impl->Run()) + return 0; + else + return 1; +} diff --git a/src/toolkit/bamtools_merge.h b/src/toolkit/bamtools_merge.h new file mode 100644 index 0000000..0db4bc7 --- /dev/null +++ b/src/toolkit/bamtools_merge.h @@ -0,0 +1,38 @@ +// *************************************************************************** +// bamtools_merge.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 7 April 2011 +// --------------------------------------------------------------------------- +// Merges multiple BAM files into one +// *************************************************************************** + +#ifndef BAMTOOLS_MERGE_H +#define BAMTOOLS_MERGE_H + +#include "bamtools_tool.h" + +namespace BamTools { + +class MergeTool : public AbstractTool +{ + +public: + MergeTool(); + ~MergeTool(); + +public: + int Help(); + int Run(int argc, char* argv[]); + +private: + struct MergeSettings; + MergeSettings* m_settings; + + struct MergeToolPrivate; + MergeToolPrivate* m_impl; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_MERGE_H diff --git a/src/toolkit/bamtools_random.cpp b/src/toolkit/bamtools_random.cpp new file mode 100644 index 0000000..fceebda --- /dev/null +++ b/src/toolkit/bamtools_random.cpp @@ -0,0 +1,316 @@ +// *************************************************************************** +// bamtools_random.cpp (c) 2010 
namespace BamTools {

// define constants
const unsigned int RANDOM_MAX_ALIGNMENT_COUNT = 10000;

// utility methods for RandomTool

// Returns a pseudo-random integer in the closed range [lowerBound, upperBound],
// drawn with rand(); seed the generator with srand() beforehand.
//
// Exactly one rand() value is consumed per call, whatever the bounds are, so
// a seeded run always advances the generator the same way. If upperBound is
// less than lowerBound the range is empty and lowerBound is returned.
int getRandomInt(const int& lowerBound, const int& upperBound)
{
    const double draw = (double)rand();

    // empty (inverted) range: nothing to choose from
    if (upperBound < lowerBound) return lowerBound;

    // the width is computed in floating point because
    // (upperBound - lowerBound) + 1 can overflow an int for wide ranges
    const double range = ((double)upperBound - (double)lowerBound) + 1;

    // scale the draw from [0, RAND_MAX] to an offset in [0, range - 1]
    const long long offset = (long long)(range * draw / ((double)RAND_MAX + 1));
    return (int)(lowerBound + offset);
}

}  // namespace BamTools
RandomToolPrivate implementation + +struct RandomTool::RandomToolPrivate +{ + + // ctor & dtor +public: + RandomToolPrivate(RandomTool::RandomSettings* settings) + : m_settings(settings) + {} + + ~RandomToolPrivate() {} + + // interface +public: + bool Run(); + + // data members +private: + RandomTool::RandomSettings* m_settings; +}; + +bool RandomTool::RandomToolPrivate::Run() +{ + + // set to default stdin if no input files provided + if (!m_settings->HasInput && !m_settings->HasInputFilelist) + m_settings->InputFiles.push_back(Options::StandardIn()); + + // add files in the filelist to the input file list + if (m_settings->HasInputFilelist) { + + std::ifstream filelist(m_settings->InputFilelist.c_str(), std::ios::in); + if (!filelist.is_open()) { + std::cerr << "bamtools random ERROR: could not open input BAM file list... Aborting." + << std::endl; + return false; + } + + std::string line; + while (std::getline(filelist, line)) + m_settings->InputFiles.push_back(line); + } + + // open our reader + BamMultiReader reader; + if (!reader.Open(m_settings->InputFiles)) { + std::cerr << "bamtools random ERROR: could not open input BAM file(s)... Aborting." + << std::endl; + return false; + } + + // look up index files for all BAM files + reader.LocateIndexes(); + + // make sure index data is available + if (!reader.HasIndexes()) { + std::cerr << "bamtools random ERROR: could not load index data for all input BAM " + "file(s)... Aborting." + << std::endl; + reader.Close(); + return false; + } + + // get BamReader metadata + const std::string headerText = reader.GetHeaderText(); + const RefVector references = reader.GetReferenceData(); + if (references.empty()) { + std::cerr << "bamtools random ERROR: no reference data available... Aborting." 
<< std::endl; + reader.Close(); + return false; + } + + // determine compression mode for BamWriter + bool writeUncompressed = + (m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression); + BamWriter::CompressionMode compressionMode = BamWriter::Compressed; + if (writeUncompressed) compressionMode = BamWriter::Uncompressed; + + // open BamWriter + BamWriter writer; + writer.SetCompressionMode(compressionMode); + if (!writer.Open(m_settings->OutputFilename, headerText, references)) { + std::cerr << "bamtools random ERROR: could not open " << m_settings->OutputFilename + << " for writing... Aborting." << std::endl; + reader.Close(); + return false; + } + + // if user specified a REGION constraint, attempt to parse REGION string + BamRegion region; + if (m_settings->HasRegion && + !Utilities::ParseRegionString(m_settings->Region, reader, region)) { + std::cerr << "bamtools random ERROR: could not parse REGION: " << m_settings->Region + << std::endl; + std::cerr << "Check that REGION is in valid format (see documentation) and that the " + "coordinates are valid" + << std::endl; + reader.Close(); + writer.Close(); + return false; + } + + // seed our random number generator + if (m_settings->HasRandomNumberSeed) + srand(m_settings->RandomNumberSeed); + else + srand(time(NULL)); + + // grab random alignments + BamAlignment al; + unsigned int i = 0; + while (i < m_settings->AlignmentCount) { + + int randomRefId = 0; + int randomPosition = 0; + + // use REGION constraints to select random refId & position + if (m_settings->HasRegion) { + + // select a random refId + randomRefId = getRandomInt(region.LeftRefID, region.RightRefID); + + // select a random position based on randomRefId + const int lowerBoundPosition = + ((randomRefId == region.LeftRefID) ? region.LeftPosition : 0); + const int upperBoundPosition = + ((randomRefId == region.RightRefID) ? 
region.RightPosition + : (references.at(randomRefId).RefLength - 1)); + randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition); + } + + // otherwise select from all possible random refId & position + else { + + // select random refId + randomRefId = getRandomInt(0, (int)references.size() - 1); + + // select random position based on randomRefId + const int lowerBoundPosition = 0; + const int upperBoundPosition = references.at(randomRefId).RefLength - 1; + randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition); + } + + // if jump & read successful, save first alignment that overlaps random refId & position + if (reader.Jump(randomRefId, randomPosition)) { + while (reader.GetNextAlignmentCore(al)) { + if (al.RefID == randomRefId && al.Position >= randomPosition) { + writer.SaveAlignment(al); + ++i; + break; + } + } + } + } + + // cleanup & exit + reader.Close(); + writer.Close(); + return true; +} + +// --------------------------------------------- +// RandomTool implementation + +RandomTool::RandomTool() + : AbstractTool() + , m_settings(new RandomSettings) + , m_impl(0) +{ + // set program details + Options::SetProgramInfo("bamtools random", "grab a random subset of alignments", + "[-in <filename> -in <filename> ... 
| -list <filelist>] [-out " + "<filename>] [-forceCompression] [-n] [-region <REGION>]"); + + // set up options + OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); + Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, + m_settings->InputFiles, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "", + m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts); + Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", + m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, + Options::StandardOut()); + Options::AddValueOption("-region", "REGION", + "only pull random alignments from within this genomic region. Index " + "file is recommended for better performance, and is used automatically " + "if it exists. See \'bamtools help index\' for more details on " + "creating one", + "", m_settings->HasRegion, m_settings->Region, IO_Opts); + Options::AddOption("-forceCompression", + "if results are sent to stdout (like when piping to another tool), default " + "behavior is to leave output uncompressed. Use this flag to override and " + "force compression", + m_settings->IsForceCompression, IO_Opts); + + OptionGroup* SettingsOpts = Options::CreateOptionGroup("Settings"); + Options::AddValueOption( + "-n", "count", "number of alignments to grab. Note - no duplicate checking is performed", + "", m_settings->HasAlignmentCount, m_settings->AlignmentCount, SettingsOpts, + RANDOM_MAX_ALIGNMENT_COUNT); + Options::AddValueOption("-seed", "unsigned integer", + "random number generator seed (for repeatable results). 
Current time " + "is used if no seed value is provided.", + "", m_settings->HasRandomNumberSeed, m_settings->RandomNumberSeed, + SettingsOpts); +} + +RandomTool::~RandomTool() +{ + + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int RandomTool::Help() +{ + Options::DisplayHelp(); + return 0; +} + +int RandomTool::Run(int argc, char* argv[]) +{ + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // initialize RandomTool with settings + m_impl = new RandomToolPrivate(m_settings); + + // run RandomTool, return success/fail + if (m_impl->Run()) + return 0; + else + return 1; +} diff --git a/src/toolkit/bamtools_random.h b/src/toolkit/bamtools_random.h new file mode 100644 index 0000000..664a919 --- /dev/null +++ b/src/toolkit/bamtools_random.h @@ -0,0 +1,38 @@ +// *************************************************************************** +// bamtools_random.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 7 April 2010 (DB) +// --------------------------------------------------------------------------- +// Grab a random subset of alignments (testing tool) +// *************************************************************************** + +#ifndef BAMTOOLS_RANDOM_H +#define BAMTOOLS_RANDOM_H + +#include "bamtools_tool.h" + +namespace BamTools { + +class RandomTool : public AbstractTool +{ + +public: + RandomTool(); + ~RandomTool(); + +public: + int Help(); + int Run(int argc, char* argv[]); + +private: + struct RandomSettings; + RandomSettings* m_settings; + + struct RandomToolPrivate; + RandomToolPrivate* m_impl; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_RANDOM _H diff --git a/src/toolkit/bamtools_resolve.cpp b/src/toolkit/bamtools_resolve.cpp new file mode 100644 index 0000000..9f8c3e3 --- /dev/null +++ b/src/toolkit/bamtools_resolve.cpp @@ -0,0 +1,1523 @@ +// 
*************************************************************************** +// bamtools_resolve.cpp (c) 2011 +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 24 July 2013 (DB) +// --------------------------------------------------------------------------- +// Resolves paired-end reads (marking the IsProperPair flag as needed). +// *************************************************************************** + +#include "bamtools_resolve.h" +#include <api/BamReader.h> +#include <api/BamWriter.h> +#include <utils/bamtools_options.h> +#include <utils/bamtools_utilities.h> +#include "bamtools_version.h" +using namespace BamTools; + +#include <algorithm> +#include <cassert> +#include <cctype> +#include <cstddef> +#include <cstdio> +#include <cstdlib> +#include <fstream> +#include <functional> +#include <iostream> +#include <map> +#include <sstream> +#include <string> +#include <utility> +#include <vector> + +// -------------------------------------------------------------------------- +// general ResolveTool constants +// -------------------------------------------------------------------------- + +static const int NUM_MODELS = 8; +static const std::string READ_GROUP_TAG = "RG"; +static const double DEFAULT_CONFIDENCE_INTERVAL = 0.9973; +static const uint16_t DEFAULT_MIN_MAPQUALITY = 1; +static const double DEFAULT_UNUSEDMODEL_THRESHOLD = 0.1; + +// -------------------------------------------------------------------------- +// stats file constants +// -------------------------------------------------------------------------- + +// basic char/string constants +static const char COMMENT_CHAR = '#'; +static const char EQUAL_CHAR = '='; +static const char TAB_CHAR = '\t'; + +static const std::string WHITESPACE_CHARS = " \t\n"; +static const std::string TRUE_KEYWORD = "true"; +static const std::string FALSE_KEYWORD = "false"; + +// field counts +static const std::size_t 
// --------------------------------------------------------------------------
// ModelType implementation

// One alignment model: a numeric ID plus the fragment lengths recorded for it.
// The struct forwards a small vector-like interface to FragmentLengths.
struct ModelType
{

    // shorthand for the fragment-length container and its iterators
    typedef std::vector<int32_t> LengthList;
    typedef LengthList::iterator LengthIter;
    typedef LengthList::const_iterator LengthConstIter;

    // constants
    static const uint16_t DUMMY_ID;

    // data members
    uint16_t ID;
    LengthList FragmentLengths;

    // ctor - stores the ID and reserves room for 10K fragment lengths up front
    ModelType(const uint16_t id)
        : ID(id)
        , FragmentLengths()
    {
        FragmentLengths.reserve(10000);
    }

    // iteration over the stored fragment lengths
    LengthIter begin()
    {
        return FragmentLengths.begin();
    }
    LengthConstIter begin() const
    {
        return FragmentLengths.begin();
    }
    LengthIter end()
    {
        return FragmentLengths.end();
    }
    LengthConstIter end() const
    {
        return FragmentLengths.end();
    }

    // number of fragment lengths stored
    std::size_t size() const
    {
        return FragmentLengths.size();
    }

    // append one fragment length
    void push_back(const int32_t& x)
    {
        FragmentLengths.push_back(x);
    }

    // discard all stored fragment lengths
    void clear()
    {
        FragmentLengths.clear();
    }
};

const uint16_t ModelType::DUMMY_ID = 100;

// Orders models by the number of fragment lengths they hold:
// lhs > rhs when lhs holds more of them.
bool operator>(const ModelType& lhs, const ModelType& rhs)
{
    return rhs.size() < lhs.size();
}
al.IsMateReverseStrand() : al.IsReverseStrand()); + + // determine 'model type' + if (m1_begin < m2_begin) { + if (!m1_isReverseStrand && !m2_isReverseStrand) return 0; // ID: 1 + if (!m1_isReverseStrand && m2_isReverseStrand) return 1; // ID: 2 + if (m1_isReverseStrand && !m2_isReverseStrand) return 2; // ID: 3 + if (m1_isReverseStrand && m2_isReverseStrand) return 3; // ID: 4 + } else { + if (!m2_isReverseStrand && !m1_isReverseStrand) return 4; // ID: 5 + if (!m2_isReverseStrand && m1_isReverseStrand) return 5; // ID: 6 + if (m2_isReverseStrand && !m1_isReverseStrand) return 6; // ID: 7 + if (m2_isReverseStrand && m1_isReverseStrand) return 7; // ID: 8 + } + + // unknown model + return ModelType::DUMMY_ID; +} + +// -------------------------------------------------------------------------- +// ReadGroupResolver implementation + +struct ReadGroupResolver +{ + + // data members + int32_t MinFragmentLength; + int32_t MedianFragmentLength; + int32_t MaxFragmentLength; + uint16_t TopModelId; + uint16_t NextTopModelId; + bool IsAmbiguous; + bool HasData; + std::vector<ModelType> Models; + std::map<std::string, bool> ReadNames; + + // ctor + ReadGroupResolver(); + + // resolving methods + bool IsValidInsertSize(const BamAlignment& al) const; + bool IsValidOrientation(const BamAlignment& al) const; + + // select 2 best models based on observed data + void DetermineTopModels(const std::string& readGroupName); + + // static settings + static double ConfidenceInterval; + static double UnusedModelThreshold; + static void SetConfidenceInterval(const double& ci); + static void SetUnusedModelThreshold(const double& umt); +}; + +double ReadGroupResolver::ConfidenceInterval = DEFAULT_CONFIDENCE_INTERVAL; +double ReadGroupResolver::UnusedModelThreshold = DEFAULT_UNUSEDMODEL_THRESHOLD; + +ReadGroupResolver::ReadGroupResolver() + : MinFragmentLength(0) + , MedianFragmentLength(0) + , MaxFragmentLength(0) + , TopModelId(ModelType::DUMMY_ID) + , NextTopModelId(ModelType::DUMMY_ID) + , 
IsAmbiguous(false) + , HasData(false) +{ + // pre-allocate space for 8 models + Models.reserve(NUM_MODELS); + for (uint16_t i = 0; i < NUM_MODELS; ++i) + Models.push_back(ModelType(i + 1)); +} + +bool ReadGroupResolver::IsValidInsertSize(const BamAlignment& al) const +{ + const int32_t absInsertSize = abs(al.InsertSize); + return (absInsertSize >= MinFragmentLength && absInsertSize <= MaxFragmentLength); +} + +bool ReadGroupResolver::IsValidOrientation(const BamAlignment& al) const +{ + const uint16_t currentModelId = + CalculateModelType(al) + 1; // convert model type (array index) to ID number + return (currentModelId == TopModelId || currentModelId == NextTopModelId); +} + +void ReadGroupResolver::DetermineTopModels(const std::string& readGroupName) +{ + + // sort models (from most common to least common) + std::sort(Models.begin(), Models.end(), std::greater<ModelType>()); + + // store top 2 models for later + TopModelId = Models[0].ID; + NextTopModelId = Models[1].ID; + + // make sure that the 2 most common models are some threshold more common + // than the remaining models + const unsigned int activeModelCountSum = Models[0].size() + Models[1].size(); + if (activeModelCountSum == 0) return; // skip if no data in this read group + const unsigned int unusedModelCountSum = Models[2].size() + Models[3].size() + + Models[4].size() + Models[5].size() + + Models[6].size() + Models[7].size(); + const double unusedPercentage = (double)unusedModelCountSum / (double)activeModelCountSum; + if (unusedPercentage > UnusedModelThreshold) { + std::cerr << "WARNING: " << readGroupName << " does not have clearly defined 'top models'" + << std::endl + << " The fraction of alignments in bottom 6 models (" << unusedPercentage + << ") exceeds threshold: " << UnusedModelThreshold << std::endl; + IsAmbiguous = true; + } + + // emit a warning if the best alignment models are non-standard + const bool isModel1Top = (TopModelId == 1) || (NextTopModelId == 1); + const bool isModel2Top = 
(TopModelId == 2) || (NextTopModelId == 2); + const bool isModel4Top = (TopModelId == 4) || (NextTopModelId == 4); + const bool isModel5Top = (TopModelId == 5) || (NextTopModelId == 5); + const bool isModel6Top = (TopModelId == 6) || (NextTopModelId == 6); + const bool isModel8Top = (TopModelId == 8) || (NextTopModelId == 8); + + bool isMatePair = (isModel4Top && isModel5Top ? true : false); + bool isPairedEnd = (isModel2Top && isModel6Top ? true : false); + bool isSolidPair = (isModel1Top && isModel8Top ? true : false); + + if (!isMatePair && !isPairedEnd && !isSolidPair) { + std::cerr << "WARNING: Found a non-standard alignment model configuration. " << std::endl + << " Using alignment models " << TopModelId << " & " << NextTopModelId + << std::endl; + } + + // store only the fragments from the best alignment models, then sort + std::vector<int32_t> fragments; + fragments.reserve(Models[0].size() + Models[1].size()); + fragments.insert(fragments.end(), Models[0].begin(), Models[0].end()); + fragments.insert(fragments.end(), Models[1].begin(), Models[1].end()); + sort(fragments.begin(), fragments.end()); + + // clear out Model fragment data, not needed anymore + Models.clear(); + + // skip if no fragments found for this read group + if (fragments.empty()) { + HasData = false; + return; + } else + HasData = true; + + // calculate & store the min,median, & max fragment lengths + const unsigned int numFragmentLengths = fragments.size(); + const double halfNonConfidenceInterval = (1.0 - ReadGroupResolver::ConfidenceInterval) / 2.0; + const unsigned int minIndex = (unsigned int)(numFragmentLengths * halfNonConfidenceInterval); + const unsigned int medianIndex = (unsigned int)(numFragmentLengths * 0.5); + const unsigned int maxIndex = + (unsigned int)(numFragmentLengths * (1.0 - halfNonConfidenceInterval)); + + MinFragmentLength = fragments[minIndex]; + MedianFragmentLength = fragments[medianIndex]; + MaxFragmentLength = fragments[maxIndex]; +} + +void 
ReadGroupResolver::SetConfidenceInterval(const double& ci) +{ + ConfidenceInterval = ci; +} + +void ReadGroupResolver::SetUnusedModelThreshold(const double& umt) +{ + UnusedModelThreshold = umt; +} + +// -------------------------------------------------------------------------- +// ResolveSettings implementation + +struct ResolveTool::ResolveSettings +{ + + // modes + bool IsMakeStats; + bool IsMarkPairs; + bool IsTwoPass; + + // I/O flags + bool HasInputBamFile; + bool HasOutputBamFile; + bool HasStatsFile; + bool IsForceCompression; + + // resolve option flags + bool HasConfidenceInterval; + bool HasForceMarkReadGroups; + bool HasMinimumMapQuality; + bool HasUnusedModelThreshold; + + // I/O filenames + std::string InputBamFilename; + std::string OutputBamFilename; + std::string StatsFilename; + std::string ReadNamesFilename; // ** N.B. - Only used internally, not set from cmdline ** + + // resolve options + double ConfidenceInterval; + uint16_t MinimumMapQuality; + double UnusedModelThreshold; + + // constructor + ResolveSettings() + : IsMakeStats(false) + , IsMarkPairs(false) + , IsTwoPass(false) + , HasInputBamFile(false) + , HasOutputBamFile(false) + , HasStatsFile(false) + , IsForceCompression(false) + , HasConfidenceInterval(false) + , HasForceMarkReadGroups(false) + , HasMinimumMapQuality(false) + , HasUnusedModelThreshold(false) + , InputBamFilename(Options::StandardIn()) + , OutputBamFilename(Options::StandardOut()) + , ReadNamesFilename(DEFAULT_READNAME_FILE) + , ConfidenceInterval(DEFAULT_CONFIDENCE_INTERVAL) + , MinimumMapQuality(DEFAULT_MIN_MAPQUALITY) + , UnusedModelThreshold(DEFAULT_UNUSEDMODEL_THRESHOLD) + {} +}; + +// -------------------------------------------------------------------------- +// ReadNamesFileReader implementation + +struct ResolveTool::ReadNamesFileReader +{ + + // ctor & dtor + ReadNamesFileReader() {} + ~ReadNamesFileReader() + { + Close(); + } + + // main reader interface +public: + void Close(); + bool Open(const std::string& 
filename); + bool Read(std::map<std::string, ReadGroupResolver>& readGroups); + + // data members +private: + std::ifstream m_stream; +}; + +void ResolveTool::ReadNamesFileReader::Close() +{ + if (m_stream.is_open()) m_stream.close(); +} + +bool ResolveTool::ReadNamesFileReader::Open(const std::string& filename) +{ + + // make sure stream is fresh + Close(); + + // attempt to open filename, return status + m_stream.open(filename.c_str(), std::ifstream::in); + return m_stream.good(); +} + +bool ResolveTool::ReadNamesFileReader::Read(std::map<std::string, ReadGroupResolver>& readGroups) +{ + + // up-front sanity check + if (!m_stream.is_open()) return false; + + // parse read names file + std::string line; + std::vector<std::string> fields; + std::map<std::string, ReadGroupResolver>::iterator rgIter; + std::map<std::string, ReadGroupResolver>::iterator rgEnd = readGroups.end(); + while (std::getline(m_stream, line)) { + + // skip if empty line + if (line.empty()) continue; + + // split line on '\t' + fields = Utilities::Split(line, TAB_CHAR); + if (fields.size() != 2) continue; + + // look up resolver for read group + rgIter = readGroups.find(fields[0]); + if (rgIter == rgEnd) return false; + ReadGroupResolver& resolver = (*rgIter).second; + + // store read name with resolver + resolver.ReadNames.insert(std::make_pair(fields[1], true)); + } + + // if here, return success + return true; +} + +// -------------------------------------------------------------------------- +// ReadNamesFileWriter implementation + +struct ResolveTool::ReadNamesFileWriter +{ + + // ctor & dtor + ReadNamesFileWriter() {} + ~ReadNamesFileWriter() + { + Close(); + } + + // main reader interface +public: + void Close(); + bool Open(const std::string& filename); + void Write(const std::string& readGroupName, const std::string& readName); + + // data members +private: + std::ofstream m_stream; +}; + +void ResolveTool::ReadNamesFileWriter::Close() +{ + if (m_stream.is_open()) m_stream.close(); +} 
+ +bool ResolveTool::ReadNamesFileWriter::Open(const std::string& filename) +{ + + // make sure stream is fresh + Close(); + + // attempt to open filename, return status + m_stream.open(filename.c_str(), std::ofstream::out); + return m_stream.good(); +} + +void ResolveTool::ReadNamesFileWriter::Write(const std::string& readGroupName, + const std::string& readName) +{ + m_stream << readGroupName << TAB_CHAR << readName << std::endl; +} + +// -------------------------------------------------------------------------- +// StatsFileReader implementation + +struct ResolveTool::StatsFileReader +{ + + // ctor & dtor +public: + StatsFileReader() {} + ~StatsFileReader() + { + Close(); + } + + // main reader interface +public: + void Close(); + bool Open(const std::string& filename); + bool Read(ResolveTool::ResolveSettings* settings, + std::map<std::string, ReadGroupResolver>& readGroups); + + // internal methods +private: + bool IsComment(const std::string& line) const; + bool IsWhitespace(const std::string& line) const; + bool ParseInputLine(const std::string& line); + bool ParseOptionLine(const std::string& line, ResolveTool::ResolveSettings* settings); + bool ParseReadGroupLine(const std::string& line, + std::map<std::string, ReadGroupResolver>& readGroups); + std::string SkipCommentsAndWhitespace(); + + // data members +private: + std::ifstream m_stream; + + enum State + { + None = 0, + InInput, + InOptions, + InReadGroups + }; +}; + +void ResolveTool::StatsFileReader::Close() +{ + if (m_stream.is_open()) m_stream.close(); +} + +bool ResolveTool::StatsFileReader::IsComment(const std::string& line) const +{ + assert(!line.empty()); + return (line.at(0) == COMMENT_CHAR); +} + +bool ResolveTool::StatsFileReader::IsWhitespace(const std::string& line) const +{ + if (line.empty()) return true; + return (isspace(line.at(0))); +} + +bool ResolveTool::StatsFileReader::Open(const std::string& filename) +{ + + // make sure stream is fresh + Close(); + + // attempt to open 
filename, return status + m_stream.open(filename.c_str(), std::ifstream::in); + return m_stream.good(); +} + +bool ResolveTool::StatsFileReader::ParseInputLine(const std::string& /*line*/) +{ + // input lines are ignored (for now at least), tool will use input from command line + return true; +} + +bool ResolveTool::StatsFileReader::ParseOptionLine(const std::string& line, + ResolveTool::ResolveSettings* settings) +{ + // split line into option, value + std::vector<std::string> fields = Utilities::Split(line, EQUAL_CHAR); + if (fields.size() != NUM_OPTIONS_FIELDS) return false; + const std::string& option = fields.at(0); + std::stringstream value(fields.at(1)); + + // ----------------------------------- + // handle option based on keyword + + // ConfidenceInterval + if (option == OPTION_CONFIDENCEINTERVAL) { + value >> settings->ConfidenceInterval; + settings->HasConfidenceInterval = true; + return true; + } + + // ForceMarkReadGroups + if (option == OPTION_FORCEMARKREADGROUPS) { + value >> settings->HasForceMarkReadGroups; + return true; + } + + // MinimumMapQuality + if (option == OPTION_MINIMUMMAPQUALITY) { + value >> settings->MinimumMapQuality; + settings->HasMinimumMapQuality = true; + return true; + } + + // UnusedModelThreshold + if (option == OPTION_UNUSEDMODELTHRESHOLD) { + value >> settings->UnusedModelThreshold; + settings->HasUnusedModelThreshold = true; + return true; + } + + // otherwise unknown option + std::cerr << "bamtools resolve ERROR - unrecognized option: " << option << " in stats file" + << std::endl; + return false; +} + +bool ResolveTool::StatsFileReader::ParseReadGroupLine( + const std::string& line, std::map<std::string, ReadGroupResolver>& readGroups) +{ + // split read group data in to fields + std::vector<std::string> fields = Utilities::Split(line, WHITESPACE_CHARS); + if (fields.size() != NUM_READGROUPS_FIELDS) return false; + + // retrieve RG name + const std::string& name = fields.at(0); + + // populate RG's 'resolver' data + 
ReadGroupResolver resolver; + + std::stringstream dataStream; + dataStream.str(fields.at(1)); + dataStream >> resolver.MedianFragmentLength; + dataStream.clear(); + + dataStream.str(fields.at(2)); + dataStream >> resolver.MinFragmentLength; + dataStream.clear(); + + dataStream.str(fields.at(3)); + dataStream >> resolver.MaxFragmentLength; + dataStream.clear(); + + dataStream.str(fields.at(4)); + dataStream >> resolver.TopModelId; + dataStream.clear(); + + dataStream.str(fields.at(5)); + dataStream >> resolver.NextTopModelId; + dataStream.clear(); + + resolver.IsAmbiguous = (fields.at(6) == TRUE_KEYWORD); + + // store RG entry and return success + readGroups.insert(std::make_pair(name, resolver)); + return true; +} + +bool ResolveTool::StatsFileReader::Read(ResolveTool::ResolveSettings* settings, + std::map<std::string, ReadGroupResolver>& readGroups) +{ + // up-front sanity checks + if (!m_stream.is_open() || settings == 0) return false; + + // clear out read group data + readGroups.clear(); + + // initialize state + State currentState = StatsFileReader::None; + + // read stats file + std::string line = SkipCommentsAndWhitespace(); + while (!line.empty()) { + + bool foundError = false; + + // switch state on keyword found + if (Utilities::StartsWith(line, INPUT_TOKEN)) + currentState = StatsFileReader::InInput; + else if (Utilities::StartsWith(line, OPTIONS_TOKEN)) + currentState = StatsFileReader::InOptions; + else if (Utilities::StartsWith(line, READGROUPS_TOKEN)) + currentState = StatsFileReader::InReadGroups; + + // otherwise parse data line, depending on state + else { + if (currentState == StatsFileReader::InInput) + foundError = !ParseInputLine(line); + else if (currentState == StatsFileReader::InOptions) + foundError = !ParseOptionLine(line, settings); + else if (currentState == StatsFileReader::InReadGroups) + foundError = !ParseReadGroupLine(line, readGroups); + else + foundError = true; + } + + // break out if error found + if (foundError) return false; 
+ + // get next line + line = SkipCommentsAndWhitespace(); + } + + // if here, return success + return true; +} + +std::string ResolveTool::StatsFileReader::SkipCommentsAndWhitespace() +{ + std::string line; + do { + if (m_stream.eof()) return std::string(); + std::getline(m_stream, line); + } while (IsWhitespace(line) || IsComment(line)); + return line; +} + +// -------------------------------------------------------------------------- +// StatsFileReader implementation + +struct ResolveTool::StatsFileWriter +{ + + // ctor & dtor +public: + StatsFileWriter() {} + ~StatsFileWriter() + { + Close(); + } + + // main reader interface +public: + void Close(); + bool Open(const std::string& filename); + bool Write(ResolveTool::ResolveSettings* settings, + const std::map<std::string, ReadGroupResolver>& readGroups); + + // internal methods +private: + void WriteHeader(); + void WriteInput(ResolveTool::ResolveSettings* settings); + void WriteOptions(ResolveTool::ResolveSettings* settings); + void WriteReadGroups(const std::map<std::string, ReadGroupResolver>& readGroups); + + // data members +private: + std::ofstream m_stream; +}; + +void ResolveTool::StatsFileWriter::Close() +{ + if (m_stream.is_open()) m_stream.close(); +} + +bool ResolveTool::StatsFileWriter::Open(const std::string& filename) +{ + + // make sure stream is fresh + Close(); + + // attempt to open filename, return status + m_stream.open(filename.c_str(), std::ofstream::out); + return m_stream.good(); +} + +bool ResolveTool::StatsFileWriter::Write(ResolveTool::ResolveSettings* settings, + const std::map<std::string, ReadGroupResolver>& readGroups) +{ + // return failure if file not open + if (!m_stream.is_open()) return false; + + // write stats file elements + WriteHeader(); + WriteInput(settings); + WriteOptions(settings); + WriteReadGroups(readGroups); + + // return success + return true; +} + +void ResolveTool::StatsFileWriter::WriteHeader() +{ + + // stringify current bamtools version + 
std::stringstream versionStream; + versionStream << 'v' << BAMTOOLS_VERSION_MAJOR << '.' << BAMTOOLS_VERSION_MINOR << '.' + << BAMTOOLS_VERSION_PATCH; + + // # bamtools resolve (vX.Y.Z) + // # + // # MODEL DESCRIPTION - see above for actual text + // \n + + m_stream << COMMENT_CHAR << " bamtools resolve (" << versionStream.str() << ')' << std::endl + << COMMENT_CHAR << std::endl + << MODEL_DESCRIPTION << std::endl; +} + +void ResolveTool::StatsFileWriter::WriteInput(ResolveTool::ResolveSettings* settings) +{ + + // [Input] + // filename + // \n + + m_stream << INPUT_TOKEN << std::endl << settings->InputBamFilename << std::endl << std::endl; +} + +void ResolveTool::StatsFileWriter::WriteOptions(ResolveTool::ResolveSettings* settings) +{ + + // [Options] + // ConfidenceInterval=<double> + // ForceMarkReadGroups=<true|false> + // MinimumMapQuality=<uint16_t> + // UnusedModelThreshold=<double> + // \n + + m_stream << OPTIONS_TOKEN << std::endl + << OPTION_CONFIDENCEINTERVAL << EQUAL_CHAR << settings->ConfidenceInterval << std::endl + << OPTION_FORCEMARKREADGROUPS << EQUAL_CHAR << std::boolalpha + << settings->HasForceMarkReadGroups << std::endl + << OPTION_MINIMUMMAPQUALITY << EQUAL_CHAR << settings->MinimumMapQuality << std::endl + << OPTION_UNUSEDMODELTHRESHOLD << EQUAL_CHAR << settings->UnusedModelThreshold + << std::endl + << std::endl; +} + +void ResolveTool::StatsFileWriter::WriteReadGroups( + const std::map<std::string, ReadGroupResolver>& readGroups) +{ + + // [ReadGroups] + // #<name> <medianFL> <minFL> <maxFL> <topModelID> <nextTopModelID> <isAmbiguous?> + m_stream << READGROUPS_TOKEN << std::endl << RG_FIELD_DESCRIPTION << std::endl; + + // iterate over read groups + std::map<std::string, ReadGroupResolver>::const_iterator rgIter = readGroups.begin(); + std::map<std::string, ReadGroupResolver>::const_iterator rgEnd = readGroups.end(); + for (; rgIter != rgEnd; ++rgIter) { + const std::string& name = (*rgIter).first; + const ReadGroupResolver& resolver = 
(*rgIter).second; + + // skip if read group has no data + if (!resolver.HasData) continue; + + // write read group data + m_stream << name << TAB_CHAR << resolver.MedianFragmentLength << TAB_CHAR + << resolver.MinFragmentLength << TAB_CHAR << resolver.MaxFragmentLength << TAB_CHAR + << resolver.TopModelId << TAB_CHAR << resolver.NextTopModelId << TAB_CHAR + << std::boolalpha << resolver.IsAmbiguous << std::endl; + } + + // extra newline at end + m_stream << std::endl; +} + +// -------------------------------------------------------------------------- +// ResolveToolPrivate implementation + +struct ResolveTool::ResolveToolPrivate +{ + + // ctor & dtor +public: + ResolveToolPrivate(ResolveTool::ResolveSettings* settings) + : m_settings(settings) + {} + ~ResolveToolPrivate() {} + + // 'public' interface +public: + bool Run(); + + // internal methods +private: + bool CheckSettings(std::vector<std::string>& errors); + bool MakeStats(); + void ParseHeader(const SamHeader& header); + bool ReadStatsFile(); + void ResolveAlignment(BamAlignment& al); + bool ResolvePairs(); + bool WriteStatsFile(); + + // data members +private: + ResolveTool::ResolveSettings* m_settings; + std::map<std::string, ReadGroupResolver> m_readGroups; +}; + +bool ResolveTool::ResolveToolPrivate::CheckSettings(std::vector<std::string>& errors) +{ + + // ensure clean slate + errors.clear(); + + // if MakeStats mode + if (m_settings->IsMakeStats) { + + // ensure mutex mode + if (m_settings->IsMarkPairs) + errors.push_back( + "Cannot run in both -makeStats & -markPairs modes. Please select ONE."); + if (m_settings->IsTwoPass) + errors.push_back("Cannot run in both -makeStats & -twoPass modes. Please select ONE."); + + // error if output BAM options supplied + if (m_settings->HasOutputBamFile) + errors.push_back("Cannot use -out (output BAM file) in -makeStats mode."); + if (m_settings->IsForceCompression) + errors.push_back( + "Cannot use -forceCompression. 
No output BAM file is being generated."); + + // make sure required stats file supplied + if (!m_settings->HasStatsFile) + errors.push_back( + "Ouptut stats filename required for -makeStats mode. Please specify one using " + "-stats option."); + + // check for UseStats options + if (m_settings->HasForceMarkReadGroups) + errors.push_back( + "Cannot use -forceMarkReadGroups. -markPairs options are DISABLED in -makeStats " + "mode."); + } + + // if MarkPairs mode + else if (m_settings->IsMarkPairs) { + + // ensure mutex mode + if (m_settings->IsMakeStats) + errors.push_back( + "Cannot run in both -makeStats & -markPairs modes. Please select ONE."); + if (m_settings->IsTwoPass) + errors.push_back("Cannot run in both -markPairs & -twoPass modes. Please select ONE."); + + // make sure required stats file supplied + if (!m_settings->HasStatsFile) + errors.push_back( + "Input stats filename required for -markPairs mode. Please specify one using " + "-stats option."); + + // check for MakeStats options + if (m_settings->HasConfidenceInterval) + errors.push_back("Cannot use -ci. -makeStats options are DISABLED is -markPairs mode."); + } + + // if TwoPass mode + else if (m_settings->IsTwoPass) { + + // ensure mutex mode + if (m_settings->IsMakeStats) + errors.push_back("Cannot run in both -makeStats & -twoPass modes. Please select ONE."); + if (m_settings->IsMarkPairs) + errors.push_back("Cannot run in both -markPairs & -twoPass modes. Please select ONE."); + + // make sure input is file not stdin + if (!m_settings->HasInputBamFile || m_settings->InputBamFilename == Options::StandardIn()) + errors.push_back( + "Cannot run -twoPass mode with BAM data from stdin. Please specify existing file " + "using -in option."); + } + + // no mode selected + else + errors.push_back( + "No resolve mode specified. Please select ONE of the following: -makeStats, " + "-markPairs, or -twoPass. 
See help for more info."); + + // boundary checks on values + if (m_settings->HasConfidenceInterval) { + if (m_settings->ConfidenceInterval < 0.0 || m_settings->ConfidenceInterval > 1.0) + errors.push_back("Invalid confidence interval. Must be between 0 and 1"); + } + if (m_settings->HasMinimumMapQuality) { + if (m_settings->MinimumMapQuality >= 256) + errors.push_back("Invalid minimum map quality. Must be between 0 and 255"); + } + if (m_settings->HasUnusedModelThreshold) { + if (m_settings->UnusedModelThreshold < 0.0 || m_settings->UnusedModelThreshold > 1.0) + errors.push_back("Invalid unused model threshold. Must be between 0 and 1"); + } + + // return success if no errors found + return (errors.empty()); +} + +bool ResolveTool::ResolveToolPrivate::MakeStats() +{ + + // pull resolver settings from command-line settings + ReadGroupResolver::SetConfidenceInterval(m_settings->ConfidenceInterval); + ReadGroupResolver::SetUnusedModelThreshold(m_settings->UnusedModelThreshold); + + // open our BAM reader + BamReader bamReader; + if (!bamReader.Open(m_settings->InputBamFilename)) { + std::cerr << "bamtools resolve ERROR: could not open input BAM file: " + << m_settings->InputBamFilename << std::endl; + return false; + } + + // retrieve header & parse for read groups + const SamHeader& header = bamReader.GetHeader(); + ParseHeader(header); + + // open ReadNamesFileWriter + ResolveTool::ReadNamesFileWriter readNamesWriter; + if (!readNamesWriter.Open(m_settings->ReadNamesFilename)) { + std::cerr << "bamtools resolve ERROR: could not open (temp) output read names file: " + << m_settings->ReadNamesFilename << std::endl; + bamReader.Close(); + return false; + } + + // read through BAM file + BamAlignment al; + std::string readGroup; + std::map<std::string, ReadGroupResolver>::iterator rgIter; + std::map<std::string, bool>::iterator readNameIter; + while (bamReader.GetNextAlignmentCore(al)) { + + // skip if alignment is not paired, mapped, nor mate is mapped + if 
(!al.IsPaired() || !al.IsMapped() || !al.IsMateMapped()) continue; + + // skip if alignment & mate not on same reference sequence + if (al.RefID != al.MateRefID) continue; + + // flesh out the char data, so we can retrieve its read group ID + al.BuildCharData(); + + // get read group from alignment (OK if empty) + readGroup.clear(); + al.GetTag(READ_GROUP_TAG, readGroup); + + // look up resolver for read group + rgIter = m_readGroups.find(readGroup); + if (rgIter == m_readGroups.end()) { + std::cerr << "bamtools resolve ERROR - unable to calculate stats, unknown read group " + "encountered: " + << readGroup << std::endl; + bamReader.Close(); + return false; + } + ReadGroupResolver& resolver = (*rgIter).second; + + // determine unique-ness of current alignment + const bool isCurrentMateUnique = (al.MapQuality >= m_settings->MinimumMapQuality); + + // look up read name + readNameIter = resolver.ReadNames.find(al.Name); + + // if read name found (current alignment's mate already parsed) + if (readNameIter != resolver.ReadNames.end()) { + + // if both unique mates are unique, store read name & insert size for later + const bool isStoredMateUnique = (*readNameIter).second; + if (isCurrentMateUnique && isStoredMateUnique) { + + // save read name in temp file as candidates for later pair marking + readNamesWriter.Write(readGroup, al.Name); + + // determine model type & store fragment length for stats calculation + const uint16_t currentModelType = CalculateModelType(al); + assert(currentModelType != ModelType::DUMMY_ID); + resolver.Models[currentModelType].push_back(abs(al.InsertSize)); + } + + // unique or not, remove read name from map + resolver.ReadNames.erase(readNameIter); + } + + // if read name not found, store new entry + else + resolver.ReadNames.insert(std::make_pair(al.Name, isCurrentMateUnique)); + } + + // close files + readNamesWriter.Close(); + bamReader.Close(); + + // iterate back through read groups + std::map<std::string, ReadGroupResolver>::iterator 
rgEnd = m_readGroups.end(); + for (rgIter = m_readGroups.begin(); rgIter != rgEnd; ++rgIter) { + const std::string& name = (*rgIter).first; + ReadGroupResolver& resolver = (*rgIter).second; + + // calculate acceptable orientation & insert sizes for this read group + resolver.DetermineTopModels(name); + + // clear out left over read names + // (these have mates that did not pass filters or were already removed as non-unique) + resolver.ReadNames.clear(); + } + + // if we get here, return success + return true; +} + +void ResolveTool::ResolveToolPrivate::ParseHeader(const SamHeader& header) +{ + + // iterate over header read groups, creating a 'resolver' for each + SamReadGroupConstIterator rgIter = header.ReadGroups.ConstBegin(); + SamReadGroupConstIterator rgEnd = header.ReadGroups.ConstEnd(); + for (; rgIter != rgEnd; ++rgIter) { + const SamReadGroup& rg = (*rgIter); + m_readGroups.insert(std::make_pair(rg.ID, ReadGroupResolver())); + } +} + +bool ResolveTool::ResolveToolPrivate::ReadStatsFile() +{ + + // skip if no filename provided + if (m_settings->StatsFilename.empty()) return false; + + // attempt to open stats file + ResolveTool::StatsFileReader statsReader; + if (!statsReader.Open(m_settings->StatsFilename)) { + std::cerr << "bamtools resolve ERROR - could not open stats file: " + << m_settings->StatsFilename << " for reading" << std::endl; + return false; + } + + // attempt to read stats data + if (!statsReader.Read(m_settings, m_readGroups)) { + std::cerr << "bamtools resolve ERROR - could not parse stats file: " + << m_settings->StatsFilename << " for data" << std::endl; + return false; + } + + // return success + return true; +} + +void ResolveTool::ResolveToolPrivate::ResolveAlignment(BamAlignment& al) +{ + + // clear proper-pair flag + al.SetIsProperPair(false); + + // quit check if alignment is not from paired-end read + if (!al.IsPaired()) return; + + // quit check if either alignment or its mate are unmapped + if (!al.IsMapped() || 
!al.IsMateMapped()) return; + + // quit check if alignment & its mate are on differenct references + if (al.RefID != al.MateRefID) return; + + // quit check if map quality less than cutoff + if (al.MapQuality < m_settings->MinimumMapQuality) return; + + // get read group from alignment + // empty string if not found, this is OK - we handle empty read group case + std::string readGroupName; + al.GetTag(READ_GROUP_TAG, readGroupName); + + // look up read group's 'resolver' + std::map<std::string, ReadGroupResolver>::iterator rgIter = m_readGroups.find(readGroupName); + if (rgIter == m_readGroups.end()) { + std::cerr << "bamtools resolve ERROR - read group found that was not in header: " + << readGroupName << std::endl; + std::exit(EXIT_FAILURE); + } + const ReadGroupResolver& resolver = (*rgIter).second; + + // quit check if pairs are not in proper orientation (can differ for each RG) + if (!resolver.IsValidOrientation(al)) return; + + // quit check if pairs are not within "reasonable" distance (can differ for each RG) + if (!resolver.IsValidInsertSize(al)) return; + + // quit check if alignment is not a "candidate proper pair" + std::map<std::string, bool>::const_iterator readNameIter; + readNameIter = resolver.ReadNames.find(al.Name); + if (readNameIter == resolver.ReadNames.end()) return; + + // if we get here, alignment is OK - set 'proper pair' flag + al.SetIsProperPair(true); +} + +bool ResolveTool::ResolveToolPrivate::ResolvePairs() +{ + + // open file containing read names of candidate proper pairs + ResolveTool::ReadNamesFileReader readNamesReader; + if (!readNamesReader.Open(m_settings->ReadNamesFilename)) { + std::cerr << "bamtools resolve ERROR: could not open (temp) inputput read names file: " + << m_settings->ReadNamesFilename << std::endl; + return false; + } + + // parse read names (matching with corresponding read groups) + if (!readNamesReader.Read(m_readGroups)) { + std::cerr << "bamtools resolve ERROR: could not read candidate read names from 
file: " + << m_settings->ReadNamesFilename << std::endl; + readNamesReader.Close(); + return false; + } + + // close read name file reader & delete temp file + readNamesReader.Close(); + if (remove(m_settings->ReadNamesFilename.c_str()) != 0) { + std::cerr << "bamtools resolve WARNING: could not delete temp file: " + << m_settings->ReadNamesFilename << std::endl; + } + + // open our BAM reader + BamReader reader; + if (!reader.Open(m_settings->InputBamFilename)) { + std::cerr << "bamtools resolve ERROR: could not open input BAM file: " + << m_settings->InputBamFilename << std::endl; + return false; + } + + // retrieve header & reference dictionary info + const SamHeader& header = reader.GetHeader(); + const RefVector& references = reader.GetReferenceData(); + + // determine compression mode for BamWriter + bool writeUncompressed = (m_settings->OutputBamFilename == Options::StandardOut() && + !m_settings->IsForceCompression); + BamWriter::CompressionMode compressionMode = BamWriter::Compressed; + if (writeUncompressed) compressionMode = BamWriter::Uncompressed; + + // open BamWriter + BamWriter writer; + writer.SetCompressionMode(compressionMode); + if (!writer.Open(m_settings->OutputBamFilename, header, references)) { + std::cerr << "bamtools resolve ERROR: could not open " << m_settings->OutputBamFilename + << " for writing." 
<< std::endl; + reader.Close(); + return false; + } + + // plow through alignments, setting/clearing 'proper pair' flag + // and writing to new output BAM file + BamAlignment al; + while (reader.GetNextAlignment(al)) { + ResolveAlignment(al); + writer.SaveAlignment(al); + } + + // clean up & return success + reader.Close(); + writer.Close(); + return true; +} + +bool ResolveTool::ResolveToolPrivate::Run() +{ + + // verify that command line settings are acceptable + std::vector<std::string> errors; + if (!CheckSettings(errors)) { + std::cerr << "bamtools resolve ERROR - invalid settings: " << std::endl; + std::vector<std::string>::const_iterator errorIter = errors.begin(); + std::vector<std::string>::const_iterator errorEnd = errors.end(); + for (; errorIter != errorEnd; ++errorIter) + std::cerr << (*errorIter) << std::endl; + return false; + } + + // initialize read group map with default (empty name) read group + m_readGroups.insert(std::make_pair(std::string(), ReadGroupResolver())); + + // init readname filename + // uses (adjusted) stats filename if provided (req'd for makeStats, markPairs modes; optional for twoPass) + // else keep default filename + if (m_settings->HasStatsFile) + m_settings->ReadNamesFilename = m_settings->StatsFilename + READNAME_FILE_SUFFIX; + + // -makeStats mode + if (m_settings->IsMakeStats) { + + // generate stats data + if (!MakeStats()) { + std::cerr << "bamtools resolve ERROR - could not generate stats" << std::endl; + return false; + } + + // write stats to file + if (!WriteStatsFile()) { + std::cerr << "bamtools resolve ERROR - could not write stats file: " + << m_settings->StatsFilename << std::endl; + return false; + } + } + + // -markPairs mode + else if (m_settings->IsMarkPairs) { + + // read stats from file + if (!ReadStatsFile()) { + std::cerr << "bamtools resolve ERROR - could not read stats file: " + << m_settings->StatsFilename << std::endl; + return false; + } + + // do paired-end resolution + if (!ResolvePairs()) { + 
std::cerr << "bamtools resolve ERROR - could not resolve pairs" << std::endl; + return false; + } + } + + // -twoPass mode + else { + + // generate stats data + if (!MakeStats()) { + std::cerr << "bamtools resolve ERROR - could not generate stats" << std::endl; + return false; + } + + // if stats file requested + if (m_settings->HasStatsFile) { + + // write stats to file + // emit warning if write fails, but paired-end resolution should be allowed to proceed + if (!WriteStatsFile()) + std::cerr << "bamtools resolve WARNING - could not write stats file: " + << m_settings->StatsFilename << std::endl; + } + + // do paired-end resolution + if (!ResolvePairs()) { + std::cerr << "bamtools resolve ERROR - could not resolve pairs" << std::endl; + return false; + } + } + + // return success + return true; +} + +bool ResolveTool::ResolveToolPrivate::WriteStatsFile() +{ + + // skip if no filename provided + if (m_settings->StatsFilename.empty()) return false; + + // attempt to open stats file + ResolveTool::StatsFileWriter statsWriter; + if (!statsWriter.Open(m_settings->StatsFilename)) { + std::cerr << "bamtools resolve ERROR - could not open stats file: " + << m_settings->StatsFilename << " for writing" << std::endl; + return false; + } + + // attempt to write stats data + if (!statsWriter.Write(m_settings, m_readGroups)) { + std::cerr << "bamtools resolve ERROR - could not write stats file: " + << m_settings->StatsFilename << " for data" << std::endl; + return false; + } + + // return success + return true; +} + +// -------------------------------------------------------------------------- +// ResolveTool implementation + +ResolveTool::ResolveTool() + : AbstractTool() + , m_settings(new ResolveSettings) + , m_impl(0) +{ + // set description texts + const std::string programDescription = + "resolves paired-end reads (marking the IsProperPair flag as needed)"; + const std::string programUsage = + "<mode> [options] [-in <filename>] [-out <filename> | [-forceCompression] ] 
[-stats " + "<filename>]"; + const std::string inputBamDescription = "the input BAM file(s)"; + const std::string outputBamDescription = "the output BAM file"; + const std::string statsFileDescription = + "input/output stats file, depending on selected mode (see below). " + "This file is human-readable, storing fragment length data generated per read group, as " + "well as " + "the options used to configure the -makeStats mode"; + const std::string forceCompressionDescription = + "if results are sent to stdout (like when piping to another tool), " + "default behavior is to leave output uncompressed." + "Use this flag to override and force compression. This feature is disabled in -makeStats " + "mode."; + const std::string makeStatsDescription = + "generates a fragment-length stats file from the input BAM. " + "Data is written to file specified using the -stats option. " + "MarkPairs Mode Settings are DISABLED."; + const std::string markPairsDescription = + "generates an output BAM with alignments marked with proper-pair status. " + "Stats data is read from file specified using the -stats option. " + "MakeStats Mode Settings are DISABLED"; + const std::string twoPassDescription = + "combines the -makeStats & -markPairs modes into a single command. " + "However, due to the two-pass nature of paired-end resolution, piping BAM data via stdin " + "is DISABLED. " + "You must supply an explicit input BAM file. Output BAM may be piped to stdout, however, " + "if desired. " + "All MakeStats & MarkPairs Mode Settings are available. " + "The intermediate stats file is not necessary, but if the -stats options is used, then one " + "will be generated. " + "You may find this useful for documentation purposes."; + const std::string minMapQualDescription = + "minimum map quality. Used in -makeStats mode as a heuristic for determining a mate's " + "uniqueness. 
Used in -markPairs mode as a filter for marking candidate proper pairs."; + const std::string confidenceIntervalDescription = + "confidence interval. Set min/max fragment lengths such that we capture " + "this fraction of pairs"; + const std::string unusedModelThresholdDescription = + "unused model threshold. The resolve tool considers 8 possible orientation models " + "for pairs. The top 2 are selected for later use when actually marking alignments. This " + "value determines the " + "cutoff for marking a read group as ambiguous. Meaning that if the ratio of the number of " + "alignments from bottom 6 models " + "to the top 2 is greater than this threshold, then the read group is flagged as ambiguous. " + "By default, NO alignments " + "from ambiguous read groups will be marked as proper pairs. You may override this behavior " + "with the -force option " + "in -markPairs mode"; + const std::string forceMarkDescription = + "forces all read groups to be marked according to their top 2 'orientation models'. " + "When generating stats, the 2 (out of 8 possible) models with the most observations are " + "chosen as the top models for each read group. " + "If the remaining 6 models account for more than some threshold ([default=10%], see -umt), " + "then the read group is marked as ambiguous. " + "The default behavior is that for an ambiguous read group, NONE of its alignments are " + "marked as proper-pairs. 
" + "By setting this option, a read group's ambiguity flag will be ignored, and all of its " + "alignments will be compared to the top 2 models."; + + // set program details + Options::SetProgramInfo("bamtools resolve", programDescription, programUsage); + + // set up I/O options + OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); + Options::AddValueOption("-in", "BAM filename", inputBamDescription, "", + m_settings->HasInputBamFile, m_settings->InputBamFilename, IO_Opts, + Options::StandardIn()); + Options::AddValueOption("-out", "BAM filename", outputBamDescription, "", + m_settings->HasOutputBamFile, m_settings->OutputBamFilename, IO_Opts, + Options::StandardOut()); + Options::AddValueOption("-stats", "STATS filename", statsFileDescription, "", + m_settings->HasStatsFile, m_settings->StatsFilename, IO_Opts); + Options::AddOption("-forceCompression", forceCompressionDescription, + m_settings->IsForceCompression, IO_Opts); + + OptionGroup* ModeOpts = + Options::CreateOptionGroup("Resolve Modes (must select ONE of the following)"); + Options::AddOption("-makeStats", makeStatsDescription, m_settings->IsMakeStats, ModeOpts); + Options::AddOption("-markPairs", markPairsDescription, m_settings->IsMarkPairs, ModeOpts); + Options::AddOption("-twoPass", twoPassDescription, m_settings->IsTwoPass, ModeOpts); + + OptionGroup* GeneralOpts = + Options::CreateOptionGroup("General Resolve Options (available in all modes)"); + Options::AddValueOption("-minMQ", "unsigned short", minMapQualDescription, "", + m_settings->HasMinimumMapQuality, m_settings->MinimumMapQuality, + GeneralOpts); + + OptionGroup* MakeStatsOpts = + Options::CreateOptionGroup("MakeStats Mode Options (disabled in -markPairs mode)"); + Options::AddValueOption("-ci", "double", confidenceIntervalDescription, "", + m_settings->HasConfidenceInterval, m_settings->ConfidenceInterval, + MakeStatsOpts); + Options::AddValueOption("-umt", "double", unusedModelThresholdDescription, "", + 
m_settings->HasUnusedModelThreshold, m_settings->UnusedModelThreshold, + MakeStatsOpts); + + OptionGroup* MarkPairsOpts = + Options::CreateOptionGroup("MarkPairs Mode Options (disabled in -makeStats mode)"); + Options::AddOption("-force", forceMarkDescription, m_settings->HasForceMarkReadGroups, + MarkPairsOpts); +} + +ResolveTool::~ResolveTool() +{ + + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int ResolveTool::Help() +{ + Options::DisplayHelp(); + return 0; +} + +int ResolveTool::Run(int argc, char* argv[]) +{ + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // initialize ResolveTool + m_impl = new ResolveToolPrivate(m_settings); + + // run ResolveTool, return success/failure + if (m_impl->Run()) + return 0; + else + return 1; +} diff --git a/src/toolkit/bamtools_resolve.h b/src/toolkit/bamtools_resolve.h new file mode 100644 index 0000000..26a902f --- /dev/null +++ b/src/toolkit/bamtools_resolve.h @@ -0,0 +1,43 @@ +// *************************************************************************** +// bamtools_resolve.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 23 June 2011 +// --------------------------------------------------------------------------- +// Resolves paired-end reads (marking the IsProperPair flag as needed). 
+// *************************************************************************** + +#ifndef BAMTOOLS_RESOLVE_H +#define BAMTOOLS_RESOLVE_H + +#include "bamtools_tool.h" + +namespace BamTools { + +class ResolveTool : public AbstractTool +{ + +public: + ResolveTool(); + ~ResolveTool(); + +public: + int Help(); + int Run(int argc, char* argv[]); + +private: + struct ResolveSettings; + ResolveSettings* m_settings; + + struct ResolveToolPrivate; + ResolveToolPrivate* m_impl; + + struct ReadNamesFileReader; + struct ReadNamesFileWriter; + struct StatsFileReader; + struct StatsFileWriter; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_RESOLVE_H diff --git a/src/toolkit/bamtools_revert.cpp b/src/toolkit/bamtools_revert.cpp new file mode 100644 index 0000000..bdc8afc --- /dev/null +++ b/src/toolkit/bamtools_revert.cpp @@ -0,0 +1,212 @@ +// *************************************************************************** +// bamtools_revert.cpp (c) 2010 Derek Barnett, Alistair Ward +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 7 April 2011 +// --------------------------------------------------------------------------- +// Removes duplicate marks and restores original base qualities +// *************************************************************************** + +#include "bamtools_revert.h" + +#include <api/BamReader.h> +#include <api/BamWriter.h> +#include <utils/bamtools_options.h> +#include <utils/bamtools_utilities.h> +using namespace BamTools; + +#include <iostream> +#include <string> + +namespace BamTools { + +static const std::string OQ_TAG = "OQ"; + +} // namespace BamTools + +// --------------------------------------------- +// RevertSettings implementation + +struct RevertTool::RevertSettings +{ + + // flags + bool HasInput; + bool HasOutput; + bool IsForceCompression; + bool IsKeepDuplicateFlag; + bool IsKeepQualities; + + // filenames + std::string 
InputFilename; + std::string OutputFilename; + + // constructor + RevertSettings() + : HasInput(false) + , HasOutput(false) + , IsForceCompression(false) + , IsKeepDuplicateFlag(false) + , IsKeepQualities(false) + , InputFilename(Options::StandardIn()) + , OutputFilename(Options::StandardOut()) + {} +}; + +// --------------------------------------------- +// RevertToolPrivate implementation + +struct RevertTool::RevertToolPrivate +{ + + // ctor & dtor +public: + RevertToolPrivate(RevertTool::RevertSettings* settings) + : m_settings(settings) + {} + ~RevertToolPrivate() {} + + // 'public' interface +public: + bool Run(); + + // internal methods +private: + void RevertAlignment(BamAlignment& al); + + // data members +private: + RevertTool::RevertSettings* m_settings; +}; + +// 'reverts' a BAM alignment +// default behavior (for now) is: +// 1 - replace Qualities with OQ contents +// 2 - clear IsDuplicate flag +// can override default behavior using command line options +void RevertTool::RevertToolPrivate::RevertAlignment(BamAlignment& al) +{ + + // replace Qualities with OQ contents, if requested + if (!m_settings->IsKeepQualities) { + std::string originalQualities; + if (al.GetTag(OQ_TAG, originalQualities)) { + al.Qualities = originalQualities; + al.RemoveTag(OQ_TAG); + } + } + + // clear duplicate flag, if requested + if (!m_settings->IsKeepDuplicateFlag) al.SetIsDuplicate(false); +} + +bool RevertTool::RevertToolPrivate::Run() +{ + + // opens the BAM file without checking for indexes + BamReader reader; + if (!reader.Open(m_settings->InputFilename)) { + std::cerr << "bamtools revert ERROR: could not open " << m_settings->InputFilename + << " for reading... Aborting." 
<< std::endl; + return false; + } + + // get BAM file metadata + const std::string& headerText = reader.GetHeaderText(); + const RefVector& references = reader.GetReferenceData(); + + // determine compression mode for BamWriter + bool writeUncompressed = + (m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression); + BamWriter::CompressionMode compressionMode = BamWriter::Compressed; + if (writeUncompressed) compressionMode = BamWriter::Uncompressed; + + // open BamWriter + BamWriter writer; + writer.SetCompressionMode(compressionMode); + if (!writer.Open(m_settings->OutputFilename, headerText, references)) { + std::cerr << "bamtools revert ERROR: could not open " << m_settings->OutputFilename + << " for writing... Aborting." << std::endl; + reader.Close(); + return false; + } + + // plow through file, reverting alignments + BamAlignment al; + while (reader.GetNextAlignment(al)) { + RevertAlignment(al); + writer.SaveAlignment(al); + } + + // clean and exit + reader.Close(); + writer.Close(); + return true; +} + +// --------------------------------------------- +// RevertTool implementation + +RevertTool::RevertTool() + : AbstractTool() + , m_settings(new RevertSettings) + , m_impl(0) +{ + // set program details + Options::SetProgramInfo( + "bamtools revert", + "removes duplicate marks and restores original (non-recalibrated) base qualities", + "[-in <filename> -in <filename> ...] 
[-out <filename> | [-forceCompression]] " + "[revertOptions]"); + + // set up options + OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); + Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, + m_settings->InputFilename, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", + m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, + Options::StandardOut()); + Options::AddOption("-forceCompression", + "if results are sent to stdout (like when piping to another tool), default " + "behavior is to leave output uncompressed. Use this flag to override and " + "force compression", + m_settings->IsForceCompression, IO_Opts); + + OptionGroup* RevertOpts = Options::CreateOptionGroup("Revert Options"); + Options::AddOption("-keepDuplicate", "keep duplicates marked", m_settings->IsKeepDuplicateFlag, + RevertOpts); + Options::AddOption("-keepQualities", "keep base qualities (do not replace with OQ contents)", + m_settings->IsKeepQualities, RevertOpts); +} + +RevertTool::~RevertTool() +{ + + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int RevertTool::Help() +{ + Options::DisplayHelp(); + return 0; +} + +int RevertTool::Run(int argc, char* argv[]) +{ + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // intialize RevertTool with settings + m_impl = new RevertToolPrivate(m_settings); + + // run RevertTool, return success/fail + if (m_impl->Run()) + return 0; + else + return 1; +} diff --git a/src/toolkit/bamtools_revert.h b/src/toolkit/bamtools_revert.h new file mode 100644 index 0000000..8e44fe3 --- /dev/null +++ b/src/toolkit/bamtools_revert.h @@ -0,0 +1,38 @@ +// *************************************************************************** +// bamtools_revert.h (c) 2010 Derek Barnett, Alistair Ward +// Marth Lab, Department of Biology, Boston College +// 
--------------------------------------------------------------------------- +// Last modified: 7 April 2011 +// --------------------------------------------------------------------------- +// Removes duplicate marks and restores original base qualities +// *************************************************************************** + +#ifndef BAMTOOLS_REVERT_H +#define BAMTOOLS_REVERT_H + +#include "bamtools_tool.h" + +namespace BamTools { + +class RevertTool : public AbstractTool +{ + +public: + RevertTool(); + ~RevertTool(); + +public: + int Help(); + int Run(int argc, char* argv[]); + +private: + struct RevertSettings; + RevertSettings* m_settings; + + struct RevertToolPrivate; + RevertToolPrivate* m_impl; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_REVERT_H diff --git a/src/toolkit/bamtools_sort.cpp b/src/toolkit/bamtools_sort.cpp new file mode 100644 index 0000000..6c52f16 --- /dev/null +++ b/src/toolkit/bamtools_sort.cpp @@ -0,0 +1,381 @@ +// *************************************************************************** +// bamtools_sort.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 27 March 2012 (DB) +// --------------------------------------------------------------------------- +// Sorts an input BAM file +// *************************************************************************** + +#include "bamtools_sort.h" + +#include <api/BamMultiReader.h> +#include <api/BamWriter.h> +#include <api/SamConstants.h> +#include <api/algorithms/Sort.h> +#include <utils/bamtools_options.h> +using namespace BamTools; +using namespace BamTools::Algorithms; + +#include <algorithm> +#include <cstddef> +#include <cstdio> +#include <iostream> +#include <sstream> +#include <string> +#include <vector> + +namespace BamTools { + +// defaults +// +// ** These defaults should be tweaked & 'optimized' per testing ** // +// +// I say 
'optimized' because each system will naturally perform +// differently. We will attempt to determine a sensible +// compromise that should perform well on average. +const unsigned int SORT_DEFAULT_MAX_BUFFER_COUNT = 500000; // max numberOfAlignments for buffer +const unsigned int SORT_DEFAULT_MAX_BUFFER_MEMORY = 1024; // Mb + +} // namespace BamTools + +// --------------------------------------------- +// SortSettings implementation + +struct SortTool::SortSettings +{ + + // flags + bool HasInputBamFilename; + bool HasMaxBufferCount; + bool HasMaxBufferMemory; + bool HasOutputBamFilename; + bool IsSortingByName; + + // filenames + std::string InputBamFilename; + std::string OutputBamFilename; + + // parameters + unsigned int MaxBufferCount; + unsigned int MaxBufferMemory; + + // constructor + SortSettings() + : HasInputBamFilename(false) + , HasMaxBufferCount(false) + , HasMaxBufferMemory(false) + , HasOutputBamFilename(false) + , IsSortingByName(false) + , InputBamFilename(Options::StandardIn()) + , OutputBamFilename(Options::StandardOut()) + , MaxBufferCount(SORT_DEFAULT_MAX_BUFFER_COUNT) + , MaxBufferMemory(SORT_DEFAULT_MAX_BUFFER_MEMORY) + {} +}; + +// --------------------------------------------- +// SortToolPrivate implementation + +class SortTool::SortToolPrivate +{ + + // ctor & dtor +public: + SortToolPrivate(SortTool::SortSettings* settings); + ~SortToolPrivate() {} + + // 'public' interface +public: + bool Run(); + + // internal methods +private: + bool CreateSortedTempFile(std::vector<BamAlignment>& buffer); + bool GenerateSortedRuns(); + bool MergeSortedRuns(); + bool WriteTempFile(const std::vector<BamAlignment>& buffer, const std::string& tempFilename); + void SortBuffer(std::vector<BamAlignment>& buffer); + + // data members +private: + SortTool::SortSettings* m_settings; + std::string m_tempFilenameStub; + int m_numberOfRuns; + std::string m_headerText; + RefVector m_references; + std::vector<std::string> m_tempFilenames; +}; + +// constructor 
+SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings) + : m_settings(settings) + , m_numberOfRuns(0) +{ + // set filename stub depending on inputfile path + // that way multiple sort runs don't trip on each other's temp files + if (m_settings) { + std::size_t extensionFound = m_settings->InputBamFilename.find(".bam"); + if (extensionFound != std::string::npos) + m_tempFilenameStub = m_settings->InputBamFilename.substr(0, extensionFound); + m_tempFilenameStub.append(".sort.temp."); + } +} + +// generates mutiple sorted temp BAM files from single unsorted BAM file +bool SortTool::SortToolPrivate::GenerateSortedRuns() +{ + + // open input BAM file + BamReader reader; + if (!reader.Open(m_settings->InputBamFilename)) { + std::cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename + << " for reading... Aborting." << std::endl; + return false; + } + + // get basic data that will be shared by all temp/output files + SamHeader header = reader.GetHeader(); + if (!header.HasVersion()) header.Version = Constants::SAM_CURRENT_VERSION; + header.SortOrder = (m_settings->IsSortingByName ? 
Constants::SAM_HD_SORTORDER_QUERYNAME + : Constants::SAM_HD_SORTORDER_COORDINATE); + m_headerText = header.ToString(); + m_references = reader.GetReferenceData(); + + // set up alignments buffer + BamAlignment al; + std::vector<BamAlignment> buffer; + buffer.reserve(static_cast<std::size_t>(m_settings->MaxBufferCount * 1.1)); + bool bufferFull = false; + + // if sorting by name, we need to generate full char data + // so can't use GetNextAlignmentCore() + if (m_settings->IsSortingByName) { + + // iterate through file + while (reader.GetNextAlignment(al)) { + + // check buffer's usage + bufferFull = (buffer.size() >= m_settings->MaxBufferCount); + + // store alignments until buffer is "full" + if (!bufferFull) buffer.push_back(al); + + // if buffer is "full" + else { + // so create a sorted temp file with current buffer contents + // then push "al" into fresh buffer + CreateSortedTempFile(buffer); + buffer.push_back(al); + } + } + } + + // sorting by position, can take advantage of GNACore() speedup + else { + + // iterate through file + while (reader.GetNextAlignmentCore(al)) { + + // check buffer's usage + bufferFull = (buffer.size() >= m_settings->MaxBufferCount); + + // store alignments until buffer is "full" + if (!bufferFull) buffer.push_back(al); + + // if buffer is "full" + else { + // create a sorted temp file with current buffer contents + // then push "al" into fresh buffer + CreateSortedTempFile(buffer); + buffer.push_back(al); + } + } + } + + // handle any leftover buffer contents + if (!buffer.empty()) CreateSortedTempFile(buffer); + + // close reader & return success + reader.Close(); + return true; +} + +bool SortTool::SortToolPrivate::CreateSortedTempFile(std::vector<BamAlignment>& buffer) +{ + + // do sorting + SortBuffer(buffer); + + // write sorted contents to temp file, store success/fail + std::stringstream tempStr; + tempStr << m_tempFilenameStub << m_numberOfRuns; + bool success = WriteTempFile(buffer, tempStr.str()); + + // save temp 
filename for merging later + m_tempFilenames.push_back(tempStr.str()); + + // clear buffer contents & update run counter + buffer.clear(); + ++m_numberOfRuns; + + // return success/fail of writing to temp file + // TODO: a failure returned here is not actually caught and handled anywhere + return success; +} + +// merges sorted temp BAM files into single sorted output BAM file +bool SortTool::SortToolPrivate::MergeSortedRuns() +{ + + // open up multi reader for all of our temp files + // this might get broken up if we do a multi-pass system later ?? + BamMultiReader multiReader; + if (!multiReader.Open(m_tempFilenames)) { + std::cerr << "bamtools sort ERROR: could not open BamMultiReader for merging temp files... " + "Aborting." + << std::endl; + return false; + } + + // open writer for our completely sorted output BAM file + BamWriter mergedWriter; + if (!mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references)) { + std::cerr << "bamtools sort ERROR: could not open " << m_settings->OutputBamFilename + << " for writing... Aborting." 
<< std::endl; + multiReader.Close(); + return false; + } + + // while data available in temp files + BamAlignment al; + while (multiReader.GetNextAlignmentCore(al)) + mergedWriter.SaveAlignment(al); + + // close files + multiReader.Close(); + mergedWriter.Close(); + + // delete all temp files + std::vector<std::string>::const_iterator tempIter = m_tempFilenames.begin(); + std::vector<std::string>::const_iterator tempEnd = m_tempFilenames.end(); + for (; tempIter != tempEnd; ++tempIter) { + const std::string& tempFilename = (*tempIter); + remove(tempFilename.c_str()); + } + + // return success + return true; +} + +bool SortTool::SortToolPrivate::Run() +{ + + // this does a single pass, chunking up the input file into smaller sorted temp files, + // then write out using BamMultiReader to handle merging + + if (GenerateSortedRuns()) + return MergeSortedRuns(); + else + return false; +} + +void SortTool::SortToolPrivate::SortBuffer(std::vector<BamAlignment>& buffer) +{ + + // ** add further custom sort options later ?? ** + + // sort buffer by desired method + if (m_settings->IsSortingByName) + std::stable_sort(buffer.begin(), buffer.end(), Sort::ByName()); + else + std::stable_sort(buffer.begin(), buffer.end(), Sort::ByPosition()); +} + +bool SortTool::SortToolPrivate::WriteTempFile(const std::vector<BamAlignment>& buffer, + const std::string& tempFilename) +{ + // open temp file for writing + BamWriter tempWriter; + if (!tempWriter.Open(tempFilename, m_headerText, m_references)) { + std::cerr << "bamtools sort ERROR: could not open " << tempFilename << " for writing." 
+ << std::endl; + return false; + } + + // write data + std::vector<BamAlignment>::const_iterator buffIter = buffer.begin(); + std::vector<BamAlignment>::const_iterator buffEnd = buffer.end(); + for (; buffIter != buffEnd; ++buffIter) { + const BamAlignment& al = (*buffIter); + tempWriter.SaveAlignment(al); + } + + // close temp file & return success + tempWriter.Close(); + return true; +} + +// --------------------------------------------- +// SortTool implementation + +SortTool::SortTool() + : AbstractTool() + , m_settings(new SortSettings) + , m_impl(0) +{ + // set program details + Options::SetProgramInfo("bamtools sort", "sorts a BAM file", + "[-in <filename>] [-out <filename>] [sortOptions]"); + + // set up options + OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); + Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", + m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, + Options::StandardIn()); + Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", + m_settings->HasOutputBamFilename, m_settings->OutputBamFilename, + IO_Opts, Options::StandardOut()); + + OptionGroup* SortOpts = Options::CreateOptionGroup("Sorting Methods"); + Options::AddOption("-byname", "sort by alignment name", m_settings->IsSortingByName, SortOpts); + + OptionGroup* MemOpts = Options::CreateOptionGroup("Memory Settings"); + Options::AddValueOption("-n", "count", "max number of alignments per tempfile", "", + m_settings->HasMaxBufferCount, m_settings->MaxBufferCount, MemOpts, + SORT_DEFAULT_MAX_BUFFER_COUNT); + Options::AddValueOption("-mem", "Mb", "max memory to use", "", m_settings->HasMaxBufferMemory, + m_settings->MaxBufferMemory, MemOpts, SORT_DEFAULT_MAX_BUFFER_MEMORY); +} + +SortTool::~SortTool() +{ + + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int SortTool::Help() +{ + Options::DisplayHelp(); + return 0; +} + +int SortTool::Run(int argc, char* argv[]) +{ + + // 
parse command line arguments + Options::Parse(argc, argv, 1); + + // initialize SortTool with settings + m_impl = new SortToolPrivate(m_settings); + + // run SortTool, return success/fail + if (m_impl->Run()) + return 0; + else + return 1; +} diff --git a/src/toolkit/bamtools_sort.h b/src/toolkit/bamtools_sort.h new file mode 100644 index 0000000..2ceb12a --- /dev/null +++ b/src/toolkit/bamtools_sort.h @@ -0,0 +1,38 @@ +// *************************************************************************** +// bamtools_sort.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 7 April 2011 (DB) +// --------------------------------------------------------------------------- +// Sorts a BAM file +// *************************************************************************** + +#ifndef BAMTOOLS_SORT_H +#define BAMTOOLS_SORT_H + +#include "bamtools_tool.h" + +namespace BamTools { + +class SortTool : public AbstractTool +{ + +public: + SortTool(); + ~SortTool(); + +public: + int Help(); + int Run(int argc, char* argv[]); + +private: + struct SortSettings; + SortSettings* m_settings; + + class SortToolPrivate; + SortToolPrivate* m_impl; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_SORT_H diff --git a/src/toolkit/bamtools_split.cpp b/src/toolkit/bamtools_split.cpp new file mode 100644 index 0000000..f303b49 --- /dev/null +++ b/src/toolkit/bamtools_split.cpp @@ -0,0 +1,750 @@ +// *************************************************************************** +// bamtools_split.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 24 July 2013 (DB) +// --------------------------------------------------------------------------- +// Splits a BAM file on user-specified property, creating a new BAM output +// 
file for each value found +// *************************************************************************** + +#include "bamtools_split.h" + +#include <api/BamConstants.h> +#include <api/BamReader.h> +#include <api/BamWriter.h> +#include <utils/bamtools_options.h> +#include <utils/bamtools_variant.h> +using namespace BamTools; + +#include <cstddef> +#include <ctime> +#include <iostream> +#include <map> +#include <sstream> +#include <string> +#include <vector> + +namespace BamTools { + +// string constants +static const std::string SPLIT_MAPPED_TOKEN = ".MAPPED"; +static const std::string SPLIT_UNMAPPED_TOKEN = ".UNMAPPED"; +static const std::string SPLIT_PAIRED_TOKEN = ".PAIRED_END"; +static const std::string SPLIT_SINGLE_TOKEN = ".SINGLE_END"; +static const std::string SPLIT_REFERENCE_TOKEN = ".REF_"; +static const std::string SPLIT_TAG_TOKEN = ".TAG_"; + +std::string GetTimestampString() +{ + + // get human readable timestamp + time_t currentTime; + time(&currentTime); + std::stringstream timeStream; + timeStream << ctime(&currentTime); + + // convert whitespace to '_' + std::string timeString = timeStream.str(); + std::size_t found = timeString.find(' '); + while (found != std::string::npos) { + timeString.replace(found, 1, 1, '_'); + found = timeString.find(' ', found + 1); + } + return timeString; +} + +// remove copy of filename without extension +// (so /path/to/file.txt becomes /path/to/file ) +std::string RemoveFilenameExtension(const std::string& filename) +{ + std::size_t found = filename.rfind('.'); + return filename.substr(0, found); +} + +} // namespace BamTools + +// --------------------------------------------- +// SplitSettings implementation + +struct SplitTool::SplitSettings +{ + + // flags + bool HasInputFilename; + bool HasCustomOutputStub; + bool HasCustomRefPrefix; + bool HasCustomTagPrefix; + bool HasListTagDelimiter; + bool IsSplittingMapped; + bool IsSplittingPaired; + bool IsSplittingReference; + bool IsSplittingTag; + + // string args + std::string 
CustomOutputStub; + std::string CustomRefPrefix; + std::string CustomTagPrefix; + std::string InputFilename; + std::string TagToSplit; + std::string ListTagDelimiter; + + // constructor + SplitSettings() + : HasInputFilename(false) + , HasCustomOutputStub(false) + , HasCustomRefPrefix(false) + , HasCustomTagPrefix(false) + , HasListTagDelimiter(false) + , IsSplittingMapped(false) + , IsSplittingPaired(false) + , IsSplittingReference(false) + , IsSplittingTag(false) + , InputFilename(Options::StandardIn()) + , ListTagDelimiter("--") + {} +}; + +// --------------------------------------------- +// SplitToolPrivate declaration + +class SplitTool::SplitToolPrivate +{ + + // ctor & dtor +public: + SplitToolPrivate(SplitTool::SplitSettings* settings) + : m_settings(settings) + {} + + ~SplitToolPrivate() + { + m_reader.Close(); + } + + // 'public' interface +public: + bool Run(); + + // internal methods +private: + // close & delete BamWriters in map + template <typename T> + void CloseWriters(std::map<T, BamWriter*>& writers); + // calculate output stub based on IO args given + void DetermineOutputFilenameStub(); + // open our BamReader + bool OpenReader(); + // split alignments in BAM file based on isMapped property + bool SplitMapped(); + // split alignments in BAM file based on isPaired property + bool SplitPaired(); + // split alignments in BAM file based on refID property + bool SplitReference(); + // finds first alignment and calls corresponding SplitTagImpl<> + // depending on tag type + bool SplitTag(); + +public: + // handles list-type tags + template <typename T> + bool SplitListTagImpl(BamAlignment& al); + + // handles single-value tags + template <typename T> + bool SplitTagImpl(BamAlignment& al); + + // data members +private: + SplitTool::SplitSettings* m_settings; + std::string m_outputFilenameStub; + BamReader m_reader; + std::string m_header; + RefVector m_references; +}; + +void SplitTool::SplitToolPrivate::DetermineOutputFilenameStub() +{ + + // if user 
supplied output filename stub, use that + if (m_settings->HasCustomOutputStub) m_outputFilenameStub = m_settings->CustomOutputStub; + + // else if user supplied input BAM filename, use that (minus ".bam" extension) as stub + else if (m_settings->HasInputFilename) + m_outputFilenameStub = RemoveFilenameExtension(m_settings->InputFilename); + + // otherwise, user did not specify -stub, and input is coming from STDIN + // generate stub from timestamp + else + m_outputFilenameStub = GetTimestampString(); +} + +bool SplitTool::SplitToolPrivate::OpenReader() +{ + + // attempt to open BAM file + if (!m_reader.Open(m_settings->InputFilename)) { + std::cerr << "bamtools split ERROR: could not open BAM file: " << m_settings->InputFilename + << std::endl; + return false; + } + + // save file 'metadata' & return success + m_header = m_reader.GetHeaderText(); + m_references = m_reader.GetReferenceData(); + return true; +} + +bool SplitTool::SplitToolPrivate::Run() +{ + + // determine output stub + DetermineOutputFilenameStub(); + + // open up BamReader + if (!OpenReader()) return false; + + // determine split type from settings + if (m_settings->IsSplittingMapped) return SplitMapped(); + if (m_settings->IsSplittingPaired) return SplitPaired(); + if (m_settings->IsSplittingReference) return SplitReference(); + if (m_settings->IsSplittingTag) return SplitTag(); + + // if we get here, no property was specified + std::cerr + << "bamtools split ERROR: no property given to split on... " << std::endl + << "Please use -mapped, -paired, -reference, or -tag TAG to specify desired split behavior." 
+ << std::endl; + return false; +} + +bool SplitTool::SplitToolPrivate::SplitMapped() +{ + + // set up splitting data structure + std::map<bool, BamWriter*> outputFiles; + std::map<bool, BamWriter*>::iterator writerIter; + + // iterate through alignments + BamAlignment al; + BamWriter* writer; + bool isCurrentAlignmentMapped; + while (m_reader.GetNextAlignment(al)) { + + // see if bool value exists + isCurrentAlignmentMapped = al.IsMapped(); + writerIter = outputFiles.find(isCurrentAlignmentMapped); + + // if no writer associated with this value + if (writerIter == outputFiles.end()) { + + // open new BamWriter + const std::string outputFilename = + m_outputFilenameStub + + (isCurrentAlignmentMapped ? SPLIT_MAPPED_TOKEN : SPLIT_UNMAPPED_TOKEN) + ".bam"; + writer = new BamWriter; + if (!writer->Open(outputFilename, m_header, m_references)) { + std::cerr << "bamtools split ERROR: could not open " << outputFilename + << " for writing." << std::endl; + return false; + } + + // store in map + outputFiles.insert(std::make_pair(isCurrentAlignmentMapped, writer)); + } + + // else grab corresponding writer + else + writer = (*writerIter).second; + + // store alignment in proper BAM output file + if (writer) writer->SaveAlignment(al); + } + + // clean up BamWriters + CloseWriters(outputFiles); + + // return success + return true; +} + +bool SplitTool::SplitToolPrivate::SplitPaired() +{ + + // set up splitting data structure + std::map<bool, BamWriter*> outputFiles; + std::map<bool, BamWriter*>::iterator writerIter; + + // iterate through alignments + BamAlignment al; + BamWriter* writer; + bool isCurrentAlignmentPaired; + while (m_reader.GetNextAlignment(al)) { + + // see if bool value exists + isCurrentAlignmentPaired = al.IsPaired(); + writerIter = outputFiles.find(isCurrentAlignmentPaired); + + // if no writer associated with this value + if (writerIter == outputFiles.end()) { + + // open new BamWriter + const std::string outputFilename = + m_outputFilenameStub + + 
(isCurrentAlignmentPaired ? SPLIT_PAIRED_TOKEN : SPLIT_SINGLE_TOKEN) + ".bam"; + writer = new BamWriter; + if (!writer->Open(outputFilename, m_header, m_references)) { + std::cerr << "bamtool split ERROR: could not open " << outputFilename + << " for writing." << std::endl; + return false; + } + + // store in map + outputFiles.insert(std::make_pair(isCurrentAlignmentPaired, writer)); + } + + // else grab corresponding writer + else + writer = (*writerIter).second; + + // store alignment in proper BAM output file + if (writer) writer->SaveAlignment(al); + } + + // clean up BamWriters + CloseWriters(outputFiles); + + // return success + return true; +} + +bool SplitTool::SplitToolPrivate::SplitReference() +{ + + // set up splitting data structure + std::map<int32_t, BamWriter*> outputFiles; + std::map<int32_t, BamWriter*>::iterator writerIter; + + // determine reference prefix + std::string refPrefix = SPLIT_REFERENCE_TOKEN; + if (m_settings->HasCustomRefPrefix) refPrefix = m_settings->CustomRefPrefix; + + // make sure prefix starts with '.' 
+ const std::size_t dotFound = refPrefix.find('.'); + if (dotFound != 0) refPrefix = std::string(1, '.') + refPrefix; + + // iterate through alignments + BamAlignment al; + BamWriter* writer; + int32_t currentRefId; + while (m_reader.GetNextAlignment(al)) { + + // see if bool value exists + currentRefId = al.RefID; + writerIter = outputFiles.find(currentRefId); + + // if no writer associated with this value + if (writerIter == outputFiles.end()) { + + // fetch reference name for ID + std::string refName; + if (currentRefId == -1) + refName = "unmapped"; + else + refName = m_references.at(currentRefId).RefName; + + // construct new output filename + const std::string outputFilename = m_outputFilenameStub + refPrefix + refName + ".bam"; + + // open new BamWriter + writer = new BamWriter; + if (!writer->Open(outputFilename, m_header, m_references)) { + std::cerr << "bamtools split ERROR: could not open " << outputFilename + << " for writing." << std::endl; + return false; + } + + // store in map + outputFiles.insert(std::make_pair(currentRefId, writer)); + } + + // else grab corresponding writer + else + writer = (*writerIter).second; + + // store alignment in proper BAM output file + if (writer) writer->SaveAlignment(al); + } + + // clean up BamWriters + CloseWriters(outputFiles); + + // return success + return true; +} + +// finds first alignment and calls corresponding SplitTagImpl<>() depending on tag type +bool SplitTool::SplitToolPrivate::SplitTag() +{ + + // iterate through alignments, until we hit TAG + BamAlignment al; + while (m_reader.GetNextAlignment(al)) { + + // look for tag in this alignment and get tag type + char tagType(0); + if (!al.GetTagType(m_settings->TagToSplit, tagType)) continue; + + // request split method based on tag type + // pass it the current alignment found + switch (tagType) { + + case (Constants::BAM_TAG_TYPE_INT8): + return SplitTagImpl<int8_t>(al); + case (Constants::BAM_TAG_TYPE_INT16): + return SplitTagImpl<int16_t>(al); + case 
(Constants::BAM_TAG_TYPE_INT32): + return SplitTagImpl<int32_t>(al); + case (Constants::BAM_TAG_TYPE_UINT8): + return SplitTagImpl<uint8_t>(al); + case (Constants::BAM_TAG_TYPE_UINT16): + return SplitTagImpl<uint16_t>(al); + case (Constants::BAM_TAG_TYPE_UINT32): + return SplitTagImpl<uint32_t>(al); + case (Constants::BAM_TAG_TYPE_FLOAT): + return SplitTagImpl<float>(al); + + case (Constants::BAM_TAG_TYPE_ASCII): + case (Constants::BAM_TAG_TYPE_STRING): + case (Constants::BAM_TAG_TYPE_HEX): + return SplitTagImpl<std::string>(al); + + case (Constants::BAM_TAG_TYPE_ARRAY): { + + char arrayTagType(0); + if (!al.GetArrayTagType(m_settings->TagToSplit, arrayTagType)) continue; + switch (arrayTagType) { + case (Constants::BAM_TAG_TYPE_INT8): + return SplitListTagImpl<int8_t>(al); + case (Constants::BAM_TAG_TYPE_INT16): + return SplitListTagImpl<int16_t>(al); + case (Constants::BAM_TAG_TYPE_INT32): + return SplitListTagImpl<int32_t>(al); + case (Constants::BAM_TAG_TYPE_UINT8): + return SplitListTagImpl<uint8_t>(al); + case (Constants::BAM_TAG_TYPE_UINT16): + return SplitListTagImpl<uint16_t>(al); + case (Constants::BAM_TAG_TYPE_UINT32): + return SplitListTagImpl<uint32_t>(al); + case (Constants::BAM_TAG_TYPE_FLOAT): + return SplitListTagImpl<float>(al); + default: + std::cerr + << "bamtools split ERROR: array tag has unsupported element type: " + << arrayTagType << std::endl; + return false; + } + } + + default: + std::cerr << "bamtools split ERROR: unknown tag type encountered: " << tagType + << std::endl; + return false; + } + } + + // tag not found, but that's not an error - return success + return true; +} + +// -------------------------------------------------------------------------------- +// template method implementation +// *Technical Note* - use of template methods declared & defined in ".cpp" file +// goes against normal practices, but works here because these +// are purely internal (no one can call from outside this file) + +// close BamWriters & delete 
pointers +template <typename T> +void SplitTool::SplitToolPrivate::CloseWriters(std::map<T, BamWriter*>& writers) +{ + + typedef std::map<T, BamWriter*> WriterMap; + typedef typename WriterMap::iterator WriterMapIterator; + + // iterate over writers + WriterMapIterator writerIter = writers.begin(); + WriterMapIterator writerEnd = writers.end(); + for (; writerIter != writerEnd; ++writerIter) { + BamWriter* writer = (*writerIter).second; + if (writer == 0) continue; + + // close BamWriter + writer->Close(); + + // destroy BamWriter + delete writer; + writer = 0; + } + + // clear the container (destroying the items doesn't remove them) + writers.clear(); +} + +// handle list-type tags +template <typename T> +bool SplitTool::SplitToolPrivate::SplitListTagImpl(BamAlignment& al) +{ + + typedef std::vector<T> TagValueType; + typedef std::map<std::string, BamWriter*> WriterMap; + typedef typename WriterMap::iterator WriterMapIterator; + + // set up splitting data structure + WriterMap outputFiles; + WriterMapIterator writerIter; + + // determine tag prefix + std::string tagPrefix = SPLIT_TAG_TOKEN; + if (m_settings->HasCustomTagPrefix) tagPrefix = m_settings->CustomTagPrefix; + + // make sure prefix starts with '.' 
+ const std::size_t dotFound = tagPrefix.find('.'); + if (dotFound != 0) tagPrefix = std::string(1, '.') + tagPrefix; + + const std::string tag = m_settings->TagToSplit; + BamWriter* writer; + TagValueType currentValue; + while (m_reader.GetNextAlignment(al)) { + + std::string listTagLabel; + if (!al.GetTag(tag, currentValue)) + listTagLabel = "none"; + else { + // make list label from tag data + std::stringstream listTagLabelStream; + typename TagValueType::const_iterator tagValueIter = currentValue.begin(); + typename TagValueType::const_iterator tagValueEnd = currentValue.end(); + for (; tagValueIter != tagValueEnd; ++tagValueIter) + listTagLabelStream << (*tagValueIter) << m_settings->ListTagDelimiter; + listTagLabel = listTagLabelStream.str(); + if (!listTagLabel.empty()) + listTagLabel = listTagLabel.substr( + 0, listTagLabel.size() - + m_settings->ListTagDelimiter.size()); // pop last delimiter + } + + // lookup writer for label + writerIter = outputFiles.find(listTagLabel); + + // if not found, create one + if (writerIter == outputFiles.end()) { + + // open new BamWriter, save first alignment + std::stringstream outputFilenameStream; + outputFilenameStream << m_outputFilenameStub << tagPrefix << tag << '_' << listTagLabel + << ".bam"; + writer = new BamWriter; + if (!writer->Open(outputFilenameStream.str(), m_header, m_references)) { + std::cerr << "bamtools split ERROR: could not open " << outputFilenameStream.str() + << " for writing." 
<< std::endl; + return false; + } + + // store in map + outputFiles.insert(std::make_pair(listTagLabel, writer)); + } + + // else grab existing writer + else + writer = (*writerIter).second; + + // store alignment in proper BAM output file + if (writer) writer->SaveAlignment(al); + } + + // clean up & return success + CloseWriters(outputFiles); + return true; +} + +// handle the single-value tags +template <typename T> +bool SplitTool::SplitToolPrivate::SplitTagImpl(BamAlignment& al) +{ + + typedef T TagValueType; + typedef std::map<TagValueType, BamWriter*> WriterMap; + typedef typename WriterMap::iterator WriterMapIterator; + + // set up splitting data structure + WriterMap outputFiles; + WriterMapIterator writerIter; + + // determine tag prefix + std::string tagPrefix = SPLIT_TAG_TOKEN; + if (m_settings->HasCustomTagPrefix) tagPrefix = m_settings->CustomTagPrefix; + + // make sure prefix starts with '.' + const std::size_t dotFound = tagPrefix.find('.'); + if (dotFound != 0) tagPrefix = std::string(1, '.') + tagPrefix; + + // local variables + const std::string tag = m_settings->TagToSplit; + BamWriter* writer; + std::stringstream outputFilenameStream; + TagValueType currentValue; + + // retrieve first alignment tag value + if (al.GetTag(tag, currentValue)) { + + // open new BamWriter, save first alignment + outputFilenameStream << m_outputFilenameStub << tagPrefix << tag << '_' << currentValue + << ".bam"; + writer = new BamWriter; + if (!writer->Open(outputFilenameStream.str(), m_header, m_references)) { + std::cerr << "bamtools split ERROR: could not open " << outputFilenameStream.str() + << " for writing." 
<< std::endl; + return false; + } + writer->SaveAlignment(al); + + // store in map + outputFiles.insert(std::make_pair(currentValue, writer)); + + // reset stream + outputFilenameStream.str(std::string()); + } + + // iterate through remaining alignments + while (m_reader.GetNextAlignment(al)) { + + // skip if this alignment doesn't have TAG + if (!al.GetTag(tag, currentValue)) continue; + + // look up tag value in map + writerIter = outputFiles.find(currentValue); + + // if no writer associated with this value + if (writerIter == outputFiles.end()) { + + // open new BamWriter + outputFilenameStream << m_outputFilenameStub << tagPrefix << tag << '_' << currentValue + << ".bam"; + writer = new BamWriter; + if (!writer->Open(outputFilenameStream.str(), m_header, m_references)) { + std::cerr << "bamtool split ERROR: could not open " << outputFilenameStream.str() + << " for writing." << std::endl; + return false; + } + + // store in map + outputFiles.insert(std::make_pair(currentValue, writer)); + + // reset stream + outputFilenameStream.str(std::string()); + } + + // else grab corresponding writer + else + writer = (*writerIter).second; + + // store alignment in proper BAM output file + if (writer) writer->SaveAlignment(al); + } + + // clean up BamWriters + CloseWriters(outputFiles); + + // return success + return true; +} + +// --------------------------------------------- +// SplitTool implementation + +SplitTool::SplitTool() + : AbstractTool() + , m_settings(new SplitSettings) + , m_impl(0) +{ + // set program details + const std::string name = "bamtools split"; + const std::string description = + "splits a BAM file on user-specified property, creating a new BAM output file for each " + "value found"; + const std::string args = + "[-in <filename>] [-stub <filename stub>] < -mapped | -paired | -reference [-refPrefix " + "<prefix>] | -tag <TAG> > "; + Options::SetProgramInfo(name, description, args); + + // set up options + OptionGroup* IO_Opts = 
Options::CreateOptionGroup("Input & Output"); + Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", + m_settings->HasInputFilename, m_settings->InputFilename, IO_Opts, + Options::StandardIn()); + Options::AddValueOption( + "-refPrefix", "string", + "custom prefix for splitting by references. Currently files end with REF_<refName>.bam. " + "This option allows you to replace \"REF_\" with a prefix of your choosing.", + "", m_settings->HasCustomRefPrefix, m_settings->CustomRefPrefix, IO_Opts); + Options::AddValueOption( + "-tagPrefix", "string", + "custom prefix for splitting by tags. Current files end with TAG_<tagname>_<tagvalue>.bam. " + "This option allows you to replace \"TAG_\" with a prefix of your choosing.", + "", m_settings->HasCustomTagPrefix, m_settings->CustomTagPrefix, IO_Opts); + Options::AddValueOption("-stub", "filename stub", + "prefix stub for output BAM files (default behavior is to use input " + "filename, without .bam extension, as stub). If input is stdin and no " + "stub provided, a timestamp is generated as the stub.", + "", m_settings->HasCustomOutputStub, m_settings->CustomOutputStub, + IO_Opts); + Options::AddValueOption("-tagListDelim", "string", + "delimiter used to separate values in the filenames generated from " + "splitting on list-type tags [--]", + "", m_settings->HasListTagDelimiter, m_settings->ListTagDelimiter, + IO_Opts); + + OptionGroup* SplitOpts = Options::CreateOptionGroup("Split Options"); + Options::AddOption("-mapped", "split mapped/unmapped alignments", m_settings->IsSplittingMapped, + SplitOpts); + Options::AddOption("-paired", "split single-end/paired-end alignments", + m_settings->IsSplittingPaired, SplitOpts); + Options::AddOption("-reference", "split alignments by reference", + m_settings->IsSplittingReference, SplitOpts); + Options::AddValueOption("-tag", "tag name", + "splits alignments based on all values of TAG encountered (i.e. 
-tag " + "RG creates a BAM file for each read group in original BAM file)", + "", m_settings->IsSplittingTag, m_settings->TagToSplit, SplitOpts); +} + +SplitTool::~SplitTool() +{ + + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int SplitTool::Help() +{ + Options::DisplayHelp(); + return 0; +} + +int SplitTool::Run(int argc, char* argv[]) +{ + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // initialize SplitTool with settings + m_impl = new SplitToolPrivate(m_settings); + + // run SplitTool, return success/fail + if (m_impl->Run()) + return 0; + else + return 1; +} diff --git a/src/toolkit/bamtools_split.h b/src/toolkit/bamtools_split.h new file mode 100644 index 0000000..c246b40 --- /dev/null +++ b/src/toolkit/bamtools_split.h @@ -0,0 +1,39 @@ +// *************************************************************************** +// bamtools_split.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 7 April 2011 (DB) +// --------------------------------------------------------------------------- +// Splits a BAM file on user-specified property, creating a new BAM output +// file for each value found +// *************************************************************************** + +#ifndef BAMTOOLS_SPLIT_H +#define BAMTOOLS_SPLIT_H + +#include "bamtools_tool.h" + +namespace BamTools { + +class SplitTool : public AbstractTool +{ + +public: + SplitTool(); + ~SplitTool(); + +public: + int Help(); + int Run(int argc, char* argv[]); + +private: + struct SplitSettings; + SplitSettings* m_settings; + + class SplitToolPrivate; + SplitToolPrivate* m_impl; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_SPLIT_H diff --git a/src/toolkit/bamtools_stats.cpp b/src/toolkit/bamtools_stats.cpp new file mode 100644 index 0000000..3575aac --- /dev/null +++ b/src/toolkit/bamtools_stats.cpp @@ 
-0,0 +1,330 @@ +// *************************************************************************** +// bamtools_cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 December 2012 +// --------------------------------------------------------------------------- +// Prints general alignment statistics for BAM file(s). +// *************************************************************************** + +#include "bamtools_stats.h" + +#include <api/BamMultiReader.h> +#include <utils/bamtools_options.h> +using namespace BamTools; + +#include <algorithm> +#include <cmath> +#include <cstddef> +#include <fstream> +#include <functional> +#include <iostream> +#include <numeric> +#include <string> +#include <vector> + +// --------------------------------------------- +// StatsSettings implementation + +struct StatsTool::StatsSettings +{ + + // flags + bool HasInput; + bool HasInputFilelist; + bool IsShowingInsertSizeSummary; + + // filenames + std::vector<std::string> InputFiles; + std::string InputFilelist; + + // constructor + StatsSettings() + : HasInput(false) + , HasInputFilelist(false) + , IsShowingInsertSizeSummary(false) + {} +}; + +// --------------------------------------------- +// StatsToolPrivate implementation + +struct StatsTool::StatsToolPrivate +{ + + // ctor & dtor +public: + StatsToolPrivate(StatsTool::StatsSettings* _settings); + ~StatsToolPrivate() {} + + // 'public' interface +public: + bool Run(); + + // internal methods +private: + bool CalculateMedian(std::vector<int>& data, double& median); + void PrintStats(); + void ProcessAlignment(const BamAlignment& al); + + // data members +private: + StatsTool::StatsSettings* m_settings; + unsigned int m_numReads; + unsigned int m_numPaired; + unsigned int m_numProperPair; + unsigned int m_numMapped; + unsigned int m_numBothMatesMapped; + unsigned int m_numForwardStrand; + 
unsigned int m_numReverseStrand; + unsigned int m_numFirstMate; + unsigned int m_numSecondMate; + unsigned int m_numSingletons; + unsigned int m_numFailedQC; + unsigned int m_numDuplicates; + std::vector<int> m_insertSizes; +}; + +StatsTool::StatsToolPrivate::StatsToolPrivate(StatsTool::StatsSettings* settings) + : m_settings(settings) + , m_numReads(0) + , m_numPaired(0) + , m_numProperPair(0) + , m_numMapped(0) + , m_numBothMatesMapped(0) + , m_numForwardStrand(0) + , m_numReverseStrand(0) + , m_numFirstMate(0) + , m_numSecondMate(0) + , m_numSingletons(0) + , m_numFailedQC(0) + , m_numDuplicates(0) +{ + m_insertSizes.reserve(100000); +} + +// median is of type double because in the case of even number of data elements, +// we need to return the average of middle 2 elements +bool StatsTool::StatsToolPrivate::CalculateMedian(std::vector<int>& data, double& median) +{ + + // skip if data empty + if (data.empty()) return false; + + // find middle element + std::size_t middleIndex = data.size() / 2; + std::vector<int>::iterator target = data.begin() + middleIndex; + nth_element(data.begin(), target, data.end()); + + // odd number of elements + if ((data.size() % 2) != 0) { + median = (double)(*target); + return true; + } + + // even number of elements + else { + double rightTarget = (double)(*target); + std::vector<int>::iterator leftTarget = target - 1; + nth_element(data.begin(), leftTarget, data.end()); + median = (double)((rightTarget + *leftTarget) / 2.0); + return true; + } +} + +// print BAM file alignment stats +void StatsTool::StatsToolPrivate::PrintStats() +{ + + std::cout << std::endl; + std::cout << "**********************************************" << std::endl; + std::cout << "Stats for BAM file(s): " << std::endl; + std::cout << "**********************************************" << std::endl; + std::cout << std::endl; + std::cout << "Total reads: " << m_numReads << std::endl; + std::cout << "Mapped reads: " << m_numMapped << "\t(" + << ((float)m_numMapped 
/ m_numReads) * 100 << "%)" << std::endl; + std::cout << "Forward strand: " << m_numForwardStrand << "\t(" + << ((float)m_numForwardStrand / m_numReads) * 100 << "%)" << std::endl; + std::cout << "Reverse strand: " << m_numReverseStrand << "\t(" + << ((float)m_numReverseStrand / m_numReads) * 100 << "%)" << std::endl; + std::cout << "Failed QC: " << m_numFailedQC << "\t(" + << ((float)m_numFailedQC / m_numReads) * 100 << "%)" << std::endl; + std::cout << "Duplicates: " << m_numDuplicates << "\t(" + << ((float)m_numDuplicates / m_numReads) * 100 << "%)" << std::endl; + std::cout << "Paired-end reads: " << m_numPaired << "\t(" + << ((float)m_numPaired / m_numReads) * 100 << "%)" << std::endl; + + if (m_numPaired != 0) { + std::cout << "'Proper-pairs': " << m_numProperPair << "\t(" + << ((float)m_numProperPair / m_numPaired) * 100 << "%)" << std::endl; + std::cout << "Both pairs mapped: " << m_numBothMatesMapped << "\t(" + << ((float)m_numBothMatesMapped / m_numPaired) * 100 << "%)" << std::endl; + std::cout << "Read 1: " << m_numFirstMate << std::endl; + std::cout << "Read 2: " << m_numSecondMate << std::endl; + std::cout << "Singletons: " << m_numSingletons << "\t(" + << ((float)m_numSingletons / m_numPaired) * 100 << "%)" << std::endl; + } + + if (m_settings->IsShowingInsertSizeSummary) { + + double avgInsertSize = 0.0; + if (!m_insertSizes.empty()) { + avgInsertSize = (accumulate(m_insertSizes.begin(), m_insertSizes.end(), 0.0) / + (double)m_insertSizes.size()); + std::cout << "Average insert size (absolute value): " << avgInsertSize << std::endl; + } + + double medianInsertSize = 0.0; + if (CalculateMedian(m_insertSizes, medianInsertSize)) + std::cout << "Median insert size (absolute value): " << medianInsertSize << std::endl; + } + std::cout << std::endl; +} + +// use current input alignment to update BAM file alignment stats +void StatsTool::StatsToolPrivate::ProcessAlignment(const BamAlignment& al) +{ + + // increment total alignment counter + ++m_numReads; + 
+ // incrememt counters for pairing-independent flags + if (al.IsDuplicate()) ++m_numDuplicates; + if (al.IsFailedQC()) ++m_numFailedQC; + if (al.IsMapped()) ++m_numMapped; + + // increment strand counters + if (al.IsReverseStrand()) + ++m_numReverseStrand; + else + ++m_numForwardStrand; + + // if alignment is paired-end + if (al.IsPaired()) { + + // increment PE counter + ++m_numPaired; + + // increment first mate/second mate counters + if (al.IsFirstMate()) ++m_numFirstMate; + if (al.IsSecondMate()) ++m_numSecondMate; + + // if alignment is mapped, check mate status + if (al.IsMapped()) { + // if mate mapped + if (al.IsMateMapped()) ++m_numBothMatesMapped; + // else singleton + else + ++m_numSingletons; + } + + // check for explicit proper pair flag + if (al.IsProperPair()) ++m_numProperPair; + + // store insert size for first mate + if (m_settings->IsShowingInsertSizeSummary && al.IsFirstMate() && (al.InsertSize != 0)) { + int insertSize = abs(al.InsertSize); + m_insertSizes.push_back(insertSize); + } + } +} + +bool StatsTool::StatsToolPrivate::Run() +{ + + // set to default input if none provided + if (!m_settings->HasInput && !m_settings->HasInputFilelist) + m_settings->InputFiles.push_back(Options::StandardIn()); + + // add files in the filelist to the input file list + if (m_settings->HasInputFilelist) { + + std::ifstream filelist(m_settings->InputFilelist.c_str(), std::ios::in); + if (!filelist.is_open()) { + std::cerr << "bamtools stats ERROR: could not open input BAM file list... Aborting." + << std::endl; + return false; + } + + std::string line; + while (std::getline(filelist, line)) + m_settings->InputFiles.push_back(line); + } + + // open the BAM files + BamMultiReader reader; + if (!reader.Open(m_settings->InputFiles)) { + std::cerr << "bamtools stats ERROR: could not open input BAM file(s)... Aborting." 
+ << std::endl; + reader.Close(); + return false; + } + + // plow through alignments, keeping track of stats + BamAlignment al; + while (reader.GetNextAlignmentCore(al)) + ProcessAlignment(al); + reader.Close(); + + // print stats & exit + PrintStats(); + return true; +} + +// --------------------------------------------- +// StatsTool implementation + +StatsTool::StatsTool() + : AbstractTool() + , m_settings(new StatsSettings) + , m_impl(0) +{ + // set program details + Options::SetProgramInfo( + "bamtools stats", "prints general alignment statistics", + "[-in <filename> -in <filename> ... | -list <filelist>] [statsOptions]"); + + // set up options + OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); + Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, + m_settings->InputFiles, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "", + m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts); + + OptionGroup* AdditionalOpts = Options::CreateOptionGroup("Additional Stats"); + Options::AddOption("-insert", "summarize insert size data", + m_settings->IsShowingInsertSizeSummary, AdditionalOpts); +} + +StatsTool::~StatsTool() +{ + + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int StatsTool::Help() +{ + Options::DisplayHelp(); + return 0; +} + +int StatsTool::Run(int argc, char* argv[]) +{ + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // initialize StatsTool with settings + m_impl = new StatsToolPrivate(m_settings); + + // run StatsTool, return success/fail + if (m_impl->Run()) + return 0; + else + return 1; +} diff --git a/src/toolkit/bamtools_stats.h b/src/toolkit/bamtools_stats.h new file mode 100644 index 0000000..dd2e25b --- /dev/null +++ b/src/toolkit/bamtools_stats.h @@ -0,0 +1,38 @@ +// *************************************************************************** 
+// bamtools_stats.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 7 April 2011 +// --------------------------------------------------------------------------- +// Prints general statistics for a single BAM file +// *************************************************************************** + +#ifndef BAMTOOLS_STATS_H +#define BAMTOOLS_STATS_H + +#include "bamtools_tool.h" + +namespace BamTools { + +class StatsTool : public AbstractTool +{ + +public: + StatsTool(); + ~StatsTool(); + +public: + int Help(); + int Run(int argc, char* argv[]); + +private: + struct StatsSettings; + StatsSettings* m_settings; + + struct StatsToolPrivate; + StatsToolPrivate* m_impl; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_STATS_H diff --git a/src/toolkit/bamtools_tool.h b/src/toolkit/bamtools_tool.h new file mode 100644 index 0000000..31ddcd7 --- /dev/null +++ b/src/toolkit/bamtools_tool.h @@ -0,0 +1,36 @@ +// *************************************************************************** +// bamtools_tool.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 2 June 2010 +// --------------------------------------------------------------------------- +// Base class for all other BamTools sub-tools +// All derived classes must provide Help() and Run() methods +// *************************************************************************** + +#ifndef BAMTOOLS_ABSTRACTTOOL_H +#define BAMTOOLS_ABSTRACTTOOL_H + +#include <string> + +namespace BamTools { + +class AbstractTool +{ + +public: + AbstractTool() {} + virtual ~AbstractTool() {} + +public: + virtual int Help() = 0; + virtual int Run(int argc, char* argv[]) = 0; + + // derived classes should also provide: + // static std::string Description(); + // 
static std::String Name(); +}; + +} // namespace BamTools + +#endif // BAMTOOLS_ABSTRACTTOOL_H diff --git a/src/toolkit/bamtools_version.h.in b/src/toolkit/bamtools_version.h.in new file mode 100644 index 0000000..34a6d2e --- /dev/null +++ b/src/toolkit/bamtools_version.h.in @@ -0,0 +1,20 @@ +// *************************************************************************** +// bamtools_version.h.in (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides version information for the BamTools toolkit. +// *************************************************************************** + +#ifndef BAMTOOLS_VERSION_H +#define BAMTOOLS_VERSION_H + +// CMake uses this file as a template to generate "bamtools_version.h". +// These constants are defined to match the variables set in the build system. 
+#define BAMTOOLS_VERSION_MAJOR @BamTools_VERSION_MAJOR@ +#define BAMTOOLS_VERSION_MINOR @BamTools_VERSION_MINOR@ +#define BAMTOOLS_VERSION_PATCH @BamTools_VERSION_PATCH@ + +#endif // BAMTOOLS_VERSION_H + diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt new file mode 100644 index 0000000..93cb62d --- /dev/null +++ b/src/utils/CMakeLists.txt @@ -0,0 +1,29 @@ +# ========================== +# BamTools CMakeLists.txt +# (c) 2010 Derek Barnett +# +# src/utils/ +# ========================== + +# list include paths +include_directories( ${BamTools_SOURCE_DIR}/src/api ) + +# add compiler definitions +add_definitions( -DBAMTOOLS_UTILS_LIBRARY ) # (for proper exporting of library symbols) + +# create BamTools utils library +add_library( BamTools-utils STATIC + bamtools_fasta.cpp + bamtools_options.cpp + bamtools_pileup_engine.cpp + bamtools_utilities.cpp + ) + +# link BamTools-utils library with BamTools automatically +target_link_libraries( BamTools-utils BamTools ) + +# set BamTools library properties +set_target_properties( BamTools-utils PROPERTIES + OUTPUT_NAME bamtools-utils + PREFIX "lib" + ) diff --git a/src/utils/bamtools_fasta.cpp b/src/utils/bamtools_fasta.cpp new file mode 100644 index 0000000..be55c43 --- /dev/null +++ b/src/utils/bamtools_fasta.cpp @@ -0,0 +1,643 @@ +// *************************************************************************** +// bamtools_fasta.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 9 March 2012 (DB) +// --------------------------------------------------------------------------- +// Provides FASTA reading/indexing functionality. 
+// *************************************************************************** + +#include "utils/bamtools_fasta.h" +using namespace BamTools; + +#include <cstddef> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <fstream> +#include <iostream> +#include <sstream> +#include <vector> + +struct Fasta::FastaPrivate +{ + + struct FastaIndexData + { + std::string Name; + int32_t Length; + int64_t Offset; + int32_t LineLength; + int32_t + ByteLength; // LineLength + newline character(s) - varies on OS where file was generated + }; + + // data members + FILE* Stream; + bool IsOpen; + + FILE* IndexStream; + bool HasIndex; + bool IsIndexOpen; + + std::vector<FastaIndexData> Index; + + // ctor + FastaPrivate(); + ~FastaPrivate(); + + // 'public' API methods + bool Close(); + bool CreateIndex(const std::string& indexFilename); + bool GetBase(const int& refId, const int& position, char& base); + bool GetSequence(const int& refId, const int& start, const int& stop, std::string& sequence); + bool Open(const std::string& filename, const std::string& indexFilename); + + // internal methods +private: + void Chomp(char* sequence); + bool GetNameFromHeader(const std::string& header, std::string& name); + bool GetNextHeader(std::string& header); + bool GetNextSequence(std::string& sequence); + bool LoadIndexData(); + bool Rewind(); + bool WriteIndexData(); +}; + +Fasta::FastaPrivate::FastaPrivate() + : IsOpen(false) + , HasIndex(false) + , IsIndexOpen(false) +{} + +Fasta::FastaPrivate::~FastaPrivate() +{ + Close(); +} + +// remove any trailing newlines +void Fasta::FastaPrivate::Chomp(char* sequence) +{ + + static const int CHAR_LF = 10; + static const int CHAR_CR = 13; + + int seqLength = strlen(sequence); + if (seqLength == 0) return; + --seqLength; // ignore null terminator + + while (sequence[seqLength] == CHAR_LF || sequence[seqLength] == CHAR_CR) { + sequence[seqLength] = 0; + --seqLength; + if (seqLength < 0) break; + } +} + +bool 
Fasta::FastaPrivate::Close() +{ + + // close fasta file + if (IsOpen) { + fclose(Stream); + IsOpen = false; + } + + // close index file + if (HasIndex && IsIndexOpen) { + fclose(IndexStream); + HasIndex = false; + IsIndexOpen = false; + } + + // return success + return true; +} + +bool Fasta::FastaPrivate::CreateIndex(const std::string& indexFilename) +{ + + // check that file is open + if (!IsOpen) { + std::cerr << "FASTA error : cannot create index, FASTA file not open" << std::endl; + return false; + } + + // rewind FASTA file + if (!Rewind()) { + std::cerr << "FASTA error : could not rewind FASTA file" << std::endl; + return false; + } + + // clear out prior index data + Index.clear(); + + // ------------------------------------------- + // calculate lineLength & byteLength + + int lineLength = 0; + int byteLength = 0; + + // skip over header + char buffer[1024]; + if (fgets(buffer, 1024, Stream) == 0) { + std::cerr << "FASTA error : could not read from file" << std::endl; + return false; + } + if (feof(Stream)) return false; + if (buffer[0] != '>') { + std::cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << std::endl; + return false; + } + + // read in first line of sequence + char c = fgetc(Stream); + while ((c >= 0) && (c != '\n')) { + ++byteLength; + if (isgraph(c)) ++lineLength; + c = fgetc(Stream); + } + ++byteLength; // store newline + + // rewind FASTA file + if (!Rewind()) { + std::cerr << "FASTA error : could not rewind FASTA file" << std::endl; + return false; + } + + // iterate through fasta entries + int currentId = 0; + std::string header; + std::string sequence; + while (GetNextHeader(header)) { + + // --------------------------- + // build index entry data + FastaIndexData data; + + // store file offset of beginning of DNA sequence (after header) + data.Offset = ftell64(Stream); + + // parse header, store sequence name in data.Name + if (!GetNameFromHeader(header, data.Name)) { + std::cerr << "FASTA error : could not parse 
read name from FASTA header" << std::endl; + return false; + } + + // retrieve FASTA sequence + if (!GetNextSequence(sequence)) { + std::cerr << "FASTA error : could not read in next sequence from FASTA file" + << std::endl; + return false; + } + + // store sequence length & line/byte lengths + data.Length = sequence.length(); + data.LineLength = lineLength; + data.ByteLength = byteLength; + + // store index entry + Index.push_back(data); + + // update ref Id + ++currentId; + } + + // open index file + if (!indexFilename.empty()) { + IndexStream = fopen(indexFilename.c_str(), "wb"); + if (!IndexStream) { + std::cerr << "FASTA error : Could not open " << indexFilename << " for writing." + << std::endl; + return false; + } + IsIndexOpen = true; + } + + // write index data + if (!WriteIndexData()) return false; + HasIndex = true; + + // close index file + fclose(IndexStream); + IsIndexOpen = false; + + // return succes status + return true; +} + +bool Fasta::FastaPrivate::GetBase(const int& refId, const int& position, char& base) +{ + + // make sure FASTA file is open + if (!IsOpen) { + std::cerr << "FASTA error : file not open for reading" << std::endl; + return false; + } + + // use index if available + if (HasIndex && !Index.empty()) { + + // validate reference id + if ((refId < 0) || (refId >= (int)Index.size())) { + std::cerr << "FASTA error: invalid refId specified: " << refId << std::endl; + return false; + } + + // retrieve reference index data + const FastaIndexData& referenceData = Index.at(refId); + + // validate position + if ((position < 0) || (position > referenceData.Length)) { + std::cerr << "FASTA error: invalid position specified: " << position << std::endl; + return false; + } + + // calculate seek position & attempt jump + const int64_t lines = position / referenceData.LineLength; + const int64_t lineOffset = position % referenceData.LineLength; + const int64_t seekTo = + referenceData.Offset + (lines * referenceData.ByteLength) + lineOffset; + if 
(fseek64(Stream, seekTo, SEEK_SET) != 0) { + std::cerr << "FASTA error : could not seek in file" << std::endl; + return false; + } + + // set base & return success + base = getc(Stream); + return true; + } + + // else plow through sequentially + else { + + // rewind FASTA file + if (!Rewind()) { + std::cerr << "FASTA error : could not rewind FASTA file" << std::endl; + return false; + } + + // iterate through fasta entries + int currentId = 0; + std::string header; + std::string sequence; + + // get first entry + GetNextHeader(header); + GetNextSequence(sequence); + + while (currentId != refId) { + GetNextHeader(header); + GetNextSequence(sequence); + ++currentId; + } + + // get desired base from sequence + // TODO: error reporting on invalid position + if (currentId == refId && (sequence.length() >= static_cast<std::size_t>(position))) { + base = sequence.at(position); + return true; + } + + // could not get sequence + return false; + } + + // return success + return true; +} + +bool Fasta::FastaPrivate::GetNameFromHeader(const std::string& header, std::string& name) +{ + + // get rid of the leading greater than sign + std::string s = header.substr(1); + + // extract the first non-whitespace segment + char* pName = (char*)s.data(); + unsigned int nameLen = (unsigned int)s.size(); + + unsigned int start = 0; + while ((pName[start] == 32) || (pName[start] == 9) || (pName[start] == 10) || + (pName[start] == 13)) { + start++; + if (start == nameLen) break; + } + + unsigned int stop = start; + if (stop < nameLen) { + while ((pName[stop] != 32) && (pName[stop] != 9) && (pName[stop] != 10) && + (pName[stop] != 13)) { + stop++; + if (stop == nameLen) break; + } + } + + if (start == stop) { + std::cerr << "FASTA error : could not parse read name from FASTA header" << std::endl; + return false; + } + + name = s.substr(start, stop - start).c_str(); + return true; +} + +bool Fasta::FastaPrivate::GetNextHeader(std::string& header) +{ + + // validate input stream + if (!IsOpen 
|| feof(Stream)) return false; + + // read in header line + char buffer[1024]; + if (fgets(buffer, 1024, Stream) == 0) { + std::cerr << "FASTA error : could not read from file" << std::endl; + return false; + } + + // make sure it's a FASTA header + if (buffer[0] != '>') { + std::cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << std::endl; + return false; + } + + // import buffer contents to header string + std::stringstream headerBuffer; + headerBuffer << buffer; + header = headerBuffer.str(); + + // return success + return true; +} + +bool Fasta::FastaPrivate::GetNextSequence(std::string& sequence) +{ + + // validate input stream + if (!IsOpen || feof(Stream)) return false; + + // read in sequence + char buffer[1024]; + std::ostringstream seqBuffer; + while (true) { + + char ch = fgetc(Stream); + ungetc(ch, Stream); + if ((ch == '>') || feof(Stream)) break; + + if (fgets(buffer, 1024, Stream) == 0) { + std::cerr << "FASTA error : could not read from file" << std::endl; + return false; + } + + Chomp(buffer); + seqBuffer << buffer; + } + + // import buffer contents to sequence string + sequence = seqBuffer.str(); + + // return success + return true; +} + +bool Fasta::FastaPrivate::GetSequence(const int& refId, const int& start, const int& stop, + std::string& sequence) +{ + + // make sure FASTA file is open + if (!IsOpen) { + std::cerr << "FASTA error : file not open for reading" << std::endl; + return false; + } + + // use index if available + if (HasIndex && !Index.empty()) { + + // validate reference id + if ((refId < 0) || (refId >= (int)Index.size())) { + std::cerr << "FASTA error: invalid refId specified: " << refId << std::endl; + return false; + } + + // retrieve reference index data + const FastaIndexData& referenceData = Index.at(refId); + + // validate stop position + if ((start < 0) || (start > stop) || (stop > referenceData.Length)) { + std::cerr << "FASTA error: invalid start/stop positions specified: " << start << ", " + << 
stop << std::endl; + return false; + } + + // seek to beginning of sequence data + if (fseek64(Stream, referenceData.Offset, SEEK_SET) != 0) { + std::cerr << "FASTA error : could not sek in file" << std::endl; + return false; + } + + // retrieve full sequence + std::string fullSequence; + if (!GetNextSequence(fullSequence)) { + std::cerr << "FASTA error : could not retrieve sequence from FASTA file" << std::endl; + return false; + } + + // set sub-sequence & return success + const int seqLength = (stop - start) + 1; + sequence = fullSequence.substr(start, seqLength); + return true; + } + + // else plow through sequentially + else { + + // rewind FASTA file + if (!Rewind()) { + std::cerr << "FASTA error : could not rewind FASTA file" << std::endl; + return false; + } + + // iterate through fasta entries + int currentId = 0; + std::string header; + std::string fullSequence; + + // get first entry + GetNextHeader(header); + GetNextSequence(fullSequence); + + while (currentId != refId) { + GetNextHeader(header); + GetNextSequence(fullSequence); + ++currentId; + } + + // get desired substring from sequence + // TODO: error reporting on invalid start/stop positions + if (currentId == refId && (fullSequence.length() >= static_cast<std::size_t>(stop))) { + const int seqLength = (stop - start) + 1; + sequence = fullSequence.substr(start, seqLength); + return true; + } + + // could not get sequence + return false; + } + + // return success + return true; +} + +bool Fasta::FastaPrivate::LoadIndexData() +{ + + // skip if no index file available + if (!IsIndexOpen) return false; + + // clear any prior index data + Index.clear(); + + char buffer[1024]; + std::stringstream indexBuffer; + while (true) { + + char c = fgetc(IndexStream); + if ((c == '\n') || feof(IndexStream)) break; + ungetc(c, IndexStream); + + // clear index buffer + indexBuffer.str(std::string()); + + // read line from index file + if (fgets(buffer, 1024, IndexStream) == 0) { + std::cerr << "FASTA 
LoadIndexData() error : could not read from index file" + << std::endl; + HasIndex = false; + return false; + } + + // store line in indexBuffer + indexBuffer << buffer; + + // retrieve fasta index data from line + FastaIndexData data; + indexBuffer >> data.Name; + indexBuffer >> data.Length; + indexBuffer >> data.Offset; + indexBuffer >> data.LineLength; + indexBuffer >> data.ByteLength; + + // store index entry + Index.push_back(data); + } + + return true; +} + +bool Fasta::FastaPrivate::Open(const std::string& filename, const std::string& indexFilename) +{ + + bool success = true; + + // open FASTA filename + Stream = fopen(filename.c_str(), "rb"); + if (!Stream) { + std::cerr << "FASTA error: Could not open " << filename << " for reading" << std::endl; + return false; + } + IsOpen = true; + success &= IsOpen; + + // open index file if it exists + if (!indexFilename.empty()) { + IndexStream = fopen(indexFilename.c_str(), "rb"); + if (!IndexStream) { + std::cerr << "FASTA error : Could not open " << indexFilename << " for reading." 
+ << std::endl; + return false; + } + IsIndexOpen = true; + success &= IsIndexOpen; + + // attempt to load index data + HasIndex = LoadIndexData(); + success &= HasIndex; + } + + // return success status + return success; +} + +bool Fasta::FastaPrivate::Rewind() +{ + if (!IsOpen) return false; + return (fseek64(Stream, 0, SEEK_SET) == 0); +} + +bool Fasta::FastaPrivate::WriteIndexData() +{ + + // skip if no index file available + if (!IsIndexOpen) return false; + + // iterate over index entries + bool success = true; + std::stringstream indexBuffer; + std::vector<FastaIndexData>::const_iterator indexIter = Index.begin(); + std::vector<FastaIndexData>::const_iterator indexEnd = Index.end(); + for (; indexIter != indexEnd; ++indexIter) { + + // clear stream + indexBuffer.str(std::string()); + + // write data to stream + const FastaIndexData& data = (*indexIter); + indexBuffer << data.Name << '\t' << data.Length << '\t' << data.Offset << '\t' + << data.LineLength << '\t' << data.ByteLength << std::endl; + + // write stream to file + success &= (fputs(indexBuffer.str().c_str(), IndexStream) >= 0); + } + + // return success status + return success; +} + +// -------------------------------- +// Fasta implementation + +Fasta::Fasta() +{ + d = new FastaPrivate; +} + +Fasta::~Fasta() +{ + delete d; + d = 0; +} + +bool Fasta::Close() +{ + return d->Close(); +} + +bool Fasta::CreateIndex(const std::string& indexFilename) +{ + return d->CreateIndex(indexFilename); +} + +bool Fasta::GetBase(const int& refId, const int& position, char& base) +{ + return d->GetBase(refId, position, base); +} + +bool Fasta::GetSequence(const int& refId, const int& start, const int& stop, std::string& sequence) +{ + return d->GetSequence(refId, start, stop, sequence); +} + +bool Fasta::Open(const std::string& filename, const std::string& indexFilename) +{ + return d->Open(filename, indexFilename); +} diff --git a/src/utils/bamtools_fasta.h b/src/utils/bamtools_fasta.h new file mode 100644 index 
0000000..3b30623 --- /dev/null +++ b/src/utils/bamtools_fasta.h @@ -0,0 +1,48 @@ +// *************************************************************************** +// bamtools_fasta.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 +// --------------------------------------------------------------------------- +// Provides FASTA reading/indexing functionality. +// *************************************************************************** + +#ifndef BAMTOOLS_FASTA_H +#define BAMTOOLS_FASTA_H + +#include <string> +#include "utils/utils_global.h" + +namespace BamTools { + +class UTILS_EXPORT Fasta +{ + + // ctor & dtor +public: + Fasta(); + ~Fasta(); + + // file-handling methods +public: + bool Close(); + bool Open(const std::string& filename, const std::string& indexFilename = std::string()); + + // sequence access methods +public: + bool GetBase(const int& refID, const int& position, char& base); + bool GetSequence(const int& refId, const int& start, const int& stop, std::string& sequence); + + // index-handling methods +public: + bool CreateIndex(const std::string& indexFilename); + + // internal implementation +private: + struct FastaPrivate; + FastaPrivate* d; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_FASTA_H diff --git a/src/utils/bamtools_filter_engine.h b/src/utils/bamtools_filter_engine.h new file mode 100644 index 0000000..ed303a0 --- /dev/null +++ b/src/utils/bamtools_filter_engine.h @@ -0,0 +1,575 @@ +// *************************************************************************** +// bamtools_filter_engine.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 3 May 2013 +// --------------------------------------------------------------------------- +// 
Provides a generic filter engine based on filter-sets of properties, +// with possible "rules" (compound logical expressions) to create more complex +// queries on a data set. +// +// FilterEngine consists, most importantly, of : +// +// a list of possible properties (each tagged whether it has been 'enabled' as a filter) +// a map of filterName => propertySet +// queue for compound rule expression (i.e. "(filter1 AND filter2) OR !filter3" ) +// +// Each propertySet is a list of properties enabled for this particular filter object +// +// Implemented as a map of propertyNames to propertyFilterValue +// ( "property1" => pfv1 +// "property2" => pfv2 +// "property4" => pfv4 +// etc. ) +// +// Any properties that are 'possible', via FilterEngine::addProperty(), but not enabled +// via FilterEngine::setProperty() (in our example, say "property3"), evaluate to true +// for any query. Meaning that if a property is not set on this filter, we don't care +// about it here, so it passes though OK. +// +// A propertyFilterValue contains a value and comparison type +// +// ( pfv1: Value = 50, Type = GREATER_THAN_EQUAL +// pfv2: Value = "foo", Type = STARTS_WITH +// pfv4: Value = "bar", Type = CONTAINS +// etc. ) +// +// This allows for more complex queries (than simple isEqual?) against a variety of data types. 
+// +// *************************************************************************** + +#ifndef BAMTOOLS_FILTER_ENGINE_H +#define BAMTOOLS_FILTER_ENGINE_H + +#include "utils/bamtools_filter_properties.h" +#include "utils/bamtools_filter_ruleparser.h" +#include "utils/bamtools_utilities.h" +#include "utils/utils_global.h" + +#include <algorithm> +#include <iostream> +#include <map> +#include <queue> +#include <sstream> +#include <stack> +#include <string> +#include <utility> +#include <vector> + +namespace BamTools { + +struct UTILS_EXPORT FilterCompareType +{ + enum Type + { + AND = 0, + NOT, + OR + }; +}; + +// ----------------------------------------------------------- +// FilterEngine + +template <typename FilterChecker> +class UTILS_EXPORT FilterEngine +{ + + // ctor & dtor +public: + FilterEngine() + : m_isRuleQueueGenerated(false) + , m_defaultCompareType(FilterCompareType::OR) + , AND_OPERATOR(1, '&') + , OR_OPERATOR(1, '|') + , NOT_OPERATOR(1, '!') + {} + + ~FilterEngine() {} + + // 'filter set' methods +public: + // creates a new filter set, returns true if created, false if error or already exists + bool addFilter(const std::string& filterName); + + // return list of current filter names + const std::vector<std::string> filterNames(); + + // 'property' methods +public: + // add a new known property (& type) to engine + bool addProperty(const std::string& propertyName); + + // sets property filter (value, type) for propertyName, on a particular filter set + // setProperty("filter1", "mapQuality", 50, GREATER_THAN_EQUAL) + template <typename T> + bool setProperty( + const std::string& filterName, const std::string& propertyName, const T& value, + const PropertyFilterValue::ValueCompareType& type = PropertyFilterValue::EXACT); + + // returns list of all properties known by FilterEngine ( any created using addProperty() ) + const std::vector<std::string> allPropertyNames(); + + // returns list of property names that are 'enabled' ( only those touched by 
setProperty() ) + const std::vector<std::string> enabledPropertyNames(); + + // 'rule' methods +public: + // sets comparison operator between filters if no rule string given + // default is to do an OR on each filter + void setDefaultCompareType(const FilterCompareType::Type& type = FilterCompareType::OR); + + // sets rule string for building expression queue + // if empty, creates + void setRule(const std::string& ruleString = std::string()); + + // token parsing (for property filter generation) +public: + template <typename T> + static bool parseToken(const std::string& token, T& value, + PropertyFilterValue::ValueCompareType& type); + + // query evaluation +public: + // returns true if query passes all filters in FilterEngine + template <typename T> + bool check(const T& query); + + // internal rule-handling methods +private: + void buildDefaultRuleString(); + void buildRuleQueue(); + template <typename T> + bool evaluateFilterRules(const T& query); + + // data members +private: + // all 'filter sets' + FilterMap m_filters; + + // all known properties + std::vector<Property> m_properties; + + // infix expression of filter-set comparison rules + std::string m_ruleString; + + // postfix expression of tokens (filterNames) and operators (as strings) + // if this is empty, uses m_compareType to build default expression queue + std::queue<std::string> m_ruleQueue; + + // flag to test if the rule expression queue has been generated + bool m_isRuleQueueGenerated; + + // 'default' comparison operator between filters if no rule string given + // if this is changed, m_ruleString is used to build new m_ruleQueue + FilterCompareType::Type m_defaultCompareType; + + // client-specified checking type ( provides method: bool check(PropertyFilter, T object) ) + FilterChecker m_checker; + + // token-parsing constants + static const int NOT_CHAR = (int)'!'; + static const int EQUAL_CHAR = (int)'='; + static const int GREATER_THAN_CHAR = (int)'>'; + static const int LESS_THAN_CHAR = 
(int)'<'; + static const int WILDCARD_CHAR = (int)'*'; + + // filter evaluation constants + const std::string AND_OPERATOR; + const std::string OR_OPERATOR; + const std::string NOT_OPERATOR; +}; + +// creates a new filter set, returns true if created, false if error or already exists +template <typename FilterChecker> +inline bool FilterEngine<FilterChecker>::addFilter(const std::string& filterName) +{ + return (m_filters.insert(std::make_pair(filterName, PropertyFilter()))).second; +} + +// add a new known property & type to engine +template <typename FilterChecker> +inline bool FilterEngine<FilterChecker>::addProperty(const std::string& propertyName) +{ + const std::vector<std::string> propertyNames = allPropertyNames(); + bool found = std::binary_search(propertyNames.begin(), propertyNames.end(), propertyName); + if (found) return false; + m_properties.push_back(Property(propertyName)); + std::sort(m_properties.begin(), m_properties.end()); + return true; +} + +// returns list of all properties known by FilterEngine +// ( any that were created using addProperty() ) +template <typename FilterChecker> +inline const std::vector<std::string> FilterEngine<FilterChecker>::allPropertyNames() +{ + // set up stringlist + std::vector<std::string> names; + names.reserve(m_properties.size()); + // iterate through all properties, appending to stringlist + std::vector<Property>::const_iterator propIter = m_properties.begin(); + std::vector<Property>::const_iterator propEnd = m_properties.end(); + for (; propIter != propEnd; ++propIter) + names.push_back((*propIter).Name); + // return stringlist + return names; +} + +// builds a default rule string based on m_defaultCompareType +// used if user supplied an explicit rule string +template <typename FilterChecker> +inline void FilterEngine<FilterChecker>::buildDefaultRuleString() +{ + + // set up temp string stream + std::stringstream ruleStream; + + // get first filterName + FilterMap::const_iterator mapIter = m_filters.begin(); 
+ ruleStream << (*mapIter).first; + + // if there are more filters present + // iterate over remaining filters, appending compare operator and filter name + if (m_filters.size() > 1) { + for (++mapIter; mapIter != m_filters.end(); ++mapIter) + ruleStream << ((m_defaultCompareType == FilterCompareType::AND) ? " & " : " | ") + << (*mapIter).first; + } + + // set m_ruleString from temp stream + m_ruleString = ruleStream.str(); +} + +// build expression queue based on ruleString +template <typename FilterChecker> +inline void FilterEngine<FilterChecker>::buildRuleQueue() +{ + + // skip if no filters present + if (m_filters.empty()) return; + + // clear out any prior expression queue data + while (!m_ruleQueue.empty()) + m_ruleQueue.pop(); + + // create a rule string, if not provided + if (m_ruleString.empty()) buildDefaultRuleString(); + + // initialize RuleParser, run, and retrieve results + RuleParser ruleParser(m_ruleString); + ruleParser.parse(); + m_ruleQueue = ruleParser.results(); + + // set flag if rule queue contains any values + m_isRuleQueueGenerated = (!m_ruleQueue.empty()); +} + +// returns whether query value passes filter engine rules +template <class FilterChecker> +template <typename T> +bool FilterEngine<FilterChecker>::check(const T& query) +{ + + // return result of querying against filter rules + return evaluateFilterRules(query); +} + +// returns list of property names that are 'enabled' ( only those touched by setProperty() ) +template <typename FilterChecker> +inline const std::vector<std::string> FilterEngine<FilterChecker>::enabledPropertyNames() +{ + // initialize stringlist + std::vector<std::string> names; + names.reserve(m_properties.size()); + // iterate over all properties, appending if enabled + std::vector<Property>::const_iterator propIter = m_properties.begin(); + std::vector<Property>::const_iterator propEnd = m_properties.end(); + for (; propIter != propEnd; ++propIter) + if ((*propIter).IsEnabled) 
names.push_back((*propIter).Name); + // return stringlist + return names; +} + +// evaluates postfix rule queue - with each filter as an operand, AND|OR|NOT as operators +template <class FilterChecker> +template <typename T> +bool FilterEngine<FilterChecker>::evaluateFilterRules(const T& query) +{ + + // build ruleQueue if not done before + if (!m_isRuleQueueGenerated) buildRuleQueue(); + + std::stack<bool> resultStack; + FilterMap::const_iterator filterIter; + std::queue<std::string> ruleQueueCopy = m_ruleQueue; + while (!ruleQueueCopy.empty()) { + const std::string& token = ruleQueueCopy.front(); + + // token is NOT_OPERATOR + if (token == FilterEngine<FilterChecker>::NOT_OPERATOR) { + BAMTOOLS_ASSERT_MESSAGE(!resultStack.empty(), + "Empty result stack - cannot apply operator: !"); + resultStack.top() = !resultStack.top(); + } + + // token is AND_OPERATOR + else if (token == FilterEngine<FilterChecker>::AND_OPERATOR) { + BAMTOOLS_ASSERT_MESSAGE(resultStack.size() >= 2, + "Not enough operands - cannot apply operator: &"); + bool topResult = resultStack.top(); + resultStack.pop(); + resultStack.top() &= topResult; + } + + // token is OR_OPERATOR + else if (token == FilterEngine<FilterChecker>::OR_OPERATOR) { + BAMTOOLS_ASSERT_MESSAGE(resultStack.size() >= 2, + "Not enough operands - cannot apply operator: |"); + bool topResult = resultStack.top(); + resultStack.pop(); + resultStack.top() |= topResult; + } + + // token is an operand + else { + // look up PropertyFilter that matches this token + filterIter = m_filters.find(token); + BAMTOOLS_ASSERT_MESSAGE((filterIter != m_filters.end()), + "Filter mentioned in rule, not found in FilterEngine"); + const PropertyFilter& filter = (*filterIter).second; + bool result = m_checker.check(filter, query); + resultStack.push(result); + } + + // pop token from ruleQueue + ruleQueueCopy.pop(); + } + + // return last result + BAMTOOLS_ASSERT_MESSAGE( + resultStack.size() == 1, + "Result stack should only have one value remaining 
- cannot return result"); + return resultStack.top(); +} + +// return list of current filter names +template <typename FilterChecker> +inline const std::vector<std::string> FilterEngine<FilterChecker>::filterNames() +{ + // initialize stringlist + std::vector<std::string> names; + names.reserve(m_filters.size()); + // iterate over all filters, appending filter name + FilterMap::const_iterator mapIter = m_filters.begin(); + FilterMap::const_iterator mapEnd = m_filters.end(); + for (; mapIter != mapEnd; ++mapIter) + names.push_back((*mapIter).first); + // return stringlist + return names; +} + +// parse a filterValue token string that may contain comparison qualifiers (">50", "*SRR", etc.) +template <class FilterChecker> +template <typename T> +bool FilterEngine<FilterChecker>::parseToken(const std::string& token, T& value, + PropertyFilterValue::ValueCompareType& type) +{ + + // skip if token is empty + if (token.empty()) return false; + + // will store token after special chars are removed + std::string strippedToken; + + // if only single character + if (token.length() == 1) { + strippedToken = token; + type = PropertyFilterValue::EXACT; + } + + // more than one character, check for special chars + else { + const int firstChar = (int)token.at(0); + switch (firstChar) { + + case (FilterEngine<FilterChecker>::NOT_CHAR): + strippedToken = token.substr(1); + type = PropertyFilterValue::NOT; + break; + + case (FilterEngine<FilterChecker>::GREATER_THAN_CHAR): + + // check for '>=' case + if (token.at(1) == FilterEngine<FilterChecker>::EQUAL_CHAR) { + if (token.length() == 2) return false; + strippedToken = token.substr(2); + type = PropertyFilterValue::GREATER_THAN_EQUAL; + } + + // otherwise only '>' + else { + strippedToken = token.substr(1); + type = PropertyFilterValue::GREATER_THAN; + } + + break; + + case (FilterEngine<FilterChecker>::LESS_THAN_CHAR): + + // check for '<=' case + if (token.at(1) == FilterEngine<FilterChecker>::EQUAL_CHAR) { + if (token.length() == 
2) return false; + strippedToken = token.substr(2); + type = PropertyFilterValue::LESS_THAN_EQUAL; + } + + // otherwise only '<' + else { + strippedToken = token.substr(1); + type = PropertyFilterValue::LESS_THAN; + } + + break; + + case (FilterEngine<FilterChecker>::WILDCARD_CHAR): + + // check for *str* case (CONTAINS) + if (token.at(token.length() - 1) == FilterEngine<FilterChecker>::WILDCARD_CHAR) { + if (token.length() == 2) return false; + strippedToken = token.substr(1, token.length() - 2); + type = PropertyFilterValue::CONTAINS; + } + + // otherwise *str case (ENDS_WITH) + else { + strippedToken = token.substr(1); + type = PropertyFilterValue::ENDS_WITH; + } + + break; + + default: + // check for str* case (STARTS_WITH) + if (token.at(token.length() - 1) == FilterEngine<FilterChecker>::WILDCARD_CHAR) { + if (token.length() == 2) return false; + strippedToken = token.substr(0, token.length() - 1); + type = PropertyFilterValue::STARTS_WITH; + } + + // otherwise EXACT + else { + strippedToken = token; + type = PropertyFilterValue::EXACT; + } + + break; + } + } + + // convert stripped token to value + std::stringstream stream(strippedToken); + if (strippedToken == "true" || strippedToken == "false") + stream >> std::boolalpha >> value; + else + stream >> value; + + // check for valid CompareType on type T + Variant variantCheck = value; + + // if T is not string AND CompareType is for string values, return false + if (!variantCheck.is_type<std::string>()) { + if (type == PropertyFilterValue::CONTAINS || type == PropertyFilterValue::ENDS_WITH || + type == PropertyFilterValue::STARTS_WITH) + + return false; + } + + // return success + return true; +} + +// sets comparison operator between filters if no rule string given +// default is to do an OR on each filter +template <typename FilterChecker> +inline void FilterEngine<FilterChecker>::setDefaultCompareType(const FilterCompareType::Type& type) +{ + // check for supported compare type + if (type == 
FilterCompareType::AND || type == FilterCompareType::OR) { + // if not the current compare type + if (m_defaultCompareType != type) { + m_defaultCompareType = type; + buildRuleQueue(); + } + } +} + +// sets property filter (value, type) for propertyName, on a particular filter set +// setProperty("filter1", "mapQuality", 50, GREATER_THAN_EQUAL) +template <class FilterChecker> +template <typename T> +bool FilterEngine<FilterChecker>::setProperty(const std::string& filterName, + const std::string& propertyName, const T& value, + const PropertyFilterValue::ValueCompareType& type) +{ + // lookup filter by name, return false if not found + FilterMap::iterator filterIter = m_filters.find(filterName); + if (filterIter == m_filters.end()) return false; + + // lookup property for filter, add new PropertyFilterValue if not found, modify if already exists + PropertyFilter& filter = (*filterIter).second; + PropertyMap::iterator propertyIter = filter.Properties.find(propertyName); + + bool success; + + // property not found for this filter, create new entry + if (propertyIter == filter.Properties.end()) + success = (filter.Properties.insert( + std::make_pair(propertyName, PropertyFilterValue(value, type)))) + .second; + + // property already exists, modify + else { + PropertyFilterValue& filterValue = (*propertyIter).second; + filterValue.Value = value; + filterValue.Type = type; + success = true; + } + + // if error so far, return false + if (!success) return false; + + // -------------------------------------------- + // otherwise, set Property.IsEnabled to true + + // lookup property + std::vector<Property>::iterator knownPropertyIter = + std::find(m_properties.begin(), m_properties.end(), propertyName); + + // if not found, create a new (enabled) entry (& re-sort list) + if (knownPropertyIter == m_properties.end()) { + m_properties.push_back(Property(propertyName, true)); + std::sort(m_properties.begin(), m_properties.end()); + } + + // property already known, set as 
enabled + else + (*knownPropertyIter).IsEnabled = true; + + // return success + return true; +} + +// sets user-specified rule string & signals update of rule-expression queue +template <typename FilterChecker> +inline void FilterEngine<FilterChecker>::setRule(const std::string& ruleString) +{ + if (m_ruleString != ruleString) { + m_ruleString = ruleString; + buildRuleQueue(); + } +} + +} // namespace BamTools + +#endif // BAMTOOLS_FILTER_ENGINE_H diff --git a/src/utils/bamtools_filter_properties.h b/src/utils/bamtools_filter_properties.h new file mode 100644 index 0000000..550b08f --- /dev/null +++ b/src/utils/bamtools_filter_properties.h @@ -0,0 +1,234 @@ +// *************************************************************************** +// bamtools_filter_properties.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 +// --------------------------------------------------------------------------- +// Provides support data structures & methods for FilterEngine +// +// The FilterEngine consists, most importantly, of : +// +// a list of possible properties (each tagged whether it has been 'enabled' as a filter) +// a map of filterName => propertySet +// queue for compound rule expression (i.e. "(filter1 AND filter2) OR !filter3" ) +// +// Each propertySet is a list of properties enabled for this particular filter object +// +// Implemented as a map of propertyNames to propertyFilterValue +// ( "property1" => pfv1 +// "property2" => pfv2 +// "property4" => pfv4 +// etc. ) +// +// Any properties that are 'possible', via FilterEngine::addProperty(), but not enabled +// via FilterEngine::setProperty() (in our example, say "property3"), evaluate to true +// for any query. Meaning that if a property is not set on this filter, we don't care +// about it here, so it passes though OK. 
+// +// A propertyFilterValue contains a value and comparison type +// +// ( pfv1: Value = 50, Type = GREATER_THAN_EQUAL +// pfv2: Value = "foo", Type = STARTS_WITH +// pfv4: Value = "bar", Type = CONTAINS +// etc. ) +// +// This allows for more complex queries (than simple isEqual?) against a variety of data types. +// +// *************************************************************************** + +#ifndef BAMTOOLS_FILTER_PROPERTIES_H +#define BAMTOOLS_FILTER_PROPERTIES_H + +#include <iostream> +#include <map> +#include <string> +#include "utils/bamtools_utilities.h" +#include "utils/bamtools_variant.h" +#include "utils/utils_global.h" + +namespace BamTools { + +// ---------------------------------------------------------- +// PropertyFilterValue + +struct UTILS_EXPORT PropertyFilterValue +{ + + // define valid ValueCompareTypes + enum ValueCompareType + { + CONTAINS = 0, + ENDS_WITH, + EXACT, + GREATER_THAN, + GREATER_THAN_EQUAL, + LESS_THAN, + LESS_THAN_EQUAL, + NOT, + STARTS_WITH + }; + + // ctor + PropertyFilterValue(const Variant& value = Variant(), + const ValueCompareType& type = PropertyFilterValue::EXACT) + : Value(value) + , Type(type) + {} + + // filter check methods + template <typename T> + bool check(const T& query) const; + bool check(const std::string& query) const; + + // data members + Variant Value; + ValueCompareType Type; +}; + +// checks a query against a filter (value, compare type) +template <typename T> +bool PropertyFilterValue::check(const T& query) const +{ + + // ensure filter value & query are same type + if (!Value.is_type<T>()) { + std::cerr << "Cannot compare different types!" << std::endl; + return false; + } + + // string matching + if (Value.is_type<std::string>()) { + std::cerr << "Cannot compare different types - query is a string!" 
<< std::endl; + return false; + } + + // numeric matching based on our filter type + switch (Type) { + case (PropertyFilterValue::EXACT): + return (query == Value.get<T>()); + case (PropertyFilterValue::GREATER_THAN): + return (query > Value.get<T>()); + case (PropertyFilterValue::GREATER_THAN_EQUAL): + return (query >= Value.get<T>()); + case (PropertyFilterValue::LESS_THAN): + return (query < Value.get<T>()); + case (PropertyFilterValue::LESS_THAN_EQUAL): + return (query <= Value.get<T>()); + case (PropertyFilterValue::NOT): + return (query != Value.get<T>()); + default: + BAMTOOLS_ASSERT_UNREACHABLE; + } + return false; +} + +// checks a string query against filter (value, compare type) +inline bool PropertyFilterValue::check(const std::string& query) const +{ + + // ensure filter value & query are same type + if (!Value.is_type<std::string>()) { + std::cerr << "Cannot compare different types!" << std::endl; + return false; + } + + // localize string version of our filter value + const std::string& valueString = Value.get<std::string>(); + + // string matching based on our filter type + switch (Type) { + case (PropertyFilterValue::CONTAINS): + return (query.find(valueString) != std::string::npos); + case (PropertyFilterValue::ENDS_WITH): + return (query.find(valueString) == (query.length() - valueString.length())); + case (PropertyFilterValue::EXACT): + return (query == valueString); + case (PropertyFilterValue::GREATER_THAN): + return (query > valueString); + case (PropertyFilterValue::GREATER_THAN_EQUAL): + return (query >= valueString); + case (PropertyFilterValue::LESS_THAN): + return (query < valueString); + case (PropertyFilterValue::LESS_THAN_EQUAL): + return (query <= valueString); + case (PropertyFilterValue::NOT): + return (query != valueString); + case (PropertyFilterValue::STARTS_WITH): + return (query.find(valueString) == 0); + default: + BAMTOOLS_ASSERT_UNREACHABLE; + } + return false; +} + +inline const std::string toString(const 
PropertyFilterValue::ValueCompareType& type) +{ + + switch (type) { + case (PropertyFilterValue::CONTAINS): + return std::string("CONTAINS"); + case (PropertyFilterValue::ENDS_WITH): + return std::string("ENDS_WITH"); + case (PropertyFilterValue::EXACT): + return std::string("EXACT"); + case (PropertyFilterValue::GREATER_THAN): + return std::string("GREATER_THAN"); + case (PropertyFilterValue::GREATER_THAN_EQUAL): + return std::string("GREATER_THAN_EQUAL"); + case (PropertyFilterValue::LESS_THAN): + return std::string("LESS_THAN"); + case (PropertyFilterValue::LESS_THAN_EQUAL): + return std::string("LESS_THAN_EQUAL"); + case (PropertyFilterValue::NOT): + return std::string("NOT"); + case (PropertyFilterValue::STARTS_WITH): + return std::string("STARTS_WITH"); + default: + BAMTOOLS_ASSERT_UNREACHABLE; + } + return std::string(); +} + +// property name => property filter value +// ('name' => ('SSR', STARTS_WITH), 'mapQuality' => (50, GREATER_THAN_EQUAL), etc...) +typedef std::map<std::string, PropertyFilterValue> PropertyMap; + +// ---------------------------------------------------------- +// PropertyFilter + +struct UTILS_EXPORT PropertyFilter +{ + // data members + PropertyMap Properties; +}; + +// filter name => properties +// ('filter1' => properties1, 'filter2' => properties2, etc...) 
+typedef std::map<std::string, PropertyFilter> FilterMap; + +// ---------------------------------------------------------- +// Property + +// used to store properties known to engine & keep track of enabled state +struct UTILS_EXPORT Property +{ + std::string Name; + bool IsEnabled; + Property(const std::string& name, bool isEnabled = false) + : Name(name) + , IsEnabled(isEnabled) + {} +}; + +inline bool operator<(const Property& lhs, const Property& rhs) +{ + return lhs.Name < rhs.Name; +} +inline bool operator==(const Property& lhs, const Property& rhs) +{ + return lhs.Name == rhs.Name; +} + +} // namespace BamTools + +#endif // BAMTOOLS_FILTER_PROPERTIES_H diff --git a/src/utils/bamtools_filter_ruleparser.h b/src/utils/bamtools_filter_ruleparser.h new file mode 100644 index 0000000..15a6ada --- /dev/null +++ b/src/utils/bamtools_filter_ruleparser.h @@ -0,0 +1,337 @@ +// *************************************************************************** +// bamtools_filter_ruleparser.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 +// --------------------------------------------------------------------------- +// Provides a compound rule parser for FilterEngine. 
+// *************************************************************************** + +#ifndef BAMTOOLS_FILTER_RULEPARSER_H +#define BAMTOOLS_FILTER_RULEPARSER_H + +#include <queue> +#include <stack> +#include <string> +#include "utils/bamtools_utilities.h" + +namespace BamTools { + +// ------------------------------------------- +// char constants + +const char LEFT_PARENTHESIS_CHAR = '('; +const char RIGHT_PARENTHESIS_CHAR = ')'; +const char AND_OPERATOR_CHAR = '&'; +const char OR_OPERATOR_CHAR = '|'; +const char NOT_OPERATOR_CHAR = '!'; +const char SPACE_CHAR = ' '; + +// ------------------------------------------- +// RuleToken implementation + +struct RuleToken +{ + + // enums + enum RuleTokenType + { + OPERAND = 0, + AND_OPERATOR, + OR_OPERATOR, + NOT_OPERATOR, + LEFT_PARENTHESIS, + RIGHT_PARENTHESIS + }; + + // data members + RuleTokenType Type; + std::string Value; +}; + +inline int priority(const RuleToken& token) +{ + switch (token.Type) { + case (RuleToken::NOT_OPERATOR): + return 3; + case (RuleToken::AND_OPERATOR): + return 2; + case (RuleToken::OR_OPERATOR): + return 1; + case (RuleToken::LEFT_PARENTHESIS): + return 0; + case (RuleToken::RIGHT_PARENTHESIS): + return 0; + default: + BAMTOOLS_ASSERT_UNREACHABLE; + return -1; + } +} + +inline bool isRightAssociative(const RuleToken& token) +{ + return (token.Type == RuleToken::NOT_OPERATOR || token.Type == RuleToken::LEFT_PARENTHESIS); +} + +inline bool isLeftAssociative(const RuleToken& token) +{ + return !isRightAssociative(token); +} + +inline bool isLeftParenthesis(const RuleToken& token) +{ + return (token.Type == RuleToken::LEFT_PARENTHESIS); +} + +inline bool isRightParenthesis(const RuleToken& token) +{ + return (token.Type == RuleToken::RIGHT_PARENTHESIS); +} + +inline bool isOperand(const RuleToken& token) +{ + return (token.Type == RuleToken::OPERAND); +} + +inline bool isOperator(const RuleToken& token) +{ + return (token.Type == RuleToken::AND_OPERATOR || token.Type == RuleToken::OR_OPERATOR || 
+ token.Type == RuleToken::NOT_OPERATOR); +} + +// ------------------------------------------- +// RuleParser implementation + +class RuleParser +{ + + // ctor & dtor +public: + RuleParser(const std::string& ruleString) + : m_ruleString(ruleString) + { + // initialize char markers + m_begin = (char*)m_ruleString.c_str(); + m_end = m_begin + m_ruleString.length(); + ignoreQuotes(); + } + + ~RuleParser() {} + + // public interface +public: + void parse(); + std::queue<std::string> results() const + { + return m_ruleQueue; + } + + // internal methods +private: + char getNextChar(); + void ignoreQuotes(); + bool readToken(RuleToken& token); + void skipSpaces(); + + // data members +private: + std::string m_ruleString; + char* m_begin; + char* m_current; + char* m_end; + + std::queue<std::string> m_ruleQueue; + std::stack<RuleToken> m_operatorStack; +}; + +inline char RuleParser::getNextChar() +{ + if (m_current == m_end) return 0; + return *m_current++; +} + +inline void RuleParser::ignoreQuotes() +{ + if (*m_begin == '\"') ++m_begin; + if (*m_end == '\"') --m_end; +} + +inline void RuleParser::parse() +{ + + // clear out any prior data + while (!m_ruleQueue.empty()) + m_ruleQueue.pop(); + + // skip if no rule to parse + if (m_ruleString.empty()) return; + + // start at beginning of ruleString + m_current = m_begin; + + // iterate through tokens in rule string + RuleToken token; + while (readToken(token)) { + + if (token.Value.empty()) break; + + // if token is an operand + if (isOperand(token)) m_ruleQueue.push(token.Value); + + // if token is an operator + else if (isOperator(token)) { + + // pop any operators at top of stack with higher priority + while (!m_operatorStack.empty()) { + const RuleToken& opToken = m_operatorStack.top(); + if ((isLeftAssociative(token) && (priority(token) <= priority(opToken))) || + (isRightAssociative(token) && (priority(token) < priority(opToken)))) { + m_ruleQueue.push(opToken.Value); + m_operatorStack.pop(); + } else + break; + } + + 
// push current operator token onto stack + m_operatorStack.push(token); + } + + // if token is left parenthesis + else if (isLeftParenthesis(token)) + m_operatorStack.push(token); + + // if token is right parenthesis + else if (isRightParenthesis(token)) { + + bool foundLeftParenthesis = false; + + // push operators into rule queue until left parenthesis found + while (!m_operatorStack.empty() && !foundLeftParenthesis) { + const RuleToken& opToken = m_operatorStack.top(); + if (!isLeftParenthesis(opToken)) + m_ruleQueue.push(opToken.Value); + else + foundLeftParenthesis = true; + m_operatorStack.pop(); + } + + // no left parenthesis found, error + BAMTOOLS_ASSERT_MESSAGE(foundLeftParenthesis, + "ERROR: Mismatched parenthesis in rule string.1"); + } + + // error: unknown operand + else + BAMTOOLS_ASSERT_UNREACHABLE; + } + + // while there are still operators on stack + while (!m_operatorStack.empty()) { + const RuleToken& token = m_operatorStack.top(); + BAMTOOLS_ASSERT_MESSAGE((!isLeftParenthesis(token) && !isRightParenthesis(token)), + "ERROR: Mismatched parenthesis in rule string.2"); + m_ruleQueue.push(token.Value); + m_operatorStack.pop(); + } +} + +inline bool RuleParser::readToken(RuleToken& token) +{ + + // skip any preceding whitespace + skipSpaces(); + if (m_current == m_end) return false; + + // clear out prior token value + token.Value.clear(); + + // read chars while still in token + char c = 1; + bool keepReading = true; + bool inOperandString = false; + while (keepReading && (c != 0)) { + + // get next char + c = getNextChar(); + switch (c) { + + // current char is '(' + case (LEFT_PARENTHESIS_CHAR): + token.Type = RuleToken::LEFT_PARENTHESIS; + token.Value.append(1, LEFT_PARENTHESIS_CHAR); + keepReading = false; + break; + + // current char is ')' + case (RIGHT_PARENTHESIS_CHAR): + if (inOperandString) + --m_current; + else { + token.Type = RuleToken::RIGHT_PARENTHESIS; + token.Value.append(1, RIGHT_PARENTHESIS_CHAR); + } + keepReading = false; + 
break; + + // current char is '&' + case (AND_OPERATOR_CHAR): + if (inOperandString) + --m_current; + else { + token.Type = RuleToken::AND_OPERATOR; + token.Value.append(1, AND_OPERATOR_CHAR); + } + keepReading = false; + break; + + // current char is '|' + case (OR_OPERATOR_CHAR): + if (inOperandString) + --m_current; + else { + token.Type = RuleToken::OR_OPERATOR; + token.Value.append(1, OR_OPERATOR_CHAR); + } + keepReading = false; + break; + + // current char is '!' + case (NOT_OPERATOR_CHAR): + token.Type = RuleToken::NOT_OPERATOR; + token.Value.append(1, NOT_OPERATOR_CHAR); + keepReading = false; + break; + + // current char is ' ' + case (SPACE_CHAR): + keepReading = false; + break; + + // current char is a true value token + default: + if (c != 0) { + token.Type = RuleToken::OPERAND; + token.Value.append(1, c); + inOperandString = true; + keepReading = true; + } + } + } + + return true; +} + +inline void RuleParser::skipSpaces() +{ + while (m_current != m_end) { + const char c = *m_current; + if (c == ' ' || c == '\t' || c == '\r' || c == '\n') + ++m_current; + else + break; + } +} + +} // namespace BamTools + +#endif // BAMTOOLS_FILTER_RULEPARSER_H diff --git a/src/utils/bamtools_options.cpp b/src/utils/bamtools_options.cpp new file mode 100644 index 0000000..115581b --- /dev/null +++ b/src/utils/bamtools_options.cpp @@ -0,0 +1,305 @@ +// *************************************************************************** +// bamtools_options.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 +// --------------------------------------------------------------------------- +// Parses command line arguments and creates a help menu +// --------------------------------------------------------------------------- +// Modified from: +// The Mosaik suite's command line parser class: COptions +// (c) 2006 - 2009 
Michael Str�mberg +// Marth Lab, Department of Biology, Boston College +// Re-licensed under MIT License with author's permission. +// +// * Modified slightly to fit BamTools, otherwise code is same. +// * (BamTools namespace, added stdin/stdout) (DB) +// *************************************************************************** + +#include "utils/bamtools_options.h" +using namespace BamTools; + +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <iomanip> +#include <sstream> + +std::string Options::m_programName; // the program name +std::string Options::m_description; // the main description +std::string Options::m_exampleArguments; // the example arguments +std::vector<OptionGroup> Options::m_optionGroups; // stores the option groups +std::map<std::string, OptionValue> Options::m_optionsMap; // stores the options in a map +const std::string Options::m_stdin = "stdin"; // string representation of stdin +const std::string Options::m_stdout = "stdout"; // string representation of stdout + +// adds a simple option to the parser +void Options::AddOption(const std::string& argument, const std::string& optionDescription, + bool& foundArgument, OptionGroup* group) +{ + Option o; + o.Argument = argument; + o.Description = optionDescription; + o.StoreValue = false; + group->Options.push_back(o); + + OptionValue ov; + ov.pFoundArgument = &foundArgument; + ov.StoreValue = false; + + m_optionsMap[argument] = ov; +} + +// creates an option group +OptionGroup* Options::CreateOptionGroup(const std::string& groupName) +{ + OptionGroup og; + og.Name = groupName; + m_optionGroups.push_back(og); + return &m_optionGroups[m_optionGroups.size() - 1]; +} + +// displays the help menu +void Options::DisplayHelp() +{ + + // initialize + char argumentBuffer[ARGUMENT_LENGTH + 1]; + std::ostringstream sb; + + char indentBuffer[MAX_LINE_LENGTH - DESC_LENGTH + 1]; + memset(indentBuffer, ' ', MAX_LINE_LENGTH - DESC_LENGTH); + indentBuffer[MAX_LINE_LENGTH - DESC_LENGTH] = 0; + 
+ // display the menu + printf("Description: %s.\n\n", m_description.c_str()); + printf("Usage: "); + printf("%s", m_programName.c_str()); + printf(" %s\n\n", m_exampleArguments.c_str()); + + std::vector<Option>::const_iterator optionIter; + std::vector<OptionGroup>::const_iterator groupIter; + for (groupIter = m_optionGroups.begin(); groupIter != m_optionGroups.end(); ++groupIter) { + + printf("%s:\n", groupIter->Name.c_str()); + + for (optionIter = groupIter->Options.begin(); optionIter != groupIter->Options.end(); + ++optionIter) { + + if (optionIter->StoreValue) + snprintf(argumentBuffer, ARGUMENT_LENGTH + 1, " %s <%s>", + optionIter->Argument.c_str(), optionIter->ValueDescription.c_str()); + else + snprintf(argumentBuffer, ARGUMENT_LENGTH + 1, " %s", optionIter->Argument.c_str()); + printf("%-35s ", argumentBuffer); + + std::string description = optionIter->Description; + + // handle default values + if (optionIter->HasDefaultValue) { + + sb.str(std::string()); + sb << description << " ["; + + if (optionIter->DefaultValue.is_type<unsigned int>()) { + sb << (unsigned int)optionIter->DefaultValue; + } else if (optionIter->DefaultValue.is_type<unsigned char>()) { + sb << (unsigned short)(unsigned char)optionIter->DefaultValue; + } else if (optionIter->DefaultValue.is_type<float>()) { + sb << std::fixed << std::setprecision(2) << (float)optionIter->DefaultValue; + } else if (optionIter->DefaultValue.is_type<double>()) { + sb << std::fixed << std::setprecision(4) << (double)optionIter->DefaultValue; + } else if (optionIter->DefaultValue.is_type<std::string>()) { + const std::string stringValue = optionIter->DefaultValue; + sb << stringValue; + } else { + printf( + "ERROR: Found an unsupported data type for argument %s when casting the " + "default value.\n", + optionIter->Argument.c_str()); + std::exit(EXIT_FAILURE); + } + + sb << ']'; + description = sb.str(); + } + + if (description.size() <= DESC_LENGTH_FIRST_ROW) { + printf("%s\n", description.c_str()); + } 
else { + + // handle the first row + const char* pDescription = description.data(); + unsigned int cutIndex = DESC_LENGTH_FIRST_ROW; + while (pDescription[cutIndex] != ' ') + cutIndex--; + printf("%s\n", description.substr(0, cutIndex).c_str()); + description = description.substr(cutIndex + 1); + + // handle subsequent rows + while (description.size() > DESC_LENGTH) { + pDescription = description.data(); + cutIndex = DESC_LENGTH; + while (pDescription[cutIndex] != ' ') + cutIndex--; + printf("%s%s\n", indentBuffer, description.substr(0, cutIndex).c_str()); + description = description.substr(cutIndex + 1); + } + + // handle last row + printf("%s%s\n", indentBuffer, description.c_str()); + } + } + + printf("\n"); + } + + printf("Help:\n"); + printf(" --help, -h shows this help text\n"); + std::exit(EXIT_FAILURE); +} + +// parses the command line +void Options::Parse(int argc, char* argv[], int offset) +{ + + // initialize + std::map<std::string, OptionValue>::const_iterator ovMapIter; + std::map<std::string, OptionValue>::const_iterator checkMapIter; + const int LAST_INDEX = argc - 1; + std::ostringstream errorBuilder; + bool foundError = false; + char* end_ptr = NULL; + const std::string ERROR_SPACER(7, ' '); + + // check if we should show the help menu + bool showHelpMenu = false; + if (argc > 1) { + for (int i = 1; i < argc; i++) { + const std::string argument = argv[i]; + if ((argument == "-h") || (argument == "--help") || (argument == "help")) + showHelpMenu = true; + } + } else + showHelpMenu = true; + + if (showHelpMenu) DisplayHelp(); + + // check each argument + for (int i = offset + 1; i < argc; i++) { + + const std::string argument = argv[i]; + ovMapIter = m_optionsMap.find(argument); + + if (ovMapIter == m_optionsMap.end()) { + errorBuilder << ERROR_SPACER << "An unrecognized argument was found: " << argument + << std::endl; + foundError = true; + } else { + + *ovMapIter->second.pFoundArgument = true; + + // grab the value + if 
(ovMapIter->second.StoreValue) { + + if (i < LAST_INDEX) { + + // check if the next argument is really a command line option + const std::string val = argv[i + 1]; + checkMapIter = m_optionsMap.find(val); + + if (checkMapIter == m_optionsMap.end()) { + + ++i; + + if (ovMapIter->second.VariantValue.is_type<unsigned int>()) { + const unsigned int uint32 = + (unsigned int)strtoul(val.c_str(), &end_ptr, 10); + unsigned int* varValue = (unsigned int*)ovMapIter->second.pValue; + *varValue = uint32; + } else if (ovMapIter->second.VariantValue.is_type<unsigned char>()) { + const unsigned char uint8 = + (unsigned char)strtoul(val.c_str(), &end_ptr, 10); + unsigned char* varValue = (unsigned char*)ovMapIter->second.pValue; + *varValue = uint8; + } else if (ovMapIter->second.VariantValue.is_type<uint64_t>()) { + const uint64_t uint64 = strtoui64(val.c_str(), &end_ptr, 10); + uint64_t* varValue = (uint64_t*)ovMapIter->second.pValue; + *varValue = uint64; + } else if (ovMapIter->second.VariantValue.is_type<double>()) { + const double d = strtod(val.c_str(), &end_ptr); + double* varValue = (double*)ovMapIter->second.pValue; + *varValue = d; + } else if (ovMapIter->second.VariantValue.is_type<float>()) { + const float f = (float)strtod(val.c_str(), &end_ptr); + float* varValue = (float*)ovMapIter->second.pValue; + *varValue = f; + } else if (ovMapIter->second.VariantValue.is_type<std::string>()) { + std::string* pStringValue = (std::string*)ovMapIter->second.pValue; + *pStringValue = val; + } else if (ovMapIter->second.VariantValue + .is_type<std::vector<std::string> >()) { + std::vector<std::string>* pVectorValue = + (std::vector<std::string>*)ovMapIter->second.pValue; + pVectorValue->push_back(val); + } else { + printf( + "ERROR: Found an unsupported data type for argument %s when " + "parsing the arguments.\n", + argument.c_str()); + std::exit(EXIT_FAILURE); + } + } else { + errorBuilder << ERROR_SPACER << "The argument (" << argument + << ") expects a value, but none was 
found." << std::endl; + foundError = true; + } + } else { + errorBuilder << ERROR_SPACER << "The argument (" << argument + << ") expects a value, but none was found." << std::endl; + foundError = true; + } + } + } + } + + // check if we missed any required parameters + for (ovMapIter = m_optionsMap.begin(); ovMapIter != m_optionsMap.end(); ++ovMapIter) { + if (ovMapIter->second.IsRequired && !*ovMapIter->second.pFoundArgument) { + errorBuilder << ERROR_SPACER << ovMapIter->second.ValueTypeDescription + << " was not specified. Please use the " << ovMapIter->first + << " parameter." << std::endl; + foundError = true; + } + } + + // print the errors if any were found + if (foundError) { + printf("ERROR: Some problems were encountered when parsing the command line options:\n"); + printf("%s\n", errorBuilder.str().c_str()); + printf("For a complete list of command line options, type \"%s help %s\"\n", argv[0], + argv[1]); + std::exit(EXIT_FAILURE); + } +} + +// sets the program info +void Options::SetProgramInfo(const std::string& programName, const std::string& description, + const std::string& arguments) +{ + m_programName = programName; + m_description = description; + m_exampleArguments = arguments; +} + +// return string representations of stdin +const std::string& Options::StandardIn() +{ + return m_stdin; +} + +// return string representations of stdout +const std::string& Options::StandardOut() +{ + return m_stdout; +} diff --git a/src/utils/bamtools_options.h b/src/utils/bamtools_options.h new file mode 100644 index 0000000..d47b7b8 --- /dev/null +++ b/src/utils/bamtools_options.h @@ -0,0 +1,200 @@ +// *************************************************************************** +// bamtools_options.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 +// 
--------------------------------------------------------------------------- +// Parses command line arguments and creates a help menu +// --------------------------------------------------------------------------- +// Modified from: +// The Mosaik suite's command line parser class: COptions +// (c) 2006 - 2009 Michael Str�mberg +// Marth Lab, Department of Biology, Boston College +// Re-licensed under MIT License with author's permission. +// +// * Modified slightly to fit BamTools, otherwise code is same. +// * (BamTools namespace, added stdin/stdout) (DB) +// *************************************************************************** + +#ifndef BAMTOOLS_OPTIONS_H +#define BAMTOOLS_OPTIONS_H + +#include "utils/bamtools_variant.h" +#include "utils/utils_global.h" + +#include <map> +#include <string> +#include <vector> + +#ifndef WIN32 +#include <stdint.h> +#endif + +namespace BamTools { + +#define ARGUMENT_LENGTH 35 +#define DESC_LENGTH_FIRST_ROW 30 +#define DESC_LENGTH 42 +#define MAX_LINE_LENGTH 78 + +#ifdef WIN32 +#define snprintf _snprintf +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#define strtoui64 _strtoui64 +#else +#define strtoui64 strtoull +#endif + +struct UTILS_EXPORT Option +{ + + // data members + std::string Argument; + std::string ValueDescription; + std::string Description; + bool StoreValue; + bool HasDefaultValue; + Variant DefaultValue; + + // constructor + Option() + : StoreValue(true) + , HasDefaultValue(false) + {} +}; + +struct UTILS_EXPORT OptionValue +{ + + // data members + bool* pFoundArgument; + void* pValue; + std::string ValueTypeDescription; + bool UseVector; + bool StoreValue; + bool IsRequired; + Variant VariantValue; + + // constructor + OptionValue() + : pFoundArgument(NULL) + , pValue(NULL) + , UseVector(false) + , StoreValue(true) + , IsRequired(false) + {} +}; + +struct UTILS_EXPORT OptionGroup +{ + std::string Name; + std::vector<Option> Options; +}; + +class UTILS_EXPORT Options +{ + + // add 
option/argument rules +public: + // adds a simple option to the parser + static void AddOption(const std::string& argument, const std::string& optionDescription, + bool& foundArgument, OptionGroup* group); + + // adds a value option to the parser + template <typename T> + static void AddValueOption(const std::string& argument, const std::string& valueDescription, + const std::string& optionDescription, + const std::string& valueTypeDescription, bool& foundArgument, T& val, + OptionGroup* group); + + // adds a value option to the parser (with a default value) + template <typename T, typename D> + static void AddValueOption(const std::string& argument, const std::string& valueDescription, + const std::string& optionDescription, + const std::string& valueTypeDescription, bool& foundArgument, T& val, + OptionGroup* group, D& defaultValue); + + // other API methods +public: + // creates an option group + static OptionGroup* CreateOptionGroup(const std::string& groupName); + // displays the help menu + static void DisplayHelp(); + // parses the command line + static void Parse(int argc, char* argv[], int offset = 0); + // sets the program info + static void SetProgramInfo(const std::string& programName, const std::string& description, + const std::string& arguments); + // returns string representation of stdin + static const std::string& StandardIn(); + // returns string representation of stdout + static const std::string& StandardOut(); + + // static data members +private: + // the program name + static std::string m_programName; + // the main description + static std::string m_description; + // the example arguments + static std::string m_exampleArguments; + // stores the option groups + static std::vector<OptionGroup> m_optionGroups; + // stores the options in a map + static std::map<std::string, OptionValue> m_optionsMap; + // string representation of stdin + static const std::string m_stdin; + // string representation of stdout + static const std::string m_stdout; 
+}; + +// adds a value option to the parser +template <typename T> +void Options::AddValueOption(const std::string& argument, const std::string& valueDescription, + const std::string& optionDescription, + const std::string& valueTypeDescription, bool& foundArgument, T& val, + OptionGroup* group) +{ + Option o; + o.Argument = argument; + o.ValueDescription = valueDescription; + o.Description = optionDescription; + group->Options.push_back(o); + + OptionValue ov; + ov.pFoundArgument = &foundArgument; + ov.pValue = (void*)&val; + ov.VariantValue = val; + ov.IsRequired = (valueTypeDescription.empty() ? false : true); + ov.ValueTypeDescription = valueTypeDescription; + m_optionsMap[argument] = ov; +} + +// adds a value option to the parser (with a default value) +template <typename T, typename D> +void Options::AddValueOption(const std::string& argument, const std::string& valueDescription, + const std::string& optionDescription, + const std::string& valueTypeDescription, bool& foundArgument, T& val, + OptionGroup* group, D& defaultValue) +{ + Option o; + o.Argument = argument; + o.ValueDescription = valueDescription; + o.Description = optionDescription; + o.DefaultValue = defaultValue; + o.HasDefaultValue = true; + group->Options.push_back(o); + + OptionValue ov; + ov.pFoundArgument = &foundArgument; + ov.pValue = (void*)&val; + ov.VariantValue = val; + ov.IsRequired = (valueTypeDescription.empty() ? 
false : true); + ov.ValueTypeDescription = valueTypeDescription; + m_optionsMap[argument] = ov; +} + +} // namespace BamTools + +#endif // BAMTOOLS_OPTIONS_H diff --git a/src/utils/bamtools_pileup_engine.cpp b/src/utils/bamtools_pileup_engine.cpp new file mode 100644 index 0000000..8874bb7 --- /dev/null +++ b/src/utils/bamtools_pileup_engine.cpp @@ -0,0 +1,355 @@ +// *************************************************************************** +// bamtools_pileup_engine.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 9 March 2012 (DB) +// --------------------------------------------------------------------------- +// Provides pileup at position functionality for various tools. +// *************************************************************************** + +#include "utils/bamtools_pileup_engine.h" +using namespace BamTools; + +#include <cstddef> +#include <iostream> + +// --------------------------------------------- +// PileupEnginePrivate implementation + +struct PileupEngine::PileupEnginePrivate +{ + + // data members + int CurrentId; + int CurrentPosition; + std::vector<BamAlignment> CurrentAlignments; + PileupPosition CurrentPileupData; + + bool IsFirstAlignment; + std::vector<PileupVisitor*> Visitors; + + // ctor & dtor + PileupEnginePrivate() + : CurrentId(-1) + , CurrentPosition(-1) + , IsFirstAlignment(true) + {} + ~PileupEnginePrivate() {} + + // 'public' methods + bool AddAlignment(const BamAlignment& al); + void Flush(); + + // internal methods +private: + void ApplyVisitors(); + void ClearOldData(); + void CreatePileupData(); + void ParseAlignmentCigar(const BamAlignment& al); +}; + +bool PileupEngine::PileupEnginePrivate::AddAlignment(const BamAlignment& al) +{ + + // if first time + if (IsFirstAlignment) { + + // set initial markers + CurrentId = al.RefID; + CurrentPosition = al.Position; + + // store first 
entry + CurrentAlignments.clear(); + CurrentAlignments.push_back(al); + + // set flag & return + IsFirstAlignment = false; + return true; + } + + // if same reference + if (al.RefID == CurrentId) { + + // if same position, store and move on + if (al.Position == CurrentPosition) CurrentAlignments.push_back(al); + + // if less than CurrentPosition - sorting error => ABORT + else if (al.Position < CurrentPosition) { + std::cerr << "Pileup::Run() : Data not sorted correctly!" << std::endl; + return false; + } + + // else print pileup data until 'catching up' to CurrentPosition + else { + while (al.Position > CurrentPosition) { + ApplyVisitors(); + ++CurrentPosition; + } + CurrentAlignments.push_back(al); + } + } + + // if reference ID less than CurrentId - sorting error => ABORT + else if (al.RefID < CurrentId) { + std::cerr << "Pileup::Run() : Data not sorted correctly!" << std::endl; + return false; + } + + // else moved forward onto next reference + else { + + // print any remaining pileup data from previous reference + while (!CurrentAlignments.empty()) { + ApplyVisitors(); + ++CurrentPosition; + } + + // store first entry on this new reference, update markers + CurrentAlignments.clear(); + CurrentAlignments.push_back(al); + CurrentId = al.RefID; + CurrentPosition = al.Position; + } + + return true; +} + +void PileupEngine::PileupEnginePrivate::ApplyVisitors() +{ + + // parse CIGAR data in BamAlignments to build up current pileup data + CreatePileupData(); + + // apply all visitors to current alignment set + std::vector<PileupVisitor*>::const_iterator visitorIter = Visitors.begin(); + std::vector<PileupVisitor*>::const_iterator visitorEnd = Visitors.end(); + for (; visitorIter != visitorEnd; ++visitorIter) + (*visitorIter)->Visit(CurrentPileupData); +} + +void PileupEngine::PileupEnginePrivate::ClearOldData() +{ + + // remove any alignments that end before our CurrentPosition + // N.B. - BAM positions are 0-based, half-open. 
GetEndPosition() returns a 1-based position, + // while our CurrentPosition is 0-based. For example, an alignment with 'endPosition' of + // 100 does not overlap a 'CurrentPosition' of 100, and should be discarded. + + std::size_t i = 0; + std::size_t j = 0; + const std::size_t numAlignments = CurrentAlignments.size(); + while (i < numAlignments) { + + // skip over alignment if its (1-based) endPosition is <= to (0-based) CurrentPosition + // i.e. this entry will not be saved upon vector resize + const int endPosition = CurrentAlignments[i].GetEndPosition(); + if (endPosition <= CurrentPosition) { + ++i; + continue; + } + + // otherwise alignment ends after CurrentPosition + // move it towards vector beginning, at index j + if (i != j) CurrentAlignments[j] = CurrentAlignments[i]; + + // increment our indices + ++i; + ++j; + } + + // 'squeeze' vector to size j, discarding all remaining alignments in the container + CurrentAlignments.resize(j); +} + +void PileupEngine::PileupEnginePrivate::CreatePileupData() +{ + + // remove any non-overlapping alignments + ClearOldData(); + + // set pileup refId, position to current markers + CurrentPileupData.RefId = CurrentId; + CurrentPileupData.Position = CurrentPosition; + CurrentPileupData.PileupAlignments.clear(); + + // parse CIGAR data in remaining alignments + std::vector<BamAlignment>::const_iterator alIter = CurrentAlignments.begin(); + std::vector<BamAlignment>::const_iterator alEnd = CurrentAlignments.end(); + for (; alIter != alEnd; ++alIter) + ParseAlignmentCigar((*alIter)); +} + +void PileupEngine::PileupEnginePrivate::Flush() +{ + while (!CurrentAlignments.empty()) { + ApplyVisitors(); + ++CurrentPosition; + } +} + +void PileupEngine::PileupEnginePrivate::ParseAlignmentCigar(const BamAlignment& al) +{ + + // skip if unmapped + if (!al.IsMapped()) return; + + // intialize local variables + int genomePosition = al.Position; + int positionInAlignment = 0; + bool isNewReadSegment = true; + bool saveAlignment = true; + 
PileupAlignment pileupAlignment(al); + + // iterate over CIGAR operations + const int numCigarOps = (const int)al.CigarData.size(); + for (int i = 0; i < numCigarOps; ++i) { + const CigarOp& op = al.CigarData.at(i); + + // if op is MATCH + if (op.Type == 'M') { + + // if match op overlaps current position + if (genomePosition + (int)op.Length > CurrentPosition) { + + // set pileup data + pileupAlignment.IsCurrentDeletion = false; + pileupAlignment.IsNextDeletion = false; + pileupAlignment.IsNextInsertion = false; + pileupAlignment.PositionInAlignment = + positionInAlignment + (CurrentPosition - genomePosition); + + // check for beginning of read segment + if (genomePosition == CurrentPosition && isNewReadSegment) + pileupAlignment.IsSegmentBegin = true; + + // if we're at the end of a match operation + if (genomePosition + (int)op.Length - 1 == CurrentPosition) { + + // if not last operation + if (i < numCigarOps - 1) { + + // check next CIGAR op + const CigarOp& nextOp = al.CigarData.at(i + 1); + + // if next CIGAR op is DELETION + if (nextOp.Type == 'D') { + pileupAlignment.IsNextDeletion = true; + pileupAlignment.DeletionLength = nextOp.Length; + } + + // if next CIGAR op is INSERTION + else if (nextOp.Type == 'I') { + pileupAlignment.IsNextInsertion = true; + pileupAlignment.InsertionLength = nextOp.Length; + } + + // if next CIGAR op is either DELETION or INSERTION + if (nextOp.Type == 'D' || nextOp.Type == 'I') { + + // if there is a CIGAR op after the DEL/INS + if (i < numCigarOps - 2) { + const CigarOp& nextNextOp = al.CigarData.at(i + 2); + + // if next CIGAR op is clipping or ref_skip + if (nextNextOp.Type == 'S' || nextNextOp.Type == 'N' || + nextNextOp.Type == 'H') + pileupAlignment.IsSegmentEnd = true; + } else { + pileupAlignment.IsSegmentEnd = true; + + // if next CIGAR op is clipping or ref_skip + if (nextOp.Type == 'S' || nextOp.Type == 'N' || nextOp.Type == 'H') + pileupAlignment.IsSegmentEnd = true; + } + } + + // otherwise + else { + + // if 
next CIGAR op is clipping or ref_skip + if (nextOp.Type == 'S' || nextOp.Type == 'N' || nextOp.Type == 'H') + pileupAlignment.IsSegmentEnd = true; + } + } + + // else this is last operation + else + pileupAlignment.IsSegmentEnd = true; + } + } + + // increment markers + genomePosition += op.Length; + positionInAlignment += op.Length; + } + + // if op is DELETION + else if (op.Type == 'D') { + + // if deletion op overlaps current position + if (genomePosition + (int)op.Length > CurrentPosition) { + + // set pileup data + pileupAlignment.IsCurrentDeletion = true; + pileupAlignment.IsNextDeletion = false; + pileupAlignment.IsNextInsertion = true; + pileupAlignment.PositionInAlignment = + positionInAlignment + (CurrentPosition - genomePosition); + } + + // increment marker + genomePosition += op.Length; + } + + // if op is REF_SKIP + else if (op.Type == 'N') { + genomePosition += op.Length; + } + + // if op is INSERTION or SOFT_CLIP + else if (op.Type == 'I' || op.Type == 'S') { + positionInAlignment += op.Length; + } + + // checl for beginning of new read segment + if (op.Type == 'N' || op.Type == 'S' || op.Type == 'H') + isNewReadSegment = true; + else + isNewReadSegment = false; + + // if we've moved beyond current position + if (genomePosition > CurrentPosition) { + if (op.Type == 'N') saveAlignment = false; // ignore alignment if REF_SKIP + break; + } + } + + // save pileup position if flag is true + if (saveAlignment) CurrentPileupData.PileupAlignments.push_back(pileupAlignment); +} + +// --------------------------------------------- +// PileupEngine implementation + +PileupEngine::PileupEngine() + : d(new PileupEnginePrivate) +{} + +PileupEngine::~PileupEngine() +{ + delete d; + d = 0; +} + +bool PileupEngine::AddAlignment(const BamAlignment& al) +{ + return d->AddAlignment(al); +} +void PileupEngine::AddVisitor(PileupVisitor* visitor) +{ + d->Visitors.push_back(visitor); +} +void PileupEngine::Flush() +{ + d->Flush(); +} diff --git 
a/src/utils/bamtools_pileup_engine.h b/src/utils/bamtools_pileup_engine.h new file mode 100644 index 0000000..5533c01 --- /dev/null +++ b/src/utils/bamtools_pileup_engine.h @@ -0,0 +1,98 @@ +// *************************************************************************** +// bamtools_pileup_engine.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 +// --------------------------------------------------------------------------- +// Provides pileup at position functionality for various tools. +// *************************************************************************** + +#ifndef BAMTOOLS_PILEUP_ENGINE_H +#define BAMTOOLS_PILEUP_ENGINE_H + +#include "utils/utils_global.h" + +#include <api/BamAlignment.h> +#include <vector> + +namespace BamTools { + +// contains auxiliary data about a single BamAlignment +// at current position considered +struct UTILS_EXPORT PileupAlignment +{ + + // data members + BamAlignment Alignment; + int32_t PositionInAlignment; + bool IsCurrentDeletion; + bool IsNextDeletion; + bool IsNextInsertion; + int DeletionLength; + int InsertionLength; + bool IsSegmentBegin; + bool IsSegmentEnd; + + // ctor + PileupAlignment(const BamAlignment& al) + : Alignment(al) + , PositionInAlignment(-1) + , IsCurrentDeletion(false) + , IsNextDeletion(false) + , IsNextInsertion(false) + , DeletionLength(0) + , InsertionLength(0) + , IsSegmentBegin(false) + , IsSegmentEnd(false) + {} +}; + +// contains all data at a position +struct UTILS_EXPORT PileupPosition +{ + + // data members + int RefId; + int Position; + std::vector<PileupAlignment> PileupAlignments; + + // ctor + PileupPosition(const int& refId = 0, const int& position = 0, + const std::vector<PileupAlignment>& alignments = std::vector<PileupAlignment>()) + : RefId(refId) + , Position(position) + , PileupAlignments(alignments) + {} +}; + +class 
UTILS_EXPORT PileupVisitor +{ + +public: + PileupVisitor() {} + virtual ~PileupVisitor() {} + +public: + virtual void Visit(const PileupPosition& pileupData) = 0; +}; + +class UTILS_EXPORT PileupEngine +{ + +public: + PileupEngine(); + ~PileupEngine(); + +public: + bool AddAlignment(const BamAlignment& al); + void AddVisitor(PileupVisitor* visitor); + void Flush(); + +private: + struct PileupEnginePrivate; + PileupEnginePrivate* d; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_PILEUP_ENGINE_H diff --git a/src/utils/bamtools_utilities.cpp b/src/utils/bamtools_utilities.cpp new file mode 100644 index 0000000..77a09b1 --- /dev/null +++ b/src/utils/bamtools_utilities.cpp @@ -0,0 +1,343 @@ +// *************************************************************************** +// bamtools_utilities.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 8 October 2011 +// --------------------------------------------------------------------------- +// Provides general utilities used by BamTools sub-tools. 
+// *************************************************************************** + +#include <api/BamMultiReader.h> +#include <api/BamReader.h> +#include <utils/bamtools_utilities.h> +using namespace BamTools; + +#include <algorithm> +#include <cstddef> +#include <cstdlib> +#include <cstring> +#include <fstream> +#include <iostream> +#include <sstream> + +namespace BamTools { + +const char REVCOMP_LOOKUP[] = {'T', 0, 'G', 'H', 0, 0, 'C', 'D', 0, 0, 0, 0, 'K', + 'N', 0, 0, 0, 'Y', 'W', 'A', 'A', 'B', 'S', 'X', 'R', 0}; + +} // namespace BamTools + +// returns true if 'source' contains 'pattern' +bool Utilities::Contains(const std::string& source, const std::string& pattern) +{ + return (source.find(pattern) != std::string::npos); +} + +// returns true if 'source' contains 'c' +bool Utilities::Contains(const std::string& source, const char c) +{ + return (source.find(c) != std::string::npos); +} + +// returns true if 'source' ends with 'pattern' +bool Utilities::EndsWith(const std::string& source, const std::string& pattern) +{ + return (source.find(pattern) == (source.length() - pattern.length())); +} + +// returns true if 'source' ends with 'c' +bool Utilities::EndsWith(const std::string& source, const char c) +{ + return (source.find(c) == (source.length() - 1)); +} + +// check if a file exists +bool Utilities::FileExists(const std::string& filename) +{ + std::ifstream f(filename.c_str(), std::ifstream::in); + return !f.fail(); +} + +// Parses a region string, does validation (valid ID's, positions), stores in Region struct +// Returns success (true/false) +bool Utilities::ParseRegionString(const std::string& regionString, const BamReader& reader, + BamRegion& region) +{ + // ------------------------------- + // parse region string + + // check first for empty string + if (regionString.empty()) return false; + + // non-empty string, look for a colom + std::size_t foundFirstColon = regionString.find(':'); + + // store chrom strings, and numeric positions + 
std::string startChrom; + std::string stopChrom; + int startPos; + int stopPos; + + // no colon found + // going to use entire contents of requested chromosome + // just store entire region string as startChrom name + // use BamReader methods to check if its valid for current BAM file + if (foundFirstColon == std::string::npos) { + startChrom = regionString; + startPos = 0; + stopChrom = regionString; + stopPos = 0; + } + + // colon found, so we at least have some sort of startPos requested + else { + + // store start chrom from beginning to first colon + startChrom = regionString.substr(0, foundFirstColon); + + // look for ".." after the colon + std::size_t foundRangeDots = regionString.find("..", foundFirstColon + 1); + + // no dots found + // so we have a startPos but no range + // store contents before colon as startChrom, after as startPos + if (foundRangeDots == std::string::npos) { + startPos = std::atoi(regionString.substr(foundFirstColon + 1).c_str()); + stopChrom = startChrom; + stopPos = -1; + } + + // ".." found, so we have some sort of range selected + else { + + // store startPos between first colon and range dots ".." 
+ startPos = std::atoi( + regionString.substr(foundFirstColon + 1, foundRangeDots - foundFirstColon - 1) + .c_str()); + + // look for second colon + std::size_t foundSecondColon = regionString.find(':', foundRangeDots + 1); + + // no second colon found + // so we have a "standard" chrom:start..stop input format (on single chrom) + if (foundSecondColon == std::string::npos) { + stopChrom = startChrom; + stopPos = std::atoi(regionString.substr(foundRangeDots + 2).c_str()); + } + + // second colon found + // so we have a range requested across 2 chrom's + else { + stopChrom = regionString.substr(foundRangeDots + 2, + foundSecondColon - (foundRangeDots + 2)); + stopPos = std::atoi(regionString.substr(foundSecondColon + 1).c_str()); + } + } + } + + // ------------------------------- + // validate reference IDs & genomic positions + + const RefVector references = reader.GetReferenceData(); + + // if startRefID not found, return false + int startRefID = reader.GetReferenceID(startChrom); + if (startRefID == -1) return false; + + // startPos cannot be greater than or equal to reference length + const RefData& startReference = references.at(startRefID); + if (startPos >= startReference.RefLength) return false; + + // if stopRefID not found, return false + int stopRefID = reader.GetReferenceID(stopChrom); + if (stopRefID == -1) return false; + + // stopPosition cannot be larger than reference length + const RefData& stopReference = references.at(stopRefID); + if (stopPos > stopReference.RefLength) return false; + + // if no stopPosition specified, set to reference end + if (stopPos == -1) stopPos = stopReference.RefLength; + + // ------------------------------- + // set up Region struct & return + + region.LeftRefID = startRefID; + region.LeftPosition = startPos; + region.RightRefID = stopRefID; + ; + region.RightPosition = stopPos; + return true; +} + +// Same as ParseRegionString() above, but accepts a BamMultiReader +bool Utilities::ParseRegionString(const std::string& 
regionString, const BamMultiReader& reader, + BamRegion& region) +{ + // ------------------------------- + // parse region string + + // check first for empty string + if (regionString.empty()) return false; + + // non-empty string, look for a colom + std::size_t foundFirstColon = regionString.find(':'); + + // store chrom strings, and numeric positions + std::string startChrom; + std::string stopChrom; + int startPos; + int stopPos; + + // no colon found + // going to use entire contents of requested chromosome + // just store entire region string as startChrom name + // use BamReader methods to check if its valid for current BAM file + if (foundFirstColon == std::string::npos) { + startChrom = regionString; + startPos = 0; + stopChrom = regionString; + stopPos = -1; + } + + // colon found, so we at least have some sort of startPos requested + else { + + // store start chrom from beginning to first colon + startChrom = regionString.substr(0, foundFirstColon); + + // look for ".." after the colon + std::size_t foundRangeDots = regionString.find("..", foundFirstColon + 1); + + // no dots found + // so we have a startPos but no range + // store contents before colon as startChrom, after as startPos + if (foundRangeDots == std::string::npos) { + startPos = std::atoi(regionString.substr(foundFirstColon + 1).c_str()); + stopChrom = startChrom; + stopPos = -1; + } + + // ".." found, so we have some sort of range selected + else { + + // store startPos between first colon and range dots ".." 
+ startPos = std::atoi( + regionString.substr(foundFirstColon + 1, foundRangeDots - foundFirstColon - 1) + .c_str()); + + // look for second colon + std::size_t foundSecondColon = regionString.find(':', foundRangeDots + 1); + + // no second colon found + // so we have a "standard" chrom:start..stop input format (on single chrom) + if (foundSecondColon == std::string::npos) { + stopChrom = startChrom; + stopPos = std::atoi(regionString.substr(foundRangeDots + 2).c_str()); + } + + // second colon found + // so we have a range requested across 2 chrom's + else { + stopChrom = regionString.substr(foundRangeDots + 2, + foundSecondColon - (foundRangeDots + 2)); + stopPos = std::atoi(regionString.substr(foundSecondColon + 1).c_str()); + } + } + } + + // ------------------------------- + // validate reference IDs & genomic positions + + const RefVector references = reader.GetReferenceData(); + + // if startRefID not found, return false + int startRefID = reader.GetReferenceID(startChrom); + if (startRefID == -1) return false; + + // startPos cannot be greater than or equal to reference length + const RefData& startReference = references.at(startRefID); + if (startPos >= startReference.RefLength) return false; + + // if stopRefID not found, return false + int stopRefID = reader.GetReferenceID(stopChrom); + if (stopRefID == -1) return false; + + // stopPosition cannot be larger than reference length + const RefData& stopReference = references.at(stopRefID); + if (stopPos > stopReference.RefLength) return false; + + // if no stopPosition specified, set to reference end + if (stopPos == -1) stopPos = stopReference.RefLength; + + // ------------------------------- + // set up Region struct & return + + region.LeftRefID = startRefID; + region.LeftPosition = startPos; + region.RightRefID = stopRefID; + ; + region.RightPosition = stopPos; + return true; +} + +void Utilities::Reverse(std::string& sequence) +{ + reverse(sequence.begin(), sequence.end()); +} + +void 
Utilities::ReverseComplement(std::string& sequence) +{ + + // do complement, in-place + std::size_t seqLength = sequence.length(); + for (std::size_t i = 0; i < seqLength; ++i) + sequence.replace(i, 1, 1, REVCOMP_LOOKUP[(int)sequence.at(i) - 65]); + + // reverse it + Reverse(sequence); +} + +std::vector<std::string> Utilities::Split(const std::string& source, const char delim) +{ + + std::stringstream ss(source); + std::string field; + std::vector<std::string> fields; + + while (std::getline(ss, field, delim)) + fields.push_back(field); + return fields; +} + +std::vector<std::string> Utilities::Split(const std::string& source, const std::string& delims) +{ + + std::vector<std::string> fields; + + char* tok; + char* cchars = new char[source.size() + 1]; + char* cstr = &cchars[0]; + strcpy(cstr, source.c_str()); + tok = strtok(cstr, delims.c_str()); + while (tok != NULL) { + fields.push_back(tok); + tok = strtok(NULL, delims.c_str()); + } + + delete[] cchars; + + return fields; +} + +// returns true if 'source' starts with 'pattern' +bool Utilities::StartsWith(const std::string& source, const std::string& pattern) +{ + return (source.find(pattern) == 0); +} + +// returns true if 'source' starts with 'c' +bool Utilities::StartsWith(const std::string& source, const char c) +{ + return (source.find(c) == 0); +} diff --git a/src/utils/bamtools_utilities.h b/src/utils/bamtools_utilities.h new file mode 100644 index 0000000..c85c452 --- /dev/null +++ b/src/utils/bamtools_utilities.h @@ -0,0 +1,64 @@ +// *************************************************************************** +// bamtools_utilities.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 7 October 2011 +// --------------------------------------------------------------------------- +// Provides general utilities used by BamTools sub-tools. 
+// *************************************************************************** + +#ifndef BAMTOOLS_UTILITIES_H +#define BAMTOOLS_UTILITIES_H + +#include <api/BamAux.h> +#include <utils/utils_global.h> +#include <string> +#include <vector> + +#define BAMTOOLS_ASSERT_UNREACHABLE BT_ASSERT_UNREACHABLE +#define BAMTOOLS_ASSERT_MESSAGE(condition, message) BT_ASSERT_X(condition, message) + +namespace BamTools { + +class BamReader; +class BamMultiReader; + +class UTILS_EXPORT Utilities +{ + +public: + // returns true if 'source' contains 'pattern' or 'c' + static bool Contains(const std::string& source, const std::string& pattern); + static bool Contains(const std::string& source, const char c); + + // returns true if 'source' ends with 'pattern' or 'c' + static bool EndsWith(const std::string& source, const std::string& pattern); + static bool EndsWith(const std::string& source, const char c); + + // check if a file exists + static bool FileExists(const std::string& fname); + + // Parses a region string, uses reader to do validation (valid ID's, positions), stores in Region struct + // Returns success (true/false) + static bool ParseRegionString(const std::string& regionString, const BamReader& reader, + BamRegion& region); + // Same as above, but accepts a BamMultiReader + static bool ParseRegionString(const std::string& regionString, const BamMultiReader& reader, + BamRegion& region); + + // sequence utilities + static void Reverse(std::string& sequence); + static void ReverseComplement(std::string& sequence); + + // split string on delimiter character (or string of allowed delimiters) + static std::vector<std::string> Split(const std::string& source, const char delim); + static std::vector<std::string> Split(const std::string& source, const std::string& delims); + + // returns true if 'source' starts with 'pattern' or 'c' + static bool StartsWith(const std::string& source, const std::string& pattern); + static bool StartsWith(const std::string& source, const char c); 
+}; + +} // namespace BamTools + +#endif // BAMTOOLS_UTILITIES_H diff --git a/src/utils/bamtools_variant.h b/src/utils/bamtools_variant.h new file mode 100644 index 0000000..50641bc --- /dev/null +++ b/src/utils/bamtools_variant.h @@ -0,0 +1,146 @@ +// *************************************************************************** +// bamtools_variant.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 +// --------------------------------------------------------------------------- +// Provides a template-based variant type +// --------------------------------------------------------------------------- +// Modified from: +// variant_t - An Improved Variant Type Based on Member Templates +// (c) 2000 Fernando Cacciola +// Dr. Dobb's (http://www.ddj.com/cpp/184401293) +// +// * Modified to be in BamTools namespace, otherwise code is same. (DB) +// *************************************************************************** + +#ifndef BAMTOOLS_VARIANT_H +#define BAMTOOLS_VARIANT_H + +#include <cstddef> +#include <stdexcept> +#include <string> +#include <typeinfo> +#include "utils/utils_global.h" + +namespace BamTools { + +class UTILS_EXPORT Variant +{ + +public: + Variant() + : data(NULL) + {} + + Variant(const Variant& other) + { + if (other.data != NULL) other.data->AddRef(); + data = other.data; + } + + ~Variant() + { + if (data != NULL) data->Release(); + } + + // NOTE: This code takes care of self-assignment. + // DO NOT CHANGE THE ORDER of the statements. + Variant& operator=(const Variant& rhs) + { + if (rhs.data != NULL) rhs.data->AddRef(); + if (data != NULL) data->Release(); + data = rhs.data; + return *this; + } + + // This member template constructor allows you to + // instance a variant_t object with a value of any type. 
+ template <typename T> + Variant(T v) + : data(new Impl<T>(v)) + { + data->AddRef(); + } + + // This generic conversion operator let you retrieve + // the value held. To avoid template specialization conflicts, + // it returns an instance of type T, which will be a COPY + // of the value contained. + template <typename T> + operator T() const + { + return CastFromBase<T>(data)->data; + } + + // This forms returns a REFERENCE and not a COPY, which + // will be significant in some cases. + template <typename T> + const T& get() const + { + return CastFromBase<T>(data)->data; + } + + template <typename T> + bool is_type() const + { + return typeid(*data) == typeid(Impl<T>); + } + + template <typename T> + bool is_type(T v) const + { + return typeid(*data) == typeid(v); + } + +private: + struct ImplBase + { + + ImplBase() + : refs(0) + {} + virtual ~ImplBase() {} + + void AddRef() + { + ++refs; + } + void Release() + { + --refs; + if (refs == 0) delete this; + } + + std::size_t refs; + }; + + template <typename T> + struct Impl : ImplBase + { + Impl(T v) + : data(v) + {} + ~Impl() {} + T data; + }; + + // The following method is static because it doesn't + // operate on variant_t instances. + template <typename T> + static Impl<T>* CastFromBase(ImplBase* v) + { + // This upcast will fail if T is other than the T used + // with the constructor of variant_t. 
+ Impl<T>* p = dynamic_cast<Impl<T>*>(v); + if (p == NULL) + throw std::invalid_argument(typeid(T).name() + std::string(" is not a valid type")); + return p; + } + + ImplBase* data; +}; + +} // namespace BamTools + +#endif // BAMTOOLS_VARIANT_H diff --git a/src/utils/utils_global.h b/src/utils/utils_global.h new file mode 100644 index 0000000..6080ec4 --- /dev/null +++ b/src/utils/utils_global.h @@ -0,0 +1,21 @@ +// *************************************************************************** +// utils_global.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides macros for exporting & importing BamTools-utils library symbols +// *************************************************************************** + +#ifndef UTILS_GLOBAL_H +#define UTILS_GLOBAL_H + +#include "shared/bamtools_global.h" + +#ifdef BAMTOOLS_UTILS_LIBRARY +#define UTILS_EXPORT BAMTOOLS_LIBRARY_EXPORT +#else +#define UTILS_EXPORT BAMTOOLS_LIBRARY_IMPORT +#endif + +#endif // UTILS_GLOBAL_H |