summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Michael R. Crusoe <michael.crusoe@gmail.com> 2018-04-02 18:47:32 +0200
committer Michael R. Crusoe <michael.crusoe@gmail.com> 2018-04-02 18:47:32 +0200
commit 92f4514af4d8bfb50daf300a2fe8825c5552702a (patch)
tree e57a5f8b7d788e0ed12c6c00a88c0904b3331fb2
Import bamtools_2.5.1+dfsg.orig.tar.gz
[dgit import orig bamtools_2.5.1+dfsg.orig.tar.gz]
-rw-r--r--CMakeLists.txt99
-rw-r--r--LICENSE22
-rw-r--r--README60
-rw-r--r--docs/Doxyfile1605
-rw-r--r--src/CMakeLists.txt20
-rw-r--r--src/ExportHeader.cmake27
-rw-r--r--src/api/BamAlgorithms.h21
-rw-r--r--src/api/BamAlignment.cpp1127
-rw-r--r--src/api/BamAlignment.h644
-rw-r--r--src/api/BamAux.h519
-rw-r--r--src/api/BamConstants.h323
-rw-r--r--src/api/BamIndex.h98
-rw-r--r--src/api/BamMultiReader.cpp442
-rw-r--r--src/api/BamMultiReader.h127
-rw-r--r--src/api/BamReader.cpp402
-rw-r--r--src/api/BamReader.h117
-rw-r--r--src/api/BamWriter.cpp155
-rw-r--r--src/api/BamWriter.h70
-rw-r--r--src/api/CMakeLists.txt77
-rw-r--r--src/api/IBamIODevice.h100
-rw-r--r--src/api/SamConstants.h97
-rw-r--r--src/api/SamHeader.cpp246
-rw-r--r--src/api/SamHeader.h78
-rw-r--r--src/api/SamProgram.cpp134
-rw-r--r--src/api/SamProgram.h66
-rw-r--r--src/api/SamProgramChain.cpp363
-rw-r--r--src/api/SamProgramChain.h86
-rw-r--r--src/api/SamReadGroup.cpp211
-rw-r--r--src/api/SamReadGroup.h73
-rw-r--r--src/api/SamReadGroupDictionary.cpp317
-rw-r--r--src/api/SamReadGroupDictionary.h87
-rw-r--r--src/api/SamSequence.cpp152
-rw-r--r--src/api/SamSequence.h66
-rw-r--r--src/api/SamSequenceDictionary.cpp321
-rw-r--r--src/api/SamSequenceDictionary.h87
-rw-r--r--src/api/algorithms/Sort.h364
-rw-r--r--src/api/api_global.h21
-rw-r--r--src/api/internal/CMakeLists.txt25
-rw-r--r--src/api/internal/bam/BamHeader_p.cpp132
-rw-r--r--src/api/internal/bam/BamHeader_p.h72
-rw-r--r--src/api/internal/bam/BamMultiMerger_p.h278
-rw-r--r--src/api/internal/bam/BamMultiReader_p.cpp905
-rw-r--r--src/api/internal/bam/BamMultiReader_p.h104
-rw-r--r--src/api/internal/bam/BamRandomAccessController_p.cpp302
-rw-r--r--src/api/internal/bam/BamRandomAccessController_p.h96
-rw-r--r--src/api/internal/bam/BamReader_p.cpp591
-rw-r--r--src/api/internal/bam/BamReader_p.h119
-rw-r--r--src/api/internal/bam/BamWriter_p.cpp599
-rw-r--r--src/api/internal/bam/BamWriter_p.h74
-rw-r--r--src/api/internal/bam/CMakeLists.txt19
-rw-r--r--src/api/internal/index/BamIndexFactory_p.cpp111
-rw-r--r--src/api/internal/index/BamIndexFactory_p.h49
-rw-r--r--src/api/internal/index/BamStandardIndex_p.cpp1023
-rw-r--r--src/api/internal/index/BamStandardIndex_p.h236
-rw-r--r--src/api/internal/index/BamToolsIndex_p.cpp677
-rw-r--r--src/api/internal/index/BamToolsIndex_p.h195
-rw-r--r--src/api/internal/index/CMakeLists.txt17
-rw-r--r--src/api/internal/io/BamDeviceFactory_p.cpp34
-rw-r--r--src/api/internal/io/BamDeviceFactory_p.h38
-rw-r--r--src/api/internal/io/BamFile_p.cpp73
-rw-r--r--src/api/internal/io/BamFile_p.h52
-rw-r--r--src/api/internal/io/BamFtp_p.cpp491
-rw-r--r--src/api/internal/io/BamFtp_p.h91
-rw-r--r--src/api/internal/io/BamHttp_p.cpp554
-rw-r--r--src/api/internal/io/BamHttp_p.h92
-rw-r--r--src/api/internal/io/BamPipe_p.cpp73
-rw-r--r--src/api/internal/io/BamPipe_p.h47
-rw-r--r--src/api/internal/io/BgzfStream_p.cpp468
-rw-r--r--src/api/internal/io/BgzfStream_p.h95
-rw-r--r--src/api/internal/io/ByteArray_p.cpp120
-rw-r--r--src/api/internal/io/ByteArray_p.h70
-rw-r--r--src/api/internal/io/CMakeLists.txt48
-rw-r--r--src/api/internal/io/HostAddress_p.cpp393
-rw-r--r--src/api/internal/io/HostAddress_p.h117
-rw-r--r--src/api/internal/io/HostInfo_p.cpp229
-rw-r--r--src/api/internal/io/HostInfo_p.h78
-rw-r--r--src/api/internal/io/HttpHeader_p.cpp403
-rw-r--r--src/api/internal/io/HttpHeader_p.h136
-rw-r--r--src/api/internal/io/ILocalIODevice_p.cpp61
-rw-r--r--src/api/internal/io/ILocalIODevice_p.h51
-rw-r--r--src/api/internal/io/NetUnix_p.h43
-rw-r--r--src/api/internal/io/NetWin_p.h62
-rw-r--r--src/api/internal/io/RollingBuffer_p.cpp317
-rw-r--r--src/api/internal/io/RollingBuffer_p.h88
-rw-r--r--src/api/internal/io/TcpSocketEngine_p.cpp212
-rw-r--r--src/api/internal/io/TcpSocketEngine_p.h105
-rw-r--r--src/api/internal/io/TcpSocketEngine_unix_p.cpp220
-rw-r--r--src/api/internal/io/TcpSocketEngine_win_p.cpp242
-rw-r--r--src/api/internal/io/TcpSocket_p.cpp446
-rw-r--r--src/api/internal/io/TcpSocket_p.h132
-rw-r--r--src/api/internal/sam/CMakeLists.txt17
-rw-r--r--src/api/internal/sam/SamFormatParser_p.cpp263
-rw-r--r--src/api/internal/sam/SamFormatParser_p.h62
-rw-r--r--src/api/internal/sam/SamFormatPrinter_p.cpp240
-rw-r--r--src/api/internal/sam/SamFormatPrinter_p.h60
-rw-r--r--src/api/internal/sam/SamHeaderValidator_p.cpp536
-rw-r--r--src/api/internal/sam/SamHeaderValidator_p.h103
-rw-r--r--src/api/internal/sam/SamHeaderVersion_p.h154
-rw-r--r--src/api/internal/utils/BamException_p.cpp14
-rw-r--r--src/api/internal/utils/BamException_p.h53
-rw-r--r--src/api/internal/utils/CMakeLists.txt15
-rw-r--r--src/bamtools.pc.in10
-rw-r--r--src/shared/bamtools_global.h89
-rw-r--r--src/toolkit/CMakeLists.txt47
-rw-r--r--src/toolkit/bamtools.cpp174
-rw-r--r--src/toolkit/bamtools_convert.cpp967
-rw-r--r--src/toolkit/bamtools_convert.h38
-rw-r--r--src/toolkit/bamtools_count.cpp228
-rw-r--r--src/toolkit/bamtools_count.h38
-rw-r--r--src/toolkit/bamtools_coverage.cpp207
-rw-r--r--src/toolkit/bamtools_coverage.h38
-rw-r--r--src/toolkit/bamtools_filter.cpp1048
-rw-r--r--src/toolkit/bamtools_filter.h38
-rw-r--r--src/toolkit/bamtools_header.cpp152
-rw-r--r--src/toolkit/bamtools_header.h39
-rw-r--r--src/toolkit/bamtools_index.cpp137
-rw-r--r--src/toolkit/bamtools_index.h38
-rw-r--r--src/toolkit/bamtools_merge.cpp257
-rw-r--r--src/toolkit/bamtools_merge.h38
-rw-r--r--src/toolkit/bamtools_random.cpp316
-rw-r--r--src/toolkit/bamtools_random.h38
-rw-r--r--src/toolkit/bamtools_resolve.cpp1523
-rw-r--r--src/toolkit/bamtools_resolve.h43
-rw-r--r--src/toolkit/bamtools_revert.cpp212
-rw-r--r--src/toolkit/bamtools_revert.h38
-rw-r--r--src/toolkit/bamtools_sort.cpp381
-rw-r--r--src/toolkit/bamtools_sort.h38
-rw-r--r--src/toolkit/bamtools_split.cpp750
-rw-r--r--src/toolkit/bamtools_split.h39
-rw-r--r--src/toolkit/bamtools_stats.cpp330
-rw-r--r--src/toolkit/bamtools_stats.h38
-rw-r--r--src/toolkit/bamtools_tool.h36
-rw-r--r--src/toolkit/bamtools_version.h.in20
-rw-r--r--src/utils/CMakeLists.txt29
-rw-r--r--src/utils/bamtools_fasta.cpp643
-rw-r--r--src/utils/bamtools_fasta.h48
-rw-r--r--src/utils/bamtools_filter_engine.h575
-rw-r--r--src/utils/bamtools_filter_properties.h234
-rw-r--r--src/utils/bamtools_filter_ruleparser.h337
-rw-r--r--src/utils/bamtools_options.cpp305
-rw-r--r--src/utils/bamtools_options.h200
-rw-r--r--src/utils/bamtools_pileup_engine.cpp355
-rw-r--r--src/utils/bamtools_pileup_engine.h98
-rw-r--r--src/utils/bamtools_utilities.cpp343
-rw-r--r--src/utils/bamtools_utilities.h64
-rw-r--r--src/utils/bamtools_variant.h146
-rw-r--r--src/utils/utils_global.h21
147 files changed, 32429 insertions, 0 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..e2b96c3
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,99 @@
+# ==========================
+# BamTools CMakeLists.txt
+# (c) 2010 Derek Barnett
+#
+# top-level
+# ==========================
+
+# CMake requirements
+cmake_minimum_required( VERSION 3.0 )
+
+# allow setting project version in project()
+# https://cmake.org/cmake/help/v3.0/policy/CMP0048.html#policy:CMP0048
+cmake_policy( SET CMP0048 NEW )
+
+# set project name and version
+project( BamTools LANGUAGES CXX VERSION 2.5.1 )
+
+# on macOS, MACOSX_RPATH is enabled by default on more recent versions
+# of CMake. Disable this behaviour, and let user enable it if need be.
+cmake_policy( SET CMP0042 OLD )
+
+# Set Release type for builds where CMAKE_BUILD_TYPE is unset
+# This is usually a good default as this implicitly enables
+#
+# CXXFLAGS = -O3 -DNDEBUG
+#
+if( NOT CMAKE_BUILD_TYPE )
+ set( CMAKE_BUILD_TYPE "Release" )
+endif()
+
+# Adhere to GNU filesystem layout conventions
+include( GNUInstallDirs )
+
+# Force the build directory to be different from source directory
+macro( ENSURE_OUT_OF_SOURCE_BUILD MSG ) # stops configuration with FATAL_ERROR "${MSG}" when either check below is true
+ string( COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}" insource ) # true when the build dir is the source dir itself
+ get_filename_component( PARENTDIR ${CMAKE_SOURCE_DIR} PATH ) # PARENTDIR = directory part of the source dir path
+ string( COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${PARENTDIR}" insourcesubdir ) # NOTE(review): compares the source dir with its own parent, not with the build dir; presumably only true at a filesystem root - confirm intent
+ IF( insource OR insourcesubdir )
+ message( FATAL_ERROR "${MSG}" )
+ ENDIF( insource OR insourcesubdir )
+endmacro( ENSURE_OUT_OF_SOURCE_BUILD )
+
+ensure_out_of_source_build( "
+ ${PROJECT_NAME} requires an out of source build.
+ $ mkdir build
+ $ cd build
+ $ cmake ..
+ $ make
+(or the Windows equivalent)\n" )
+
+# define compiler flags for all code, copied from Autoconf's AC_SYS_LARGEFILE
+if( NOT WIN32 )
+ add_definitions( -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE )
+ add_compile_options( -Wall )
+endif()
+
+# -----------------------------------------------
+# handle platform-/environment-specific defines
+
+# By default build bamtools as a static library
+# Most users will prefer static libraries, distributions
+# can always switch the standard CMake variable over to ON.
+set( BUILD_SHARED_LIBS OFF CACHE BOOL "Build all libraries as shared" )
+
+# If planning to run in Node.js environment, run:
+# cmake -DEnableNodeJS=true
+if( EnableNodeJS )
+ add_definitions( -DSYSTEM_NODEJS=1 )
+endif()
+
+# If running on SunOS
+if( "${CMAKE_SYSTEM_NAME}" MATCHES "SunOS" )
+ add_definitions( -DSUN_OS )
+endif()
+
+# find system JsonCpp
+find_package( PkgConfig )
+pkg_search_module( JSONCPP jsoncpp>=1 )
+
+set( BAMTOOLS_PRIVATE_DEPS "zlib" )
+
+if( JSONCPP_FOUND )
+ message( "Found system JsonCpp, not using bundled version" )
+ set( BAMTOOLS_PRIVATE_DEPS "${BAMTOOLS_PRIVATE_DEPS} jsoncpp" )
+else()
+ message( "Did NOT find system JsonCpp, instead using bundled version" )
+ set( JSONCPP_LDFLAGS jsoncpp )
+ set( JSONCPP_INCLUDE_DIRS ${BamTools_SOURCE_DIR}/src/third_party/jsoncpp )
+endif()
+
+
+# -------------------------------------------
+
+# add our includes root path
+include_directories( src )
+
+# list subdirectories to build in
+add_subdirectory( src )
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..eaee1fd
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,22 @@
+The MIT License
+
+Copyright (c) 2009-2010 Derek Barnett, Erik Garrison, Gabor Marth, Michael Stromberg
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
diff --git a/README b/README
new file mode 100644
index 0000000..498f4be
--- /dev/null
+++ b/README
@@ -0,0 +1,60 @@
+--------------------------------------------------------------------------------
+README : BAMTOOLS
+--------------------------------------------------------------------------------
+
+BamTools provides both a programmer's API and an end-user's toolkit for handling
+BAM files.
+
+I. Learn More
+
+II. License
+
+III. Acknowledgements
+
+IV. Contact
+
+--------------------------------------------------------------------------------
+I. Learn More:
+--------------------------------------------------------------------------------
+
+Installation steps, tutorial, API documentation, etc. are all now available
+through the BamTools project wiki:
+
+https://github.com/pezmaster31/bamtools/wiki
+
+Join the mailing list(s) to stay informed of updates or get involved with
+contributing:
+
+https://github.com/pezmaster31/bamtools/wiki/Mailing-lists
+
+--------------------------------------------------------------------------------
+II. License :
+--------------------------------------------------------------------------------
+
+Both the BamTools API and toolkit are released under the MIT License.
+Copyright (c) 2009-2010 Derek Barnett, Erik Garrison, Gabor Marth,
+ Michael Stromberg
+
+See included file LICENSE for details.
+
+--------------------------------------------------------------------------------
+III. Acknowledgements :
+--------------------------------------------------------------------------------
+
+ * Aaron Quinlan for several key feature ideas and bug fix contributions
+ * Baptiste Lepilleur for the public-domain JSON parser (JsonCPP)
+ * Heng Li, author of SAMtools - the original C-language BAM API/toolkit.
+
+--------------------------------------------------------------------------------
+IV. Contact :
+--------------------------------------------------------------------------------
+
+Feel free to contact me with any questions, comments, suggestions, bug reports,
+ etc.
+
+Derek Barnett
+Marth Lab
+Biology Dept., Boston College
+
+Email: derekwbarnett@gmail.com
+Project Website: http://github.com/pezmaster31/bamtools
diff --git a/docs/Doxyfile b/docs/Doxyfile
new file mode 100644
index 0000000..ee7ba2d
--- /dev/null
+++ b/docs/Doxyfile
@@ -0,0 +1,1605 @@
+# Doxyfile 1.6.3
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+# TAG = value [value, ...]
+# For lists items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME = BamTools
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER = 2.5.1
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY =
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF = "The $name class" \
+ "The $name widget" \
+ "The $name file" \
+ is \
+ provides \
+ specifies \
+ contains \
+ represents \
+ a \
+ an \
+ the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before file names in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 1
+
+# This tag can be used to specify a number of aliases that act
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES = samSpecURL=http://samtools.sourceforge.net/SAM1.pdf
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it parses.
+# With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this tag.
+# The format is ext=language, where ext is a file extension, and language is one of
+# the parsers supported by doxygen: IDL, Java, Javascript, C#, C, C++, D, PHP,
+# Objective-C, Python, Fortran, VHDL, C, C++. For instance to make doxygen treat
+# .inc files as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C. Note that for custom extensions you also need to set
+# FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols
+
+SYMBOL_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL = YES
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES = NO
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or define consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and defines in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page. This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed by
+# doxygen. The layout file controls the global structure of the generated output files
+# in an output format independent way. To create the layout file that represents
+# doxygen's defaults, run doxygen with the -l option. You can optionally specify a
+# file name after the option, if omitted DoxygenLayout.xml will be used as the name
+# of the layout file.
+
+LAYOUT_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT = src/api
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+
+FILE_PATTERNS = *.c \
+ *.cc \
+ *.cxx \
+ *.cpp \
+ *.c++ \
+ *.d \
+ *.java \
+ *.ii \
+ *.ixx \
+ *.ipp \
+ *.i++ \
+ *.inl \
+ *.h \
+ *.hh \
+ *.hxx \
+ *.hpp \
+ *.h++ \
+ *.idl \
+ *.odl \
+ *.cs \
+ *.php \
+ *.php3 \
+ *.inc \
+ *.m \
+ *.mm \
+ *.dox \
+ *.py \
+ *.f90 \
+ *.f \
+ *.vhd \
+ *.vhdl
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE = src/api/internal
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix filesystem feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS = BamTools::Internal \
+ BamTools::BamAlignment::BamAlignmentSupportData \
+ BamTools::RaiiBuffer \
+ UsesCharData \
+ sort_helper \
+ AlignmentSortBase
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output. If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
+# is applied to all files.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code. Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET =
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP = YES
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS = YES
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more information.
+
+GENERATE_DOCSET = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and QHP_VIRTUAL_FOLDER
+# are set, an additional index file will be generated that can be used as input for
+# Qt's qhelpgenerator to generate a Qt Compressed Help (.qch) of the generated
+# HTML documentation.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to add.
+# For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX = NO
+
+# This tag can be used to set the number of enum values (range [1..20])
+# that doxygen will group on one line in the generated HTML documentation.
+
+ENUM_VALUES_PER_LINE = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW = NO
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH = 250
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE = 10
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantage is that it is more difficult to set up
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, a4wide, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output.
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages.
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader. This is useful
+# if you want to understand what is going on. On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all function-like macros that are alone
+# on a line, have an all uppercase name, and do not end with a semicolon. Such
+# function macros are typically used for boiler-plate code, and will confuse
+# the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = NO
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option is superseded by the HAVE_DOT option below. This is only a
+# fallback. It is recommended to install and use dot, since it yields more
+# powerful graphs.
+
+CLASS_DIAGRAMS = NO
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT = NO
+
+# By default doxygen will write a font called FreeSans.ttf to the output
+# directory and reference it in all dot files that doxygen generates. This
+# font does not include all possible unicode characters however, so when you need
+# these (or just want a differently looking font) you can specify the font name
+# using DOT_FONTNAME. You need to make sure dot is able to find the font,
+# which can be done by putting it in a standard location or by setting the
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
+# containing the font.
+
+# DOT_FONTNAME = FreeSans
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the output directory to look for the
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a
+# different font using DOT_FONTNAME you can set the path where dot
+# can find it using this tag.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force
+# the CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct group dependencies.
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will draw a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, jpg, or gif
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lie further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP = YES
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..04ecf6e
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,20 @@
+# ==========================
+# BamTools CMakeLists.txt
+# (c) 2010 Derek Barnett
+#
+# src/
+# ==========================
+
+add_subdirectory( api )
+add_subdirectory( third_party )
+add_subdirectory( toolkit )
+add_subdirectory( utils )
+
+# export shared headers (copied into the build tree at build time and installed under bamtools/shared)
+include( ExportHeader.cmake ) # provides the ExportHeader() function
+set( SharedIncludeDir "shared" ) # sub-directory used for both the build-tree copy and the install destination
+ExportHeader( SharedHeaders shared/bamtools_global.h ${SharedIncludeDir} )
+
+# configure and install pkg-config file (@ONLY: only @VAR@ references in bamtools.pc.in are substituted)
+configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/bamtools.pc.in ${CMAKE_CURRENT_BINARY_DIR}/bamtools-1.pc @ONLY )
+install( FILES ${CMAKE_CURRENT_BINARY_DIR}/bamtools-1.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig )
diff --git a/src/ExportHeader.cmake b/src/ExportHeader.cmake
new file mode 100644
index 0000000..d62a5bc
--- /dev/null
+++ b/src/ExportHeader.cmake
@@ -0,0 +1,27 @@
+#
+# ExportHeader
+#
+
+function( ExportHeader MODULE FILE DEST )
+
+    # ExportHeader( <MODULE> <FILE> <DEST> )
+    #   MODULE - name of the custom target the copy step is attached to
+    #   FILE   - header path, relative to the calling CMakeLists.txt directory
+    #   DEST   - sub-directory (below include/ in the build tree and below
+    #            bamtools/ in the install tree) that receives the header
+
+    # Create the custom target on first use. It is not really a build target,
+    # but adding it to ALL lets the copy command below get checked any time
+    # the build step happens.
+    if( NOT TARGET ${MODULE} )
+        add_custom_target( ${MODULE} ALL COMMENT "Exporting ${MODULE}" )
+    endif()
+
+    # get the filename (without path)
+    get_filename_component( FILENAME "${FILE}" NAME )
+
+    # Copy header to destination in the build tree. POST_BUILD is what CMake
+    # assumes when no build-event keyword is given; stating it explicitly
+    # documents the intent and does not rely on the implicit default.
+    add_custom_command( TARGET ${MODULE} POST_BUILD COMMAND
+        ${CMAKE_COMMAND} -E copy_if_different
+        "${CMAKE_CURRENT_SOURCE_DIR}/${FILE}"
+        "${CMAKE_CURRENT_BINARY_DIR}/include/${DEST}/${FILENAME}" )
+
+    # make sure files are properly 'installed'
+    install( FILES "${FILE}" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/bamtools/${DEST}" )
+
+endfunction( ExportHeader )
+
diff --git a/src/api/BamAlgorithms.h b/src/api/BamAlgorithms.h
new file mode 100644
index 0000000..7f4b36f
--- /dev/null
+++ b/src/api/BamAlgorithms.h
@@ -0,0 +1,21 @@
+// ***************************************************************************
+// BamAlgorithms.h (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides generic algorithms that are intended to work with BamTools data
+// structures. Where possible, these are intended to be STL-compatible.
+// ***************************************************************************
+
+#ifndef BAMALGORITHMS_H
+#define BAMALGORITHMS_H
+
+#include "api/algorithms/Sort.h"
+
+/*! \namespace BamTools::Algorithms
+ \brief Provides convenient classes & methods for working with BAM data
+*/
+
+#endif // BAMALGORITHMS_H
diff --git a/src/api/BamAlignment.cpp b/src/api/BamAlignment.cpp
new file mode 100644
index 0000000..8173dcf
--- /dev/null
+++ b/src/api/BamAlignment.cpp
@@ -0,0 +1,1127 @@
+// ***************************************************************************
+// BamAlignment.cpp (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 4 December 2012 (DB)
+// ---------------------------------------------------------------------------
+// Provides the BamAlignment data structure
+// ***************************************************************************
+
+#include "api/BamAlignment.h"
+#include "api/BamConstants.h"
+using namespace BamTools;
+
+#include <cstddef>
+
+/*! \class BamTools::BamAlignment
+ \brief The main BAM alignment data structure.
+
+ Provides methods to query/modify BAM alignment data fields.
+*/
+/*! \var BamAlignment::Name
+ \brief read name
+*/
+/*! \var BamAlignment::Length
+ \brief length of query sequence
+*/
+/*! \var BamAlignment::QueryBases
+ \brief 'original' sequence (as reported from sequencing machine)
+
+ \note Setting this field to "*" indicates that the sequence is not to be stored on output.
+ In this case, the contents of the Qualities field should be invalidated as well (cleared or marked as "*").
+*/
+/*! \var BamAlignment::AlignedBases
+ \brief 'aligned' sequence (includes any indels, padding, clipping)
+
+ This field will be completely empty after reading from BamReader/BamMultiReader when
+ QueryBases is empty.
+*/
+/*! \var BamAlignment::Qualities
+ \brief FASTQ qualities (ASCII characters, not numeric values)
+
+ \note Setting this field to "*" indicates to BamWriter that the quality scores are not to be stored,
+ but instead will be output as a sequence of '0xFF'. Otherwise, QueryBases must not be a "*" and
+ the length of this field should equal the length of QueryBases.
+*/
+/*! \var BamAlignment::TagData
+ \brief tag data (use the provided methods to query/modify)
+*/
+/*! \var BamAlignment::RefID
+ \brief ID number for reference sequence
+*/
+/*! \var BamAlignment::Position
+ \brief position (0-based) where alignment starts
+*/
+/*! \var BamAlignment::Bin
+ \brief BAM (standard) index bin number for this alignment
+*/
+/*! \var BamAlignment::MapQuality
+ \brief mapping quality score
+*/
+/*! \var BamAlignment::AlignmentFlag
+ \brief alignment bit-flag (use the provided methods to query/modify)
+*/
+/*! \var BamAlignment::CigarData
+ \brief CIGAR operations for this alignment
+*/
+/*! \var BamAlignment::MateRefID
+ \brief ID number for reference sequence where alignment's mate was aligned
+*/
+/*! \var BamAlignment::MatePosition
+ \brief position (0-based) where alignment's mate starts
+*/
+/*! \var BamAlignment::InsertSize
+ \brief mate-pair insert size
+*/
+/*! \var BamAlignment::Filename
+ \brief name of BAM file which this alignment comes from
+*/
+
+/*! \fn BamAlignment::BamAlignment()
+ \brief constructor
+
+ Sets Length, Bin, MapQuality, AlignmentFlag and InsertSize to 0, and sets
+ RefID, Position, MateRefID and MatePosition to -1. Members that do not appear
+ in the initializer list are left to their default initialization.
+*/
+BamAlignment::BamAlignment()
+ : Length(0)
+ , RefID(-1)
+ , Position(-1)
+ , Bin(0)
+ , MapQuality(0)
+ , AlignmentFlag(0)
+ , MateRefID(-1)
+ , MatePosition(-1)
+ , InsertSize(0)
+{}
+
+/*! \fn BamAlignment::BamAlignment(const BamAlignment& other)
+ \brief copy constructor
+
+ Copies each listed data field from \a other, including the Filename and the
+ SupportData member.
+
+ \param[in] other alignment to copy from
+*/
+BamAlignment::BamAlignment(const BamAlignment& other)
+ : Name(other.Name)
+ , Length(other.Length)
+ , QueryBases(other.QueryBases)
+ , AlignedBases(other.AlignedBases)
+ , Qualities(other.Qualities)
+ , TagData(other.TagData)
+ , RefID(other.RefID)
+ , Position(other.Position)
+ , Bin(other.Bin)
+ , MapQuality(other.MapQuality)
+ , AlignmentFlag(other.AlignmentFlag)
+ , CigarData(other.CigarData)
+ , MateRefID(other.MateRefID)
+ , MatePosition(other.MatePosition)
+ , InsertSize(other.InsertSize)
+ , Filename(other.Filename)
+ , SupportData(other.SupportData)
+{}
+
+/*! \fn BamAlignment::~BamAlignment()
+ \brief destructor (empty body; performs no explicit cleanup of its own)
+*/
+BamAlignment::~BamAlignment() {}
+
+/*! \fn bool BamAlignment::BuildCharData()
+ \brief Populates alignment string fields (read name, bases, qualities, tag data).
+
+ An alignment retrieved using BamReader::GetNextAlignmentCore() lacks this data.
+ Using that method makes parsing much quicker when only positional data is required.
+
+ However, if you later want to access the character data fields from such an alignment,
+ use this method to populate those fields. Provides ability to do 'lazy evaluation' of
+ alignment parsing.
+
+ Name, QueryBases, Qualities, AlignedBases and TagData are rebuilt from the raw bytes held
+ in SupportData.AllCharData. On big-endian systems, multi-byte tag values are byte-swapped
+ in place inside that raw buffer before being copied into TagData.
+
+ \return \c true if character data populated successfully (or was already available to begin with).
+ Returns \c false if an invalid CIGAR operation, tag type, or binary array element type
+ is encountered; in that case an error string is set, the core-only flag is left
+ unchanged, and fields assigned before the failure keep their new contents.
+*/
+bool BamAlignment::BuildCharData()
+{
+
+ // skip if char data already parsed
+ if (!SupportData.HasCoreOnly) return true;
+
+ // check system endianness
+ bool IsBigEndian = BamTools::SystemIsBigEndian();
+
+ // calculate character lengths/offsets
+ // (all offsets are relative to the start of SupportData.AllCharData)
+ const unsigned int dataLength = SupportData.BlockLength - Constants::BAM_CORE_SIZE;
+ const unsigned int seqDataOffset =
+ SupportData.QueryNameLength + (SupportData.NumCigarOperations * 4); // name bytes, then 4 bytes per CIGAR op
+ const unsigned int qualDataOffset = seqDataOffset + (SupportData.QuerySequenceLength + 1) / 2; // 2 bases per byte, rounded up
+ const unsigned int tagDataOffset = qualDataOffset + SupportData.QuerySequenceLength; // 1 quality byte per base
+ const unsigned int tagDataLength = dataLength - tagDataOffset; // only meaningful when hasTagData is true (unsigned arithmetic)
+
+ // check offsets to see what char data exists
+ const bool hasSeqData = (seqDataOffset < qualDataOffset);
+ const bool hasQualData = (qualDataOffset < tagDataOffset);
+ const bool hasTagData = (tagDataOffset < dataLength);
+
+ // store alignment name (relies on null char in name as terminator)
+ Name.assign(SupportData.AllCharData.data());
+
+ // save query sequence
+ QueryBases.clear();
+ if (hasSeqData) {
+ const char* seqData = SupportData.AllCharData.data() + seqDataOffset;
+ QueryBases.reserve(SupportData.QuerySequenceLength);
+ for (std::size_t i = 0; i < SupportData.QuerySequenceLength; ++i) {
+ // each byte packs two 4-bit base codes: even i reads the high nibble, odd i the low nibble
+ const char singleBase =
+ Constants::BAM_DNA_LOOKUP[((seqData[(i / 2)] >> (4 * (1 - (i % 2)))) & 0xf)];
+ QueryBases.append(1, singleBase);
+ }
+ }
+
+ // save qualities
+
+ Qualities.clear();
+ if (hasQualData) {
+ const char* qualData = SupportData.AllCharData.data() + qualDataOffset;
+
+ // if marked as unstored (sequence of 0xFF) - don't do conversion, just fill with 0xFFs
+ // (only the first quality byte is inspected to make this decision)
+ if (qualData[0] == (char)0xFF)
+ Qualities.resize(SupportData.QuerySequenceLength, (char)0xFF);
+
+ // otherwise convert from numeric QV to 'FASTQ-style' ASCII character
+ else {
+ Qualities.reserve(SupportData.QuerySequenceLength);
+ for (std::size_t i = 0; i < SupportData.QuerySequenceLength; ++i)
+ Qualities.append(1, qualData[i] + 33); // offset numeric value by 33 to get the ASCII character
+ }
+ }
+
+ // clear previous AlignedBases
+ AlignedBases.clear();
+
+ // if QueryBases has data, build AlignedBases using CIGAR data
+ // otherwise, AlignedBases will remain empty (this case IS allowed)
+ if (!QueryBases.empty() && QueryBases != "*") {
+
+ // pre-allocate AlignedBases (capacity only; final length depends on the CIGAR ops)
+ AlignedBases.reserve(SupportData.QuerySequenceLength);
+
+ // iterate over CigarOps
+ int k = 0; // index of the next unconsumed base in QueryBases
+ std::vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
+ std::vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
+ for (; cigarIter != cigarEnd; ++cigarIter) {
+ const CigarOp& op = (*cigarIter);
+
+ switch (op.Type) {
+
+ // for 'M', 'I', '=', 'X' - write bases
+ case (Constants::BAM_CIGAR_MATCH_CHAR):
+ case (Constants::BAM_CIGAR_INS_CHAR):
+ case (Constants::BAM_CIGAR_SEQMATCH_CHAR):
+ case (Constants::BAM_CIGAR_MISMATCH_CHAR):
+ AlignedBases.append(QueryBases.substr(k, op.Length));
+ // fall through (intentional: the soft-clip case below advances 'k' for these ops too)
+
+ // for 'S' - soft clip, do not write bases
+ // but increment placeholder 'k'
+ case (Constants::BAM_CIGAR_SOFTCLIP_CHAR):
+ k += op.Length;
+ break;
+
+ // for 'D' - write gap character
+ case (Constants::BAM_CIGAR_DEL_CHAR):
+ AlignedBases.append(op.Length, Constants::BAM_DNA_DEL);
+ break;
+
+ // for 'P' - write padding character
+ case (Constants::BAM_CIGAR_PAD_CHAR):
+ AlignedBases.append(op.Length, Constants::BAM_DNA_PAD);
+ break;
+
+ // for 'N' - write N's; no query bases are consumed ('k' is left unchanged)
+ case (Constants::BAM_CIGAR_REFSKIP_CHAR):
+ AlignedBases.append(op.Length, Constants::BAM_DNA_N);
+ break;
+
+ // for 'H' - hard clip, do nothing to AlignedBases, move to next op
+ case (Constants::BAM_CIGAR_HARDCLIP_CHAR):
+ break;
+
+ // invalid CIGAR op-code - record the error and abort (core-only flag stays set)
+ default:
+ const std::string message =
+ std::string("invalid CIGAR operation type: ") + op.Type;
+ SetErrorString("BamAlignment::BuildCharData", message);
+ return false;
+ }
+ }
+ }
+
+ // save tag data
+ TagData.clear();
+ if (hasTagData) {
+
+ char* tagData = (((char*)SupportData.AllCharData.data()) + tagDataOffset); // writable view: swapped in place below
+
+ // on big-endian systems, walk every tag and byte-swap its multi-byte values in place
+ // NOTE(review): 'i' is only compared against tagDataLength between tags, so this walk
+ // appears to rely on the tag data being well-formed - confirm upstream validation
+ if (IsBigEndian) {
+ std::size_t i = 0;
+ while (i < tagDataLength) {
+
+ i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.)
+ const char type = tagData[i]; // get tag type at position i
+ ++i; // move i past tag type
+
+ switch (type) {
+
+ case (Constants::BAM_TAG_TYPE_ASCII):
+ case (Constants::BAM_TAG_TYPE_INT8):
+ case (Constants::BAM_TAG_TYPE_UINT8):
+ // no endian swapping necessary for single-byte data
+ ++i;
+ break;
+
+ case (Constants::BAM_TAG_TYPE_INT16):
+ case (Constants::BAM_TAG_TYPE_UINT16):
+ BamTools::SwapEndian_16p(&tagData[i]);
+ i += sizeof(uint16_t);
+ break;
+
+ case (Constants::BAM_TAG_TYPE_FLOAT):
+ case (Constants::BAM_TAG_TYPE_INT32):
+ case (Constants::BAM_TAG_TYPE_UINT32):
+ BamTools::SwapEndian_32p(&tagData[i]);
+ i += sizeof(uint32_t);
+ break;
+
+ case (Constants::BAM_TAG_TYPE_HEX):
+ case (Constants::BAM_TAG_TYPE_STRING):
+ // no endian swapping necessary for hex-string/string data
+ while (tagData[i])
+ ++i;
+ // increment one more for null terminator
+ ++i;
+ break;
+
+ case (Constants::BAM_TAG_TYPE_ARRAY):
+
+ {
+ // read array type
+ const char arrayType = tagData[i];
+ ++i;
+
+ // swap endian-ness of number of elements in place, then retrieve for loop
+ BamTools::SwapEndian_32p(&tagData[i]);
+ uint32_t numElements;
+ memcpy(&numElements, &tagData[i], sizeof(uint32_t)); // memcpy instead of a cast: tagData[i] may be unaligned
+ i += sizeof(uint32_t);
+
+ // swap endian-ness of array elements
+ for (std::size_t j = 0; j < numElements; ++j) {
+ switch (arrayType) {
+ case (Constants::BAM_TAG_TYPE_INT8):
+ case (Constants::BAM_TAG_TYPE_UINT8):
+ // no endian-swapping necessary
+ ++i;
+ break;
+ case (Constants::BAM_TAG_TYPE_INT16):
+ case (Constants::BAM_TAG_TYPE_UINT16):
+ BamTools::SwapEndian_16p(&tagData[i]);
+ i += sizeof(uint16_t);
+ break;
+ case (Constants::BAM_TAG_TYPE_FLOAT):
+ case (Constants::BAM_TAG_TYPE_INT32):
+ case (Constants::BAM_TAG_TYPE_UINT32):
+ BamTools::SwapEndian_32p(&tagData[i]);
+ i += sizeof(uint32_t);
+ break;
+ default:
+ const std::string message =
+ std::string("invalid binary array type: ") + arrayType;
+ SetErrorString("BamAlignment::BuildCharData", message);
+ return false;
+ }
+ }
+
+ break;
+ }
+
+ // invalid tag type-code
+ default:
+ const std::string message = std::string("invalid tag type: ") + type;
+ SetErrorString("BamAlignment::BuildCharData", message);
+ return false;
+ }
+ }
+ }
+
+ // store tagData in alignment (raw byte copy; already swapped above on big-endian systems)
+ TagData.resize(tagDataLength);
+ memcpy((char*)(TagData.data()), tagData, tagDataLength);
+ }
+
+ // clear core-only flag & return success
+ SupportData.HasCoreOnly = false;
+ return true;
+}
+
+/*! \fn bool BamAlignment::FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) const
+    \internal
+
+    Searches for requested tag in BAM tag data.
+
+    Each tag starts with a 3-byte header: the 2-character tag name followed by the
+    1-character storage type. The search stops (returning \c false) as soon as fewer
+    than 3 bytes remain, so a truncated header is never read.
+
+    \param[in]     tag            requested 2-character tag name
+    \param[in,out] pTagData       pointer to current position in BamAlignment::TagData
+    \param[in]     tagDataLength  length of BamAlignment::TagData
+    \param[in,out] numBytesParsed number of bytes parsed so far
+
+    \return \c true if found
+
+    \post If \a tag is found, \a pTagData will point to the byte where the tag data begins.
+          \a numBytesParsed will correspond to the position in the full TagData string.
+          If \a tag is not found, both are left wherever the search stopped.
+
+*/
+bool BamAlignment::FindTag(const std::string& tag, char*& pTagData,
+                           const unsigned int& tagDataLength, unsigned int& numBytesParsed) const
+{
+    // size of a tag header: 2-character tag name + 1-character storage type
+    const unsigned int tagHeaderSize = 3;
+
+    while (numBytesParsed < tagDataLength) {
+
+        // a truncated header cannot hold a tag - stop instead of reading past the tag data
+        // (the subtraction cannot wrap: the loop condition guarantees numBytesParsed < tagDataLength)
+        if (tagDataLength - numBytesParsed < tagHeaderSize) return false;
+
+        const char* pTagType = pTagData;
+        const char* pTagStorageType = pTagData + 2;
+        pTagData += tagHeaderSize;
+        numBytesParsed += tagHeaderSize;
+
+        // check the current tag, return true on match
+        if (strncmp(pTagType, tag.c_str(), 2) == 0) return true;
+
+        // get the storage class and find the next tag
+        if (*pTagStorageType == '\0') return false;
+        if (!SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed)) return false;
+
+        // a null byte where the next tag name should start means no more tags;
+        // only look at that byte while it still lies inside the tag data
+        if (numBytesParsed < tagDataLength && *pTagData == '\0') return false;
+    }
+
+    // checked all tags, none match
+    return false;
+}
+
+/*! \fn bool BamAlignment::GetArrayTagType(const std::string& tag, char& type) const
+    \brief Looks up an array tag and reports the BAM type-code of its elements.
+
+    \param[in]  tag  2-character tag name
+    \param[out] type receives the (1-character) element type-code on success
+
+    \return \c true if \a tag is found, is stored as an array, and its element type is one of the
+            supported integer/float type-codes. \c false otherwise - this includes core-only
+            alignments and alignments without any tag data.
+    \note If \a tag is found but the lookup still fails, \a type has been overwritten with
+          the tag's own storage type-code.
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+bool BamAlignment::GetArrayTagType(const std::string& tag, char& type) const
+{
+    // nothing to search if the character data has not been built yet (core-only),
+    // or if the alignment carries no tags at all
+    // TODO: set error string?
+    if (SupportData.HasCoreOnly || TagData.empty()) return false;
+
+    // set up a cursor over the raw tag bytes
+    char* cursor = const_cast<char*>(TagData.data());
+    const unsigned int totalBytes = TagData.size();
+    unsigned int bytesConsumed = 0;
+
+    // locate the requested tag; on success the cursor rests on the first byte of its data
+    // TODO: set error string?
+    if (!FindTag(tag, cursor, totalBytes, bytesConsumed)) return false;
+
+    // the tag's storage type-code is the byte immediately before its data;
+    // anything other than an array is a failure
+    // TODO: set error string
+    type = *(cursor - 1);
+    if (type != Constants::BAM_TAG_TYPE_ARRAY) return false;
+
+    // the first data byte of an array tag names the type of its elements
+    const char elementType = *cursor;
+    const bool isAllowedElementType = (elementType == Constants::BAM_TAG_TYPE_INT8) ||
+                                      (elementType == Constants::BAM_TAG_TYPE_UINT8) ||
+                                      (elementType == Constants::BAM_TAG_TYPE_INT16) ||
+                                      (elementType == Constants::BAM_TAG_TYPE_UINT16) ||
+                                      (elementType == Constants::BAM_TAG_TYPE_INT32) ||
+                                      (elementType == Constants::BAM_TAG_TYPE_UINT32) ||
+                                      (elementType == Constants::BAM_TAG_TYPE_FLOAT);
+
+    // unsupported element type
+    //TODO: set error string
+    if (!isAllowedElementType) return false;
+
+    // report the element type & return success
+    type = elementType;
+    return true;
+}
+
+/*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool closedInterval = false) const
+    \brief Calculates alignment end position, based on its starting position and CIGAR data.
+
+    \warning The position returned now represents a zero-based, HALF-OPEN interval.
+             In previous versions of BamTools (0.x & 1.x) all intervals were treated
+             as zero-based, CLOSED.
+
+    \param[in] usePadded      Allow inserted bases to affect the reported position. Default is
+                              false, so that reported position stays synced with reference
+                              coordinates.
+    \param[in] closedInterval Setting this to true will return a 0-based end coordinate. Default is
+                              false, so that this value represents a standard, half-open interval.
+
+    \return alignment end position
+*/
+int BamAlignment::GetEndPosition(bool usePadded, bool closedInterval) const
+{
+    // start at the alignment's first base and walk forward over the CIGAR
+    int endPosition = Position;
+
+    const std::size_t numOps = CigarData.size();
+    for (std::size_t i = 0; i < numOps; ++i) {
+        const CigarOp& cigarOp = CigarData[i];
+
+        // CIGAR chars [DMXN=] always move the end position
+        const bool movesEnd = (cigarOp.Type == Constants::BAM_CIGAR_DEL_CHAR) ||
+                              (cigarOp.Type == Constants::BAM_CIGAR_MATCH_CHAR) ||
+                              (cigarOp.Type == Constants::BAM_CIGAR_MISMATCH_CHAR) ||
+                              (cigarOp.Type == Constants::BAM_CIGAR_REFSKIP_CHAR) ||
+                              (cigarOp.Type == Constants::BAM_CIGAR_SEQMATCH_CHAR);
+
+        // insertions count only when padded coordinates were requested
+        const bool paddedInsertion =
+            usePadded && (cigarOp.Type == Constants::BAM_CIGAR_INS_CHAR);
+
+        // every other CIGAR char leaves the end position alone
+        if (movesEnd || paddedInsertion) endPosition += cigarOp.Length;
+    }
+
+    // a closed interval ends one base earlier than the half-open one
+    if (closedInterval) endPosition -= 1;
+
+    return endPosition;
+}
+
+/*! \fn std::string BamAlignment::GetErrorString() const
+    \brief Returns a human-readable description of the last error that occurred
+
+    Errors are recorded on the alignment instead of being printed to STDERR, so that
+    client code decides how (and whether) to show them to the user.
+
+    \return error description
+*/
+std::string BamAlignment::GetErrorString() const
+{
+    // hand back a copy of the stored message
+    return std::string(ErrorString);
+}
+
+/*! \fn bool BamAlignment::GetSoftClips(std::vector<int>& clipSizes, std::vector<int>& readPositions, std::vector<int>& genomePositions, bool usePadded = false) const
+    \brief Identifies if an alignment has a soft clip. If so, identifies the
+           sizes of the soft clips, as well as their positions in the read and reference.
+
+    Results are appended to the output vectors; existing contents are left in place.
+
+    \param[out] clipSizes       vector of the sizes of each soft clip in the alignment
+    \param[out] readPositions   vector of the 0-based read locations of each soft clip in the alignment.
+                                These positions are basically indexes within the read, not genomic positions.
+    \param[out] genomePositions vector of the 0-based genome locations of each soft clip in the alignment
+    \param[in]  usePadded       inserted bases affect reported position. Default is false, so that
+                                reported position stays 'sync-ed' with reference coordinates.
+
+    \return \c true if any soft clips were found in the alignment
+*/
+bool BamAlignment::GetSoftClips(std::vector<int>& clipSizes, std::vector<int>& readPositions,
+                                std::vector<int>& genomePositions, bool usePadded) const
+{
+    // initialize positions & flags
+    int refPosition = Position;
+    int readPosition = 0;
+    bool softClipFound = false;
+    bool firstCigarOp = true;
+
+    // iterate over cigar operations
+    std::vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
+    std::vector<CigarOp>::const_iterator cigarEnd = CigarData.end();
+    for (; cigarIter != cigarEnd; ++cigarIter) {
+        const CigarOp& op = (*cigarIter);
+
+        switch (op.Type) {
+
+            // CIGAR chars [MX=] consume bases of both the read and the reference
+            case Constants::BAM_CIGAR_MATCH_CHAR:
+            case Constants::BAM_CIGAR_MISMATCH_CHAR:
+            case Constants::BAM_CIGAR_SEQMATCH_CHAR:
+                refPosition += op.Length;
+                readPosition += op.Length;
+                break;
+
+            // CIGAR chars [DN] consume reference bases only: the read has no bases
+            // there, so the read index must not move
+            case Constants::BAM_CIGAR_DEL_CHAR:
+            case Constants::BAM_CIGAR_REFSKIP_CHAR:
+                refPosition += op.Length;
+                break;
+
+            // increase read position on insertion, genome position only if @usePadded is true
+            case Constants::BAM_CIGAR_INS_CHAR:
+                readPosition += op.Length;
+                if (usePadded) refPosition += op.Length;
+                break;
+
+            case Constants::BAM_CIGAR_SOFTCLIP_CHAR:
+
+                softClipFound = true;
+
+                //////////////////////////////////////////////////////////////////////////////
+                // if we are dealing with the *first* CIGAR operation
+                // for this alignment, we increment the read position so that
+                // the read and genome position of the clip are referring to the same base.
+                // For example, in the alignment below, the ref position would be 4, yet
+                // the read position would be 0. Thus, to "sync" the two,
+                // we need to increment the read position by the length of the
+                // soft clip.
+                // Read:  ATCGTTTCGTCCCTGC
+                // Ref:   GGGATTTCGTCCCTGC
+                // Cigar: SSSSMMMMMMMMMMMM
+                //
+                // NOTE: This only needs to be done if the soft clip is the _first_ CIGAR op.
+                //////////////////////////////////////////////////////////////////////////////
+                if (firstCigarOp) readPosition += op.Length;
+
+                // track the soft clip's size, read position, and genome position
+                clipSizes.push_back(op.Length);
+                readPositions.push_back(readPosition);
+                genomePositions.push_back(refPosition);
+                break;
+
+            // any other CIGAR operations have no effect
+            default:
+                break;
+        }
+
+        // clear our "first pass" flag
+        firstCigarOp = false;
+    }
+
+    // return whether any soft clips found
+    return softClipFound;
+}
+
+/*! \fn std::vector<std::string> BamAlignment::GetTagNames() const
+    \brief Retrieves the BAM tag names.
+
+    When paired with GetTagType() and GetTag(), this method allows you
+    to iterate over an alignment's tag data without knowing the names (or types)
+    beforehand.
+
+    Parsing stops at the first tag whose type-code is unrecognized or whose value runs
+    past the end of the tag data; names collected up to that point are still returned.
+
+    \return \c vector containing all tag names found (empty if none available)
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+std::vector<std::string> BamAlignment::GetTagNames() const
+{
+
+    std::vector<std::string> result;
+    if (SupportData.HasCoreOnly || TagData.empty()) return result;
+
+    // every tag record starts with a 2-byte name followed by a 1-byte storage type
+    const unsigned int headerSize = 3;
+
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // only parse a record while a complete header still fits in the buffer
+    while (numBytesParsed < tagDataLength && (tagDataLength - numBytesParsed) >= headerSize) {
+
+        // get current tag name & type
+        const char* pTagName = pTagData;
+        const char storageType = pTagData[2];
+        pTagData += headerSize;
+        numBytesParsed += headerSize;
+
+        // store tag name
+        result.push_back(std::string(pTagName, 2));
+
+        // find the next tag
+        if (storageType == '\0') break;
+        if (!SkipToNextTag(storageType, pTagData, numBytesParsed)) break;
+
+        // a corrupt value can push the cursor beyond the buffer, so check the offset
+        // before looking at the byte under the cursor
+        if (numBytesParsed >= tagDataLength) break;
+        if (*pTagData == '\0') break;
+    }
+
+    return result;
+}
+
+/*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const
+    \brief Retrieves the BAM tag type-code associated with requested tag name.
+
+    \param[in]  tag  2-character tag name
+    \param[out] type retrieved (1-character) type-code; written whenever the tag is found,
+                     even if the code turns out to be unrecognized
+
+    \return \c true if found (and the type-code is a known one)
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+bool BamAlignment::GetTagType(const std::string& tag, char& type) const
+{
+    // nothing to search when the alignment is core-only or carries no tags
+    // TODO: set error string?
+    if (SupportData.HasCoreOnly || TagData.empty()) return false;
+
+    // set up a cursor over the raw tag bytes
+    char* cursor = (char*)TagData.data();
+    const unsigned int totalBytes = TagData.size();
+    unsigned int offset = 0;
+
+    // TODO: set error string?
+    if (!FindTag(tag, cursor, totalBytes, offset)) return false;
+
+    // FindTag leaves the cursor on the tag's value; the type-code is the byte before it
+    type = *(cursor - 1);
+
+    const bool isKnownType = (type == Constants::BAM_TAG_TYPE_ASCII) ||
+                             (type == Constants::BAM_TAG_TYPE_INT8) ||
+                             (type == Constants::BAM_TAG_TYPE_UINT8) ||
+                             (type == Constants::BAM_TAG_TYPE_INT16) ||
+                             (type == Constants::BAM_TAG_TYPE_UINT16) ||
+                             (type == Constants::BAM_TAG_TYPE_INT32) ||
+                             (type == Constants::BAM_TAG_TYPE_UINT32) ||
+                             (type == Constants::BAM_TAG_TYPE_FLOAT) ||
+                             (type == Constants::BAM_TAG_TYPE_STRING) ||
+                             (type == Constants::BAM_TAG_TYPE_HEX) ||
+                             (type == Constants::BAM_TAG_TYPE_ARRAY);
+    if (isKnownType) return true;
+
+    // unknown tag type: record what was seen and report failure
+    const std::string message = std::string("invalid tag type: ") + type;
+    SetErrorString("BamAlignment::GetTagType", message);
+    return false;
+}
+
+/*! \fn bool BamAlignment::HasTag(const std::string& tag) const
+    \brief Reports whether the alignment's tag data contains the requested tag.
+
+    \param[in] tag 2-character tag name
+    \return \c true if alignment has a record for tag
+*/
+bool BamAlignment::HasTag(const std::string& tag) const
+{
+    // a core-only alignment, or one without tag data, cannot contain the tag
+    const bool nothingToSearch = SupportData.HasCoreOnly || TagData.empty();
+    if (nothingToSearch) return false;
+
+    // set up a cursor over the raw tag bytes and let FindTag do the search
+    char* cursor = (char*)TagData.data();
+    const unsigned int totalBytes = TagData.size();
+    unsigned int offset = 0;
+
+    const bool found = FindTag(tag, cursor, totalBytes, offset);
+    return found;
+}
+
+/*! \fn bool BamAlignment::IsDuplicate() const
+    \return \c true if this read is flagged as a PCR duplicate
+*/
+bool BamAlignment::IsDuplicate() const
+{
+    const bool duplicateBitSet = (AlignmentFlag & Constants::BAM_ALIGNMENT_DUPLICATE) != 0;
+    return duplicateBitSet;
+}
+
+/*! \fn bool BamAlignment::IsFailedQC() const
+    \return \c true if this read is flagged as having failed quality control
+*/
+bool BamAlignment::IsFailedQC() const
+{
+    const bool qcFailedBitSet = (AlignmentFlag & Constants::BAM_ALIGNMENT_QC_FAILED) != 0;
+    return qcFailedBitSet;
+}
+
+/*! \fn bool BamAlignment::IsFirstMate() const
+    \return \c true if alignment is flagged as the first mate of a paired-end read
+*/
+bool BamAlignment::IsFirstMate() const
+{
+    const bool read1BitSet = (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_1) != 0;
+    return read1BitSet;
+}
+
+/*! \fn bool BamAlignment::IsMapped() const
+    \return \c true if alignment is mapped (i.e. the "unmapped" flag bit is clear)
+*/
+bool BamAlignment::IsMapped() const
+{
+    const bool unmappedBitSet = (AlignmentFlag & Constants::BAM_ALIGNMENT_UNMAPPED) != 0;
+    return !unmappedBitSet;
+}
+
+/*! \fn bool BamAlignment::IsMateMapped() const
+    \return \c true if alignment's mate is mapped (i.e. the "mate unmapped" flag bit is clear)
+*/
+bool BamAlignment::IsMateMapped() const
+{
+    const bool mateUnmappedBitSet =
+        (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_UNMAPPED) != 0;
+    return !mateUnmappedBitSet;
+}
+
+/*! \fn bool BamAlignment::IsMateReverseStrand() const
+    \return \c true if alignment's mate is flagged as mapped to the reverse strand
+*/
+bool BamAlignment::IsMateReverseStrand() const
+{
+    const bool mateReverseBitSet =
+        (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND) != 0;
+    return mateReverseBitSet;
+}
+
+/*! \fn bool BamAlignment::IsPaired() const
+    \return \c true if alignment is flagged as part of a paired-end read
+*/
+bool BamAlignment::IsPaired() const
+{
+    const bool pairedBitSet = (AlignmentFlag & Constants::BAM_ALIGNMENT_PAIRED) != 0;
+    return pairedBitSet;
+}
+
+/*! \fn bool BamAlignment::IsPrimaryAlignment() const
+    \return \c true if reported position is primary alignment (i.e. the "secondary" flag bit is clear)
+*/
+bool BamAlignment::IsPrimaryAlignment() const
+{
+    const bool secondaryBitSet = (AlignmentFlag & Constants::BAM_ALIGNMENT_SECONDARY) != 0;
+    return !secondaryBitSet;
+}
+
+/*! \fn bool BamAlignment::IsProperPair() const
+    \return \c true if alignment is part of a read that satisfied paired-end resolution
+*/
+bool BamAlignment::IsProperPair() const
+{
+    const bool properPairBitSet =
+        (AlignmentFlag & Constants::BAM_ALIGNMENT_PROPER_PAIR) != 0;
+    return properPairBitSet;
+}
+
+/*! \fn bool BamAlignment::IsReverseStrand() const
+    \return \c true if alignment is flagged as mapped to the reverse strand
+*/
+bool BamAlignment::IsReverseStrand() const
+{
+    const bool reverseBitSet =
+        (AlignmentFlag & Constants::BAM_ALIGNMENT_REVERSE_STRAND) != 0;
+    return reverseBitSet;
+}
+
+/*! \fn bool BamAlignment::IsSecondMate() const
+    \return \c true if alignment is flagged as the second mate of a read
+*/
+bool BamAlignment::IsSecondMate() const
+{
+    const bool read2BitSet = (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_2) != 0;
+    return read2BitSet;
+}
+
+/*! \fn bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const
+    \internal
+
+    Verifies that the tag name and the type-code string have the lengths given by
+    Constants::BAM_TAG_TAGSIZE and Constants::BAM_TAG_TYPESIZE respectively.
+
+    \param[in] tag  BAM tag name
+    \param[in] type BAM tag type-code
+    \return \c true if both input strings are valid sizes
+*/
+bool BamAlignment::IsValidSize(const std::string& tag, const std::string& type) const
+{
+    // a wrong-sized name fails immediately; otherwise the type-code length decides
+    if (tag.size() != Constants::BAM_TAG_TAGSIZE) return false;
+    return type.size() == Constants::BAM_TAG_TYPESIZE;
+}
+
+/*! \fn void BamAlignment::RemoveTag(const std::string& tag)
+    \brief Removes field from BAM tags.
+
+    Does nothing if the tag is not present, or if the tag's value cannot be measured
+    (unrecognized type-code, or a value that claims more bytes than the tag data holds).
+
+    \param[in] tag 2-character name of field to remove
+*/
+void BamAlignment::RemoveTag(const std::string& tag)
+{
+
+    // if char data not populated, do that first
+    if (SupportData.HasCoreOnly) BuildCharData();
+
+    // skip if no tags available
+    if (TagData.empty()) return;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // skip if tag not found
+    if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) return;
+
+    // FindTag stops just past the 3-byte header (2-byte name + 1-byte type-code),
+    // so the record itself starts 3 bytes earlier and the type-code is the previous byte
+    const unsigned int tagBegin = numBytesParsed - 3;
+    const char storageType = *(pTagData - 1);
+
+    // measure the value by stepping over it; leave TagData untouched on failure
+    if (!SkipToNextTag(storageType, pTagData, numBytesParsed)) return;
+
+    // a corrupt value may claim more bytes than exist - do not remove anything then
+    if (numBytesParsed > tagDataLength) return;
+
+    // cut the whole record (header + value) out of the tag data in place;
+    // pTagData is not used past this point, since erase() invalidates it
+    TagData.erase(tagBegin, numBytesParsed - tagBegin);
+}
+
+/*! \fn void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const
+    \internal
+
+    Stores the error message "<where>: <what>" on this alignment.
+
+    \param[in] where class/method where error occurred
+    \param[in] what  description of error
+*/
+void BamAlignment::SetErrorString(const std::string& where, const std::string& what) const
+{
+    // assemble the message separately, then store it in one step
+    std::string formatted(where);
+    formatted += ": ";
+    formatted += what;
+    ErrorString = formatted;
+}
+
+/*! \fn void BamAlignment::SetIsDuplicate(bool ok)
+    \brief Turns the "PCR duplicate" flag bit on when \a ok is true, off otherwise.
+*/
+void BamAlignment::SetIsDuplicate(bool ok)
+{
+    AlignmentFlag = ok ? (AlignmentFlag | Constants::BAM_ALIGNMENT_DUPLICATE)
+                       : (AlignmentFlag & ~Constants::BAM_ALIGNMENT_DUPLICATE);
+}
+
+/*! \fn void BamAlignment::SetIsFailedQC(bool ok)
+    \brief Turns the "failed quality control" flag bit on when \a ok is true, off otherwise.
+*/
+void BamAlignment::SetIsFailedQC(bool ok)
+{
+    AlignmentFlag = ok ? (AlignmentFlag | Constants::BAM_ALIGNMENT_QC_FAILED)
+                       : (AlignmentFlag & ~Constants::BAM_ALIGNMENT_QC_FAILED);
+}
+
+/*! \fn void BamAlignment::SetIsFirstMate(bool ok)
+    \brief Turns the "alignment is first mate" flag bit on when \a ok is true, off otherwise.
+*/
+void BamAlignment::SetIsFirstMate(bool ok)
+{
+    AlignmentFlag = ok ? (AlignmentFlag | Constants::BAM_ALIGNMENT_READ_1)
+                       : (AlignmentFlag & ~Constants::BAM_ALIGNMENT_READ_1);
+}
+
+/*! \fn void BamAlignment::SetIsMapped(bool ok)
+    \brief Marks the alignment as mapped (\a ok true) or unmapped (\a ok false).
+
+    The underlying flag bit means "unmapped", so it is cleared when \a ok is true.
+*/
+void BamAlignment::SetIsMapped(bool ok)
+{
+    AlignmentFlag = ok ? (AlignmentFlag & ~Constants::BAM_ALIGNMENT_UNMAPPED)
+                       : (AlignmentFlag | Constants::BAM_ALIGNMENT_UNMAPPED);
+}
+
+/*! \fn void BamAlignment::SetIsMateMapped(bool ok)
+    \brief Marks the alignment's mate as mapped (\a ok true) or unmapped (\a ok false).
+
+    The underlying flag bit means "mate unmapped", so it is cleared when \a ok is true.
+*/
+void BamAlignment::SetIsMateMapped(bool ok)
+{
+    AlignmentFlag = ok ? (AlignmentFlag & ~Constants::BAM_ALIGNMENT_MATE_UNMAPPED)
+                       : (AlignmentFlag | Constants::BAM_ALIGNMENT_MATE_UNMAPPED);
+}
+
+/*! \fn void BamAlignment::SetIsMateReverseStrand(bool ok)
+    \brief Turns the "alignment's mate mapped to reverse strand" flag bit on when \a ok is true, off otherwise.
+*/
+void BamAlignment::SetIsMateReverseStrand(bool ok)
+{
+    AlignmentFlag = ok ? (AlignmentFlag | Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND)
+                       : (AlignmentFlag & ~Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND);
+}
+
+/*! \fn void BamAlignment::SetIsPaired(bool ok)
+    \brief Turns the "alignment part of paired-end read" flag bit on when \a ok is true, off otherwise.
+*/
+void BamAlignment::SetIsPaired(bool ok)
+{
+    AlignmentFlag = ok ? (AlignmentFlag | Constants::BAM_ALIGNMENT_PAIRED)
+                       : (AlignmentFlag & ~Constants::BAM_ALIGNMENT_PAIRED);
+}
+
+/*! \fn void BamAlignment::SetIsPrimaryAlignment(bool ok)
+    \brief Marks the reported position as the primary (\a ok true) or a secondary (\a ok false) alignment.
+
+    The underlying flag bit means "secondary", so it is cleared when \a ok is true.
+*/
+void BamAlignment::SetIsPrimaryAlignment(bool ok)
+{
+    AlignmentFlag = ok ? (AlignmentFlag & ~Constants::BAM_ALIGNMENT_SECONDARY)
+                       : (AlignmentFlag | Constants::BAM_ALIGNMENT_SECONDARY);
+}
+
+/*! \fn void BamAlignment::SetIsProperPair(bool ok)
+    \brief Turns the "alignment is part of read that satisfied paired-end resolution" flag bit
+           on when \a ok is true, off otherwise.
+*/
+void BamAlignment::SetIsProperPair(bool ok)
+{
+    AlignmentFlag = ok ? (AlignmentFlag | Constants::BAM_ALIGNMENT_PROPER_PAIR)
+                       : (AlignmentFlag & ~Constants::BAM_ALIGNMENT_PROPER_PAIR);
+}
+
+/*! \fn void BamAlignment::SetIsReverseStrand(bool ok)
+    \brief Turns the "alignment mapped to reverse strand" flag bit on when \a ok is true, off otherwise.
+*/
+void BamAlignment::SetIsReverseStrand(bool ok)
+{
+    AlignmentFlag = ok ? (AlignmentFlag | Constants::BAM_ALIGNMENT_REVERSE_STRAND)
+                       : (AlignmentFlag & ~Constants::BAM_ALIGNMENT_REVERSE_STRAND);
+}
+
+/*! \fn void BamAlignment::SetIsSecondMate(bool ok)
+    \brief Turns the "alignment is second mate on read" flag bit on when \a ok is true, off otherwise.
+*/
+void BamAlignment::SetIsSecondMate(bool ok)
+{
+    AlignmentFlag = ok ? (AlignmentFlag | Constants::BAM_ALIGNMENT_READ_2)
+                       : (AlignmentFlag & ~Constants::BAM_ALIGNMENT_READ_2);
+}
+
+/*! \fn bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) const
+    \internal
+
+    Moves to next available tag in tag data string
+
+    The function does not know where the tag data ends; callers must compare
+    \a numBytesParsed against the tag data length before reading at the cursor.
+
+    \param[in]     storageType    BAM tag type-code that determines how far to move cursor
+    \param[in,out] pTagData       pointer to current position (cursor) in tag string
+    \param[in,out] numBytesParsed report of how many bytes were parsed (cumulatively)
+
+    \return \c true if storageType was a recognized BAM tag type and the value could be
+            skipped; \c false (with the error string set) for an unrecognized tag or
+            array element type, or an array length that cannot be valid
+
+    \post On success, \a pTagData will point to the byte where the next tag data begins.
+          \a numBytesParsed will correspond to the cursor's position in the full TagData string.
+*/
+bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData,
+                                 unsigned int& numBytesParsed) const
+{
+    switch (storageType) {
+
+        // 1-byte values
+        case (Constants::BAM_TAG_TYPE_ASCII):
+        case (Constants::BAM_TAG_TYPE_INT8):
+        case (Constants::BAM_TAG_TYPE_UINT8):
+            ++numBytesParsed;
+            ++pTagData;
+            break;
+
+        // 2-byte values
+        case (Constants::BAM_TAG_TYPE_INT16):
+        case (Constants::BAM_TAG_TYPE_UINT16):
+            numBytesParsed += sizeof(uint16_t);
+            pTagData += sizeof(uint16_t);
+            break;
+
+        // 4-byte values
+        case (Constants::BAM_TAG_TYPE_FLOAT):
+        case (Constants::BAM_TAG_TYPE_INT32):
+        case (Constants::BAM_TAG_TYPE_UINT32):
+            numBytesParsed += sizeof(uint32_t);
+            pTagData += sizeof(uint32_t);
+            break;
+
+        // null-terminated text
+        case (Constants::BAM_TAG_TYPE_STRING):
+        case (Constants::BAM_TAG_TYPE_HEX):
+            while (*pTagData) {
+                ++numBytesParsed;
+                ++pTagData;
+            }
+            // increment for null-terminator
+            ++numBytesParsed;
+            ++pTagData;
+            break;
+
+        // array: 1-byte element type, 4-byte element count, then the elements
+        case (Constants::BAM_TAG_TYPE_ARRAY):
+
+        {
+            // read array type
+            const char arrayType = *pTagData;
+            ++numBytesParsed;
+            ++pTagData;
+
+            // read number of elements
+            int32_t numElements;
+            memcpy(&numElements, pTagData, sizeof(int32_t));  // already endian-swapped, if needed
+            numBytesParsed += sizeof(uint32_t);
+            pTagData += sizeof(uint32_t);
+
+            // look up the size of one element
+            unsigned int elementSize = 0;
+            switch (arrayType) {
+                case (Constants::BAM_TAG_TYPE_INT8):
+                case (Constants::BAM_TAG_TYPE_UINT8):
+                    elementSize = 1;
+                    break;
+                case (Constants::BAM_TAG_TYPE_INT16):
+                case (Constants::BAM_TAG_TYPE_UINT16):
+                    elementSize = sizeof(uint16_t);
+                    break;
+                case (Constants::BAM_TAG_TYPE_FLOAT):
+                case (Constants::BAM_TAG_TYPE_INT32):
+                case (Constants::BAM_TAG_TYPE_UINT32):
+                    elementSize = sizeof(uint32_t);
+                    break;
+                default:
+                    const std::string message =
+                        std::string("invalid binary array type: ") + arrayType;
+                    SetErrorString("BamAlignment::SkipToNextTag", message);
+                    return false;
+            }
+
+            // A negative count would move the cursor backwards, and an oversized one
+            // would wrap the unsigned byte counter. Neither can describe real tag data,
+            // so treat both as corruption instead of skipping by a bogus amount.
+            const unsigned int bytesLeftInCounter = static_cast<unsigned int>(-1) - numBytesParsed;
+            if (numElements < 0 ||
+                static_cast<unsigned int>(numElements) > bytesLeftInCounter / elementSize) {
+                SetErrorString("BamAlignment::SkipToNextTag", "invalid binary array length");
+                return false;
+            }
+
+            // skip binary array contents
+            const unsigned int bytesToSkip = static_cast<unsigned int>(numElements) * elementSize;
+            numBytesParsed += bytesToSkip;
+            pTagData += bytesToSkip;
+            break;
+        }
+
+        default:
+            const std::string message = std::string("invalid tag type: ") + storageType;
+            SetErrorString("BamAlignment::SkipToNextTag", message);
+            return false;
+    }
+
+    // if we get here, tag skipped OK - return success
+    return true;
+}
diff --git a/src/api/BamAlignment.h b/src/api/BamAlignment.h
new file mode 100644
index 0000000..6491807
--- /dev/null
+++ b/src/api/BamAlignment.h
@@ -0,0 +1,644 @@
+// ***************************************************************************
+// BamAlignment.h (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 July 2013 (DB)
+// ---------------------------------------------------------------------------
+// Provides the BamAlignment data structure
+// ***************************************************************************
+
+#ifndef BAMALIGNMENT_H
+#define BAMALIGNMENT_H
+
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+#include "api/BamAux.h"
+#include "api/BamConstants.h"
+#include "api/api_global.h"
+
+namespace BamTools {
+
+//! \cond
+// forward declaration of BamAlignment's "friends"
+namespace Internal {
+class BamReaderPrivate;
+class BamWriterPrivate;
+} // namespace Internal
+//! \endcond
+
+// BamAlignment: a single BAM alignment record (public data fields + flag/tag/CIGAR helper methods)
+class API_EXPORT BamAlignment
+{
+
+ // constructors & destructor
+public:
+ BamAlignment();
+ BamAlignment(const BamAlignment& other); // copy constructor
+ ~BamAlignment();
+
+ // queries against alignment flags (each one tests a single bit of AlignmentFlag)
+public:
+ bool IsDuplicate() const; // returns true if this read is a PCR duplicate
+ bool IsFailedQC() const; // returns true if this read failed quality control
+ bool IsFirstMate() const; // returns true if alignment is first mate on read
+ bool IsMapped() const; // returns true if alignment is mapped ("unmapped" bit is clear)
+ bool IsMateMapped() const; // returns true if alignment's mate is mapped ("mate unmapped" bit is clear)
+ bool IsMateReverseStrand() const; // returns true if alignment's mate mapped to reverse strand
+ bool IsPaired() const; // returns true if alignment part of paired-end read
+ bool IsPrimaryAlignment() const; // returns true if reported position is primary alignment ("secondary" bit is clear)
+ bool IsProperPair()
+ const; // returns true if alignment is part of read that satisfied paired-end resolution
+ bool IsReverseStrand() const; // returns true if alignment mapped to reverse strand
+ bool IsSecondMate() const; // returns true if alignment is second mate on read
+
+ // manipulate alignment flags (each one sets or clears a single bit of AlignmentFlag)
+public:
+ void SetIsDuplicate(bool ok); // sets value of "PCR duplicate" flag
+ void SetIsFailedQC(bool ok); // sets value of "failed quality control" flag
+ void SetIsFirstMate(bool ok); // sets value of "alignment is first mate" flag
+ void SetIsMapped(bool ok); // sets value of "alignment is mapped" flag
+ void SetIsMateMapped(bool ok); // sets value of "alignment's mate is mapped" flag
+ void SetIsMateReverseStrand(
+ bool ok); // sets value of "alignment's mate mapped to reverse strand" flag
+ void SetIsPaired(bool ok); // sets value of "alignment part of paired-end read" flag
+ void SetIsPrimaryAlignment(bool ok); // sets value of "position is primary alignment" flag
+ void SetIsProperPair(
+ bool
+ ok); // sets value of "alignment is part of read that satisfied paired-end resolution" flag
+ void SetIsReverseStrand(bool ok); // sets value of "alignment mapped to reverse strand" flag
+ void SetIsSecondMate(bool ok); // sets value of "alignment is second mate on read" flag
+
+ // tag data access methods (all of them operate on the raw bytes held in TagData)
+public:
+ // add a new tag (returns false if the tag already exists - use EditTag for that)
+ template <typename T>
+ bool AddTag(const std::string& tag, const std::string& type, const T& value);
+ template <typename T>
+ bool AddTag(const std::string& tag, const std::vector<T>& values);
+
+ // edit (or append) tag
+ template <typename T>
+ bool EditTag(const std::string& tag, const std::string& type, const T& value);
+ template <typename T>
+ bool EditTag(const std::string& tag, const std::vector<T>& values);
+
+ // retrieves tag data
+ template <typename T>
+ bool GetTag(const std::string& tag, T& destination) const;
+ template <typename T>
+ bool GetTag(const std::string& tag, std::vector<T>& destination) const;
+
+ // retrieves all current tag names (empty vector if none available)
+ std::vector<std::string> GetTagNames() const;
+
+ // retrieves the SAM/BAM type-code for requested tag name
+ bool GetTagType(const std::string& tag, char& type) const;
+
+ // retrieves the SAM/BAM type-code for the data elements in an array tag
+ bool GetArrayTagType(const std::string& tag, char& type) const;
+
+ // returns true if alignment has a record for this tag name
+ bool HasTag(const std::string& tag) const;
+
+ // removes a tag (does nothing if the tag is not present)
+ void RemoveTag(const std::string& tag);
+
+ // additional methods
+public:
+ // populates alignment string fields
+ bool BuildCharData();
+
+ // calculates alignment end position (0-based; half-open unless closedInterval is true)
+ int GetEndPosition(bool usePadded = false, bool closedInterval = false) const;
+
+ // returns a description of the last error that occurred
+ std::string GetErrorString() const;
+
+ // retrieves the size, read locations and reference locations of soft-clip operations
+ bool GetSoftClips(std::vector<int>& clipSizes, std::vector<int>& readPositions,
+ std::vector<int>& genomePositions, bool usePadded = false) const;
+
+ // public data fields
+public:
+ std::string Name; // read name
+ int32_t Length; // length of query sequence
+ std::string QueryBases; // 'original' sequence (contained in BAM file)
+ std::string
+ AlignedBases; // 'aligned' sequence (QueryBases plus deletion, padding, clipping chars)
+ std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
+ std::string TagData; // tag data (use provided methods to query/modify)
+ int32_t RefID; // ID number for reference sequence
+ int32_t Position; // position (0-based) where alignment starts
+ uint16_t Bin; // BAM (standard) index bin number for this alignment
+ uint16_t MapQuality; // mapping quality score
+ uint32_t AlignmentFlag; // alignment bit-flag (use provided methods to query/modify)
+ std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
+ int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
+ int32_t MatePosition; // position (0-based) where alignment's mate starts
+ int32_t InsertSize; // mate-pair insert size
+ std::string Filename; // name of BAM file which this alignment comes from
+
+ //! \internal
+ // internal utility methods
+private:
+ bool FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength,
+ unsigned int& numBytesParsed) const; // searches tag data; on success the cursor sits on the tag's value
+ bool IsValidSize(const std::string& tag, const std::string& type) const; // checks tag name & type-code lengths
+ void SetErrorString(const std::string& where, const std::string& what) const; // stores "where: what" in ErrorString
+ bool SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) const; // moves cursor past one tag value
+
+ // internal data
+private:
+ struct BamAlignmentSupportData
+ {
+
+ // data members (NOTE(review): presumably filled in by the BAM reader from the raw record - confirm)
+ std::string AllCharData;
+ uint32_t BlockLength;
+ uint32_t NumCigarOperations;
+ uint32_t QueryNameLength;
+ uint32_t QuerySequenceLength;
+ bool HasCoreOnly; // while true, tag queries report nothing and tag mutators call BuildCharData() first
+
+ // constructor: zeroes the numeric fields; HasCoreOnly starts out false
+ BamAlignmentSupportData()
+ : BlockLength(0)
+ , NumCigarOperations(0)
+ , QueryNameLength(0)
+ , QuerySequenceLength(0)
+ , HasCoreOnly(false)
+ {}
+ };
+ BamAlignmentSupportData SupportData;
+ friend class Internal::BamReaderPrivate; // friendship grants access to the private members above
+ friend class Internal::BamWriterPrivate;
+
+ mutable std::string ErrorString; // mutable to allow updates even in logically const methods
+ //! \endinternal
+};
+
+// ---------------------------------------------------------
+// BamAlignment tag access methods
+
+/*! \fn bool AddTag(const std::string& tag, const std::string& type, const T& value)
+    \brief Adds a field to the BAM tags.
+
+    Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
+
+    The new record is appended to TagData as: 2-byte tag name, 1-byte type-code, then the
+    sizeof(T) raw bytes of \a value.
+
+    \param[in] tag   2-character tag name
+    \param[in] type  1-character tag type
+    \param[in] value data to store
+    \return \c true if the \b new tag was added successfully; \c false if \a tag or \a type
+            has the wrong length, \a type is not a valid storage type for T, or the tag
+            already exists
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+template <typename T>
+inline bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const T& value)
+{
+
+    // if char data not populated, do that first
+    if (SupportData.HasCoreOnly) BuildCharData();
+
+    // check tag/type size
+    if (!IsValidSize(tag, type)) {
+        // TODO: set error string?
+        return false;
+    }
+
+    // check that storage type code is OK for T
+    if (!TagTypeHelper<T>::CanConvertTo(type.at(0))) {
+        // TODO: set error string?
+        return false;
+    }
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
+        // TODO: set error string?
+        return false;
+    }
+
+    // otherwise, capture the raw bytes of the value
+    char valueBuffer[sizeof(T)];
+    memcpy(valueBuffer, &value, sizeof(T));
+
+    // assemble the complete record first, then append it in a single step so that
+    // TagData is never left holding half a record
+    std::string newTag = tag + type;
+    newTag.append(valueBuffer, sizeof(T));
+    TagData.append(newTag);
+    return true;
+}
+
+// Specialization of AddTag for string values: appends the 2-byte tag name, the 1-byte
+// type-code, the characters of the value and a terminating null byte to TagData.
+// Returns false (leaving TagData unchanged) if tag/type have the wrong length, the
+// type-code is not valid for strings, or the tag already exists.
+template <>
+inline bool BamAlignment::AddTag<std::string>(const std::string& tag, const std::string& type,
+                                              const std::string& value)
+{
+    // if char data not populated, do that first
+    if (SupportData.HasCoreOnly) BuildCharData();
+
+    // check tag/type size
+    if (!IsValidSize(tag, type)) {
+        // TODO: set error string?
+        return false;
+    }
+
+    // check that storage type code is OK for string
+    if (!TagTypeHelper<std::string>::CanConvertTo(type.at(0))) {
+        // TODO: set error string?
+        return false;
+    }
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
+        // TODO: set error string?
+        return false;
+    }
+
+    // otherwise, assemble the complete record: name + type + text + null-terminator
+    std::string newTag = tag + type + value;
+    newTag += '\0';
+
+    // append it in a single step so that TagData is never left holding half a record
+    TagData.append(newTag);
+    return true;
+}
+
+/*! \fn template<typename T> bool AddTag(const std::string& tag, const std::vector<T>& values)
+    \brief Adds a numeric array field to the BAM tags.
+
+    Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
+
+    The entry appended to the tag data is: the 2-byte tag name, the array type
+    code, the element type code for \a T, a 32-bit element count and finally
+    the raw element bytes.
+
+    \param[in] tag    2-character tag name
+    \param[in] values vector of data values to store
+    \return \c true if the \b new tag was added successfully; \c false if the
+            tag name has the wrong length, the tag already exists, or the
+            element count does not fit in a signed 32-bit integer
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+template <typename T>
+inline bool BamAlignment::AddTag(const std::string& tag, const std::vector<T>& values)
+{
+
+    // if char data not populated, do that first
+    if (SupportData.HasCoreOnly) BuildCharData();
+
+    // check for valid tag name length
+    if (tag.size() != Constants::BAM_TAG_TAGSIZE) return false;
+
+    // the element count is stored as int32_t; refuse anything that cannot be represented
+    if (values.size() > static_cast<std::size_t>(0x7FFFFFFF)) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
+        // TODO: set error string?
+        return false;
+    }
+
+    // build new tag's base information
+    char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
+    memcpy(newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE);
+    newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
+    newTagBase[3] = TagTypeHelper<T>::TypeCode();
+
+    // add number of array elements to newTagBase
+    const int32_t numElements = static_cast<int32_t>(values.size());
+    memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
+
+    // allocate room for the current tag data plus the new tag's contents
+    const std::size_t elementBytes = static_cast<std::size_t>(numElements) * sizeof(T);
+    const std::size_t newTagDataLength =
+        tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE + elementBytes;
+    RaiiBuffer originalTagData(newTagDataLength);
+    memcpy(originalTagData.Buffer, TagData.data(), tagDataLength);
+
+    // write newTagBase as raw bytes: it is binary data, NOT a C-string (the
+    // element count usually contains zero bytes and the array has no
+    // terminator), so string functions such as strcat must not be used here
+    memcpy(originalTagData.Buffer + tagDataLength, newTagBase, Constants::BAM_TAG_ARRAYBASE_SIZE);
+
+    // add vector elements to tag (vector storage is contiguous)
+    if (numElements > 0) {
+        memcpy(originalTagData.Buffer + tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE,
+               &values[0], elementBytes);
+    }
+
+    // store temp buffer back in TagData
+    const char* newTagData = (const char*)originalTagData.Buffer;
+    TagData.assign(newTagData, newTagDataLength);
+    return true;
+}
+
+/*! \fn template<typename T> bool EditTag(const std::string& tag, const std::string& type, const T& value)
+    \brief Replaces the value of a BAM tag field, creating the tag if it is absent.
+
+    Any existing entry for \a tag is removed first; the new entry is then
+    appended through AddTag(), whose result is returned.
+
+    \param[in] tag   2-character tag name
+    \param[in] type  1-character tag type code, passed through to AddTag()
+    \param[in] value new data value
+
+    \return \c true if the tag was modified/created successfully
+
+    \sa BamAlignment::RemoveTag()
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+template <typename T>
+inline bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const T& value)
+{
+    // populate the character data first when only the core is loaded
+    if (SupportData.HasCoreOnly) {
+        BuildCharData();
+    }
+
+    // AddTag() refuses duplicates, so drop the current entry (if any) beforehand
+    const bool tagAlreadyPresent = HasTag(tag);
+    if (tagAlreadyPresent) {
+        RemoveTag(tag);
+    }
+
+    return AddTag(tag, type, value);
+}
+
+/*! \fn template<typename T> bool EditTag(const std::string& tag, const std::vector<T>& values)
+    \brief Replaces a numeric-array BAM tag field, creating the tag if it is absent.
+
+    Any existing entry for \a tag is removed first; the new array is then
+    appended through AddTag(), whose result is returned.
+
+    \param[in] tag    2-character tag name
+    \param[in] values vector of data values
+
+    \return \c true if the tag was modified/created successfully
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+template <typename T>
+inline bool BamAlignment::EditTag(const std::string& tag, const std::vector<T>& values)
+{
+    // populate the character data first when only the core is loaded
+    if (SupportData.HasCoreOnly) {
+        BuildCharData();
+    }
+
+    // AddTag() refuses duplicates, so drop the current entry (if any) beforehand
+    const bool tagAlreadyPresent = HasTag(tag);
+    if (tagAlreadyPresent) {
+        RemoveTag(tag);
+    }
+
+    return AddTag(tag, values);
+}
+
+/*! \fn template<typename T> bool GetTag(const std::string& tag, T& destination) const
+    \brief Retrieves the numeric value associated with a BAM tag.
+
+    The stored value is read using its own storage type (as given by the tag's
+    type code) and then converted to \a T. Reading through the stored type
+    keeps the sign of narrow signed integers and converts integer tags
+    numerically when \a T is a floating-point type.
+
+    \param[in]  tag         2-character tag name
+    \param[out] destination retrieved value (left untouched on failure)
+    \return \c true if the tag was found and its stored type is convertible to \a T
+*/
+template <typename T>
+inline bool BamAlignment::GetTag(const std::string& tag, T& destination) const
+{
+
+    // skip if alignment is core-only
+    if (SupportData.HasCoreOnly) {
+        // TODO: set error string?
+        return false;
+    }
+
+    // skip if no tags present
+    if (TagData.empty()) {
+        // TODO: set error string?
+        return false;
+    }
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // return failure if tag not found
+    if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
+        // TODO: set error string?
+        return false;
+    }
+
+    // fetch data type (the type code is the byte just before the tag's data)
+    const char type = *(pTagData - 1);
+    if (!TagTypeHelper<T>::CanConvertFrom(type)) {
+        // TODO: set error string ?
+        return false;
+    }
+
+    // read the value through its stored type, then convert to the destination type
+    switch (type) {
+
+        // 1 byte data, unsigned
+        case (Constants::BAM_TAG_TYPE_ASCII):
+        case (Constants::BAM_TAG_TYPE_UINT8): {
+            uint8_t stored = 0;
+            memcpy(&stored, pTagData, sizeof(stored));
+            destination = static_cast<T>(stored);
+            break;
+        }
+
+        // 1 byte data, signed
+        case (Constants::BAM_TAG_TYPE_INT8): {
+            int8_t stored = 0;
+            memcpy(&stored, pTagData, sizeof(stored));
+            destination = static_cast<T>(stored);
+            break;
+        }
+
+        // 2 byte data
+        case (Constants::BAM_TAG_TYPE_INT16): {
+            int16_t stored = 0;
+            memcpy(&stored, pTagData, sizeof(stored));
+            destination = static_cast<T>(stored);
+            break;
+        }
+        case (Constants::BAM_TAG_TYPE_UINT16): {
+            uint16_t stored = 0;
+            memcpy(&stored, pTagData, sizeof(stored));
+            destination = static_cast<T>(stored);
+            break;
+        }
+
+        // 4 byte data
+        case (Constants::BAM_TAG_TYPE_INT32): {
+            int32_t stored = 0;
+            memcpy(&stored, pTagData, sizeof(stored));
+            destination = static_cast<T>(stored);
+            break;
+        }
+        case (Constants::BAM_TAG_TYPE_UINT32): {
+            uint32_t stored = 0;
+            memcpy(&stored, pTagData, sizeof(stored));
+            destination = static_cast<T>(stored);
+            break;
+        }
+        case (Constants::BAM_TAG_TYPE_FLOAT): {
+            float stored = 0;
+            memcpy(&stored, pTagData, sizeof(stored));
+            destination = static_cast<T>(stored);
+            break;
+        }
+
+        // var-length types not supported for numeric destination
+        case (Constants::BAM_TAG_TYPE_STRING):
+        case (Constants::BAM_TAG_TYPE_HEX):
+        case (Constants::BAM_TAG_TYPE_ARRAY):
+            SetErrorString("BamAlignment::GetTag",
+                           "cannot store variable length tag data into a numeric destination");
+            return false;
+
+        // unrecognized tag type
+        default: {
+            const std::string message = std::string("invalid tag type: ") + type;
+            SetErrorString("BamAlignment::GetTag", message);
+            return false;
+        }
+    }
+
+    // return success
+    return true;
+}
+
+// Retrieves the string value associated with a BAM tag.
+//
+// Only tags whose stored type is string-compatible (as reported by
+// TagTypeHelper<std::string>::CanConvertFrom) are accepted; for any other
+// stored type the data is binary and must not be scanned as a C-string.
+//
+// tag         - 2-character tag name
+// destination - receives the characters up to (not including) the first NUL
+// returns true if the tag was found and holds string data
+template <>
+inline bool BamAlignment::GetTag<std::string>(const std::string& tag,
+                                              std::string& destination) const
+{
+    // skip if alignment is core-only
+    if (SupportData.HasCoreOnly) {
+        // TODO: set error string?
+        return false;
+    }
+
+    // skip if no tags present
+    if (TagData.empty()) {
+        // TODO: set error string?
+        return false;
+    }
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // return failure if tag not found
+    if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
+        // TODO: set error string?
+        return false;
+    }
+
+    // make sure the stored data really is a string before treating it as one
+    // (the type code is the byte just before the tag's data)
+    const char type = *(pTagData - 1);
+    if (!TagTypeHelper<std::string>::CanConvertFrom(type)) {
+        // TODO: set error string ?
+        return false;
+    }
+
+    // otherwise copy data into destination
+    destination.assign(pTagData, strlen(pTagData));
+
+    // return success
+    return true;
+}
+
+/*! \fn template<typename T> bool GetTag(const std::string& tag, std::vector<T>& destination) const
+    \brief Retrieves the numeric array associated with a BAM tag.
+
+    Each element is read using the width of the array's stored element type
+    and then converted to \a T, so a stored element type that is narrower than
+    \a T is decoded correctly. The element count is checked against the bytes
+    remaining in the tag data before anything is read.
+
+    \param[in]  tag         2-character tag name
+    \param[out] destination retrieved values (replaces any previous contents)
+    \return \c true if the tag was found, is an array, and its element type is
+            convertible to \a T
+*/
+template <typename T>
+inline bool BamAlignment::GetTag(const std::string& tag, std::vector<T>& destination) const
+{
+
+    // skip if alignment is core-only
+    if (SupportData.HasCoreOnly) {
+        // TODO: set error string?
+        return false;
+    }
+
+    // skip if no tags present
+    if (TagData.empty()) {
+        // TODO: set error string?
+        return false;
+    }
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // return false if tag not found
+    if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
+        // TODO: set error string?
+        return false;
+    }
+
+    // check that tag is array type
+    const char tagType = *(pTagData - 1);
+    if (tagType != Constants::BAM_TAG_TYPE_ARRAY) {
+        SetErrorString("BamAlignment::GetTag", "cannot store a non-array tag in array destination");
+        return false;
+    }
+
+    // fetch element type
+    const char elementType = *pTagData;
+    if (!TagTypeHelper<T>::CanConvertFrom(elementType)) {
+        // TODO: set error string ?
+        return false;
+    }
+    ++pTagData;
+
+    // determine the stored width of each element in tag's array
+    std::size_t elementSize = 0;
+    switch (elementType) {
+        case (Constants::BAM_TAG_TYPE_ASCII):
+        case (Constants::BAM_TAG_TYPE_INT8):
+        case (Constants::BAM_TAG_TYPE_UINT8):
+            elementSize = 1;
+            break;
+
+        case (Constants::BAM_TAG_TYPE_INT16):
+        case (Constants::BAM_TAG_TYPE_UINT16):
+            elementSize = 2;
+            break;
+
+        case (Constants::BAM_TAG_TYPE_INT32):
+        case (Constants::BAM_TAG_TYPE_UINT32):
+        case (Constants::BAM_TAG_TYPE_FLOAT):
+            elementSize = 4;
+            break;
+
+        // var-length types not supported for numeric destination
+        case (Constants::BAM_TAG_TYPE_STRING):
+        case (Constants::BAM_TAG_TYPE_HEX):
+        case (Constants::BAM_TAG_TYPE_ARRAY):
+            SetErrorString("BamAlignment::GetTag",
+                           "invalid array data, variable-length elements are not allowed");
+            return false;
+
+        // unknown tag type
+        default: {
+            const std::string message = std::string("invalid array element type: ") + elementType;
+            SetErrorString("BamAlignment::GetTag", message);
+            return false;
+        }
+    }
+
+    // get number of elements
+    int32_t numElements = 0;
+    memcpy(&numElements, pTagData, sizeof(int32_t));
+    pTagData += sizeof(int32_t);
+
+    // reject counts that the remaining tag bytes cannot satisfy
+    // (division avoids overflow of numElements * elementSize)
+    const char* const tagDataEnd = TagData.data() + TagData.size();
+    if (numElements < 0 || pTagData > tagDataEnd ||
+        static_cast<std::size_t>(tagDataEnd - pTagData) / elementSize <
+            static_cast<std::size_t>(numElements)) {
+        SetErrorString("BamAlignment::GetTag",
+                       "invalid array data, element count exceeds available tag data");
+        return false;
+    }
+
+    destination.clear();
+    destination.reserve(numElements);
+
+    // read in elements, decoding each through its stored type
+    for (int32_t i = 0; i < numElements; ++i, pTagData += elementSize) {
+        T value = T();
+        switch (elementType) {
+            case (Constants::BAM_TAG_TYPE_ASCII):
+            case (Constants::BAM_TAG_TYPE_UINT8): {
+                uint8_t stored = 0;
+                memcpy(&stored, pTagData, sizeof(stored));
+                value = static_cast<T>(stored);
+                break;
+            }
+            case (Constants::BAM_TAG_TYPE_INT8): {
+                int8_t stored = 0;
+                memcpy(&stored, pTagData, sizeof(stored));
+                value = static_cast<T>(stored);
+                break;
+            }
+            case (Constants::BAM_TAG_TYPE_INT16): {
+                int16_t stored = 0;
+                memcpy(&stored, pTagData, sizeof(stored));
+                value = static_cast<T>(stored);
+                break;
+            }
+            case (Constants::BAM_TAG_TYPE_UINT16): {
+                uint16_t stored = 0;
+                memcpy(&stored, pTagData, sizeof(stored));
+                value = static_cast<T>(stored);
+                break;
+            }
+            case (Constants::BAM_TAG_TYPE_INT32): {
+                int32_t stored = 0;
+                memcpy(&stored, pTagData, sizeof(stored));
+                value = static_cast<T>(stored);
+                break;
+            }
+            case (Constants::BAM_TAG_TYPE_UINT32): {
+                uint32_t stored = 0;
+                memcpy(&stored, pTagData, sizeof(stored));
+                value = static_cast<T>(stored);
+                break;
+            }
+            case (Constants::BAM_TAG_TYPE_FLOAT): {
+                float stored = 0;
+                memcpy(&stored, pTagData, sizeof(stored));
+                value = static_cast<T>(stored);
+                break;
+            }
+            default:
+                // not reachable: elementType was validated above
+                break;
+        }
+        destination.push_back(value);
+    }
+
+    // return success
+    return true;
+}
+
+typedef std::vector<BamAlignment> BamAlignmentVector;
+
+} // namespace BamTools
+
+#endif // BAMALIGNMENT_H
diff --git a/src/api/BamAux.h b/src/api/BamAux.h
new file mode 100644
index 0000000..e0f48f9
--- /dev/null
+++ b/src/api/BamAux.h
@@ -0,0 +1,519 @@
+// ***************************************************************************
+// BamAux.h (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides data structures & utility methods that are used throughout the API.
+// ***************************************************************************
+
+#ifndef BAMAUX_H
+#define BAMAUX_H
+
+#include <cstddef>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#include "api/api_global.h"
+
+/*! \file BamAux.h
+
+ Provides data structures & utility methods that are used throughout the API.
+*/
+
+/*! \namespace BamTools
+ \brief Contains all BamTools classes & methods.
+
+ The BamTools API contained in this namespace contains classes and methods
+ for reading, writing, and manipulating BAM alignment files.
+*/
+namespace BamTools {
+
+// ----------------------------------------------------------------
+// CigarOp
+
+/*! \struct BamTools::CigarOp
+ \brief Represents a CIGAR alignment operation.
+
+ A plain value type pairing an operation character with its length.
+
+ \sa \samSpecURL for more details on using CIGAR operations.
+*/
+struct API_EXPORT CigarOp
+{
+
+ char Type; //!< CIGAR operation type (MIDNSHPX=)
+ uint32_t Length; //!< CIGAR operation length (number of bases)
+
+ //! constructor
+ // Both arguments are optional; a default-constructed CigarOp has
+ // Type '\0' and Length 0.
+ CigarOp(const char type = '\0', const uint32_t& length = 0)
+ : Type(type)
+ , Length(length)
+ {}
+};
+
+// ----------------------------------------------------------------
+// RefData
+
+/*! \struct BamTools::RefData
+ \brief Represents a reference sequence entry
+
+ A plain value type pairing a reference name with its length.
+*/
+struct API_EXPORT RefData
+{
+
+ std::string RefName; //!< name of reference sequence
+ int32_t RefLength; //!< length of reference sequence
+
+ //! constructor
+ // Both arguments are optional; a default-constructed RefData has an
+ // empty name and a length of 0.
+ RefData(const std::string& name = std::string(), const int32_t& length = 0)
+ : RefName(name)
+ , RefLength(length)
+ {}
+};
+
+//! convenience typedef for vector of RefData entries
+typedef std::vector<RefData> RefVector;
+
+// ----------------------------------------------------------------
+// BamRegion
+
+/*! \struct BamTools::BamRegion
+ \brief Represents a sequential genomic region
+
+ Allowed to span multiple (sequential) references.
+
+ A value of -1 in any field means "not specified"; this is the state
+ produced by the default constructor and by clear().
+
+ \warning BamRegion now represents a zero-based, HALF-OPEN interval.
+ In previous versions of BamTools (0.x & 1.x) all intervals were treated
+ as zero-based, CLOSED.
+*/
+struct API_EXPORT BamRegion
+{
+
+ int LeftRefID; //!< reference ID for region's left boundary
+ int LeftPosition; //!< position for region's left boundary
+ int RightRefID; //!< reference ID for region's right boundary
+ int RightPosition; //!< position for region's right boundary
+
+ //! constructor
+ // Every argument defaults to -1, i.e. an unspecified boundary.
+ BamRegion(const int& leftID = -1, const int& leftPos = -1, const int& rightID = -1,
+ const int& rightPos = -1)
+ : LeftRefID(leftID)
+ , LeftPosition(leftPos)
+ , RightRefID(rightID)
+ , RightPosition(rightPos)
+ {}
+
+ //! copy constructor
+ // Member-wise copy of the four boundary fields.
+ BamRegion(const BamRegion& other)
+ : LeftRefID(other.LeftRefID)
+ , LeftPosition(other.LeftPosition)
+ , RightRefID(other.RightRefID)
+ , RightPosition(other.RightPosition)
+ {}
+
+ //! Clears region boundaries
+ // Resets all four fields to -1 (unspecified).
+ void clear()
+ {
+ LeftRefID = -1;
+ LeftPosition = -1;
+ RightRefID = -1;
+ RightPosition = -1;
+ }
+
+ //! Returns true if region has a left boundary
+ // Both the reference ID and the position must be non-negative.
+ bool isLeftBoundSpecified() const
+ {
+ return (LeftRefID >= 0 && LeftPosition >= 0);
+ }
+
+ //! Returns true if region boundaries are not defined
+ // True only when neither the left nor the right boundary is specified.
+ bool isNull() const
+ {
+ return (!isLeftBoundSpecified() && !isRightBoundSpecified());
+ }
+
+ //! Returns true if region has a right boundary
+ // NOTE(review): the right position must be >= 1 whereas the left position
+ // only needs to be >= 0 - presumably because the interval is half-open, so
+ // a right position of 0 would describe an empty region; confirm.
+ bool isRightBoundSpecified() const
+ {
+ return (RightRefID >= 0 && RightPosition >= 1);
+ }
+};
+
+// Name/value pair for a user-defined header tag.
+// NOTE(review): presumably used for non-standard SAM header fields; the
+// consumers are not visible in this file - confirm.
+struct CustomHeaderTag
+{
+ std::string TagName; // tag identifier
+ std::string TagValue; // tag contents
+};
+
+// ----------------------------------------------------------------
+// General utility methods
+
+/*! \fn bool FileExists(const std::string& filename)
+    \brief checks whether a file can be opened for reading
+
+    \param[in] filename path of the file to probe
+    \return \c true if opening \a filename for input succeeded
+*/
+API_EXPORT inline bool FileExists(const std::string& filename)
+{
+    // the stream is closed again when 'probe' goes out of scope
+    std::ifstream probe(filename.c_str(), std::ifstream::in);
+    const bool openFailed = probe.fail();
+    return !openFailed;
+}
+
+/*! \fn void SwapEndian_16(int16_t& x)
+    \brief swaps endianness of signed 16-bit integer, in place
+
+    The swap is carried out on the unsigned bit pattern: right-shifting a
+    negative signed value is an arithmetic shift that would smear sign bits
+    into the result.
+
+    \param[in,out] x value whose two bytes are exchanged
+*/
+API_EXPORT inline void SwapEndian_16(int16_t& x)
+{
+    const uint16_t bits = static_cast<uint16_t>(x);
+    const uint16_t swapped =
+        static_cast<uint16_t>(((bits >> 8) & 0x00FFu) | ((bits << 8) & 0xFF00u));
+    x = static_cast<int16_t>(swapped);
+}
+
+/*! \fn void SwapEndian_16(uint16_t& x)
+    \brief swaps endianness of unsigned 16-bit integer, in place
+
+    \param[in,out] x value whose two bytes are exchanged
+*/
+API_EXPORT inline void SwapEndian_16(uint16_t& x)
+{
+    const unsigned int lowByte = x & 0x00FFu;
+    const unsigned int highByte = (x >> 8) & 0x00FFu;
+    x = static_cast<uint16_t>((lowByte << 8) | highByte);
+}
+
+/*! \fn void SwapEndian_32(int32_t& x)
+    \brief swaps endianness of signed 32-bit integer, in place
+
+    The swap is carried out on the unsigned bit pattern: right-shifting a
+    negative signed value is an arithmetic shift that would smear sign bits
+    into the result, and left-shifting into the sign bit overflows.
+
+    \param[in,out] x value whose four bytes are reversed
+*/
+API_EXPORT inline void SwapEndian_32(int32_t& x)
+{
+    const uint32_t bits = static_cast<uint32_t>(x);
+    const uint32_t swapped = (bits >> 24) | ((bits << 8) & 0x00FF0000u) |
+                             ((bits >> 8) & 0x0000FF00u) | (bits << 24);
+    x = static_cast<int32_t>(swapped);
+}
+
+/*! \fn void SwapEndian_32(uint32_t& x)
+    \brief swaps endianness of unsigned 32-bit integer, in place
+
+    \param[in,out] x value whose four bytes are reversed
+*/
+API_EXPORT inline void SwapEndian_32(uint32_t& x)
+{
+    // pick the value apart, least-significant byte first ...
+    const uint32_t byte0 = x & 0xFFu;
+    const uint32_t byte1 = (x >> 8) & 0xFFu;
+    const uint32_t byte2 = (x >> 16) & 0xFFu;
+    const uint32_t byte3 = (x >> 24) & 0xFFu;
+
+    // ... and put it back together in the opposite order
+    x = (byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3;
+}
+
+/*! \fn void SwapEndian_64(int64_t& x)
+    \brief swaps endianness of signed 64-bit integer, in place
+
+    The swap is carried out on the unsigned bit pattern: right-shifting a
+    negative signed value is an arithmetic shift that would smear sign bits
+    into the result, and left-shifting into the sign bit overflows.
+
+    \param[in,out] x value whose eight bytes are reversed
+*/
+API_EXPORT inline void SwapEndian_64(int64_t& x)
+{
+    const uint64_t bits = static_cast<uint64_t>(x);
+    const uint64_t swapped =
+        (bits >> 56) | ((bits << 40) & 0x00FF000000000000ull) |
+        ((bits << 24) & 0x0000FF0000000000ull) | ((bits << 8) & 0x000000FF00000000ull) |
+        ((bits >> 8) & 0x00000000FF000000ull) | ((bits >> 24) & 0x0000000000FF0000ull) |
+        ((bits >> 40) & 0x000000000000FF00ull) | (bits << 56);
+    x = static_cast<int64_t>(swapped);
+}
+
+/*! \fn void SwapEndian_64(uint64_t& x)
+    \brief swaps endianness of unsigned 64-bit integer, in place
+
+    \param[in,out] x value whose eight bytes are reversed
+*/
+API_EXPORT inline void SwapEndian_64(uint64_t& x)
+{
+    // peel bytes off the low end of x and push them onto the low end of the
+    // result; after eight rounds the byte order is reversed
+    uint64_t reversed = 0;
+    for (int byteIndex = 0; byteIndex < 8; ++byteIndex) {
+        reversed = (reversed << 8) | (x & 0xFFu);
+        x >>= 8;
+    }
+    x = reversed;
+}
+
+/*! \fn void SwapEndian_16p(char* data)
+    \brief swaps endianness of the next 2 bytes in a buffer, in place
+
+    The bytes are exchanged directly, so \a data needs no particular
+    alignment and the buffer is never accessed through a wider integer type.
+
+    \param[in,out] data buffer holding at least 2 writable bytes
+*/
+API_EXPORT inline void SwapEndian_16p(char* data)
+{
+    const char first = data[0];
+    data[0] = data[1];
+    data[1] = first;
+}
+
+/*! \fn void SwapEndian_32p(char* data)
+    \brief swaps endianness of the next 4 bytes in a buffer, in place
+
+    The bytes are exchanged directly, so \a data needs no particular
+    alignment and the buffer is never accessed through a wider integer type.
+
+    \param[in,out] data buffer holding at least 4 writable bytes
+*/
+API_EXPORT inline void SwapEndian_32p(char* data)
+{
+    // reverse the 4 bytes by exchanging mirrored positions
+    for (int front = 0, back = 3; front < back; ++front, --back) {
+        const char held = data[front];
+        data[front] = data[back];
+        data[back] = held;
+    }
+}
+
+/*! \fn void SwapEndian_64p(char* data)
+    \brief swaps endianness of the next 8 bytes in a buffer, in place
+
+    The bytes are exchanged directly, so \a data needs no particular
+    alignment and the buffer is never accessed through a wider integer type.
+
+    \param[in,out] data buffer holding at least 8 writable bytes
+*/
+API_EXPORT inline void SwapEndian_64p(char* data)
+{
+    // reverse the 8 bytes by exchanging mirrored positions
+    for (int front = 0, back = 7; front < back; ++front, --back) {
+        const char held = data[front];
+        data[front] = data[back];
+        data[back] = held;
+    }
+}
+
+/*! \fn bool SystemIsBigEndian()
+    \brief checks host architecture's byte order
+
+    Inspects the first byte in memory of the 16-bit value 0x0001.
+
+    \return \c true if system uses big-endian ordering
+*/
+API_EXPORT inline bool SystemIsBigEndian()
+{
+    const uint16_t probe = 0x0001;
+    const char* firstByte = reinterpret_cast<const char*>(&probe);
+
+    // on a big-endian host the most-significant (zero) byte is stored first
+    return (*firstByte == 0);
+}
+
+/*! \fn void PackUnsignedInt(char* buffer, unsigned int value)
+    \brief stores unsigned integer value in a byte buffer
+
+    Four bytes are written, least-significant byte first.
+
+    \param[out] buffer destination buffer (at least 4 bytes)
+    \param[in]  value  value to 'pack' in buffer
+*/
+API_EXPORT inline void PackUnsignedInt(char* buffer, unsigned int value)
+{
+    for (int byteIndex = 0; byteIndex < 4; ++byteIndex) {
+        buffer[byteIndex] = static_cast<char>(value >> (8 * byteIndex));
+    }
+}
+
+/*! \fn void PackUnsignedShort(char* buffer, unsigned short value)
+    \brief stores unsigned short integer value in a byte buffer
+
+    Two bytes are written, least-significant byte first.
+
+    \param[out] buffer destination buffer (at least 2 bytes)
+    \param[in]  value  value to 'pack' in buffer
+*/
+API_EXPORT inline void PackUnsignedShort(char* buffer, unsigned short value)
+{
+    for (int byteIndex = 0; byteIndex < 2; ++byteIndex) {
+        buffer[byteIndex] = static_cast<char>(value >> (8 * byteIndex));
+    }
+}
+
+/*! \fn double UnpackDouble(const char* buffer)
+    \brief reads a double value from byte buffer
+
+    The first 8 bytes of \a buffer are copied, in memory order, into the
+    object representation of the result.
+
+    \param[in] buffer source byte buffer (at least 8 bytes)
+    \return the (double) value read from the buffer
+*/
+API_EXPORT inline double UnpackDouble(const char* buffer)
+{
+    double value = 0;
+    memcpy(&value, buffer, 8);
+    return value;
+}
+
+/*! \fn double UnpackDouble(char* buffer)
+    \brief reads a double value from byte buffer
+
+    This is an overloaded function; it forwards to the \c const overload.
+
+    \param[in] buffer source byte buffer
+    \return the (double) value read from the buffer
+*/
+API_EXPORT inline double UnpackDouble(char* buffer)
+{
+    const char* readOnlyBuffer = buffer;
+    return UnpackDouble(readOnlyBuffer);
+}
+
+/*! \fn float UnpackFloat(const char* buffer)
+    \brief reads a float value from byte buffer
+
+    The first 4 bytes of \a buffer are copied, in memory order, into the
+    object representation of the result.
+
+    \param[in] buffer source byte buffer (at least 4 bytes)
+    \return the (float) value read from the buffer
+*/
+API_EXPORT inline float UnpackFloat(const char* buffer)
+{
+    float value = 0;
+    memcpy(&value, buffer, 4);
+    return value;
+}
+
+/*! \fn float UnpackFloat(char* buffer)
+    \brief reads a float value from byte buffer
+
+    This is an overloaded function; it forwards to the \c const overload.
+
+    \param[in] buffer source byte buffer
+    \return the (float) value read from the buffer
+*/
+API_EXPORT inline float UnpackFloat(char* buffer)
+{
+    const char* readOnlyBuffer = buffer;
+    return UnpackFloat(readOnlyBuffer);
+}
+
+/*! \fn signed int UnpackSignedInt(const char* buffer)
+    \brief reads a signed integer value from byte buffer
+
+    The first 4 bytes of \a buffer are copied, in memory order, into the
+    object representation of the result (no byte-order conversion).
+
+    \param[in] buffer source byte buffer (at least 4 bytes)
+    \return the (signed int) value read from the buffer
+*/
+API_EXPORT inline signed int UnpackSignedInt(const char* buffer)
+{
+    signed int value = 0;
+    memcpy(&value, buffer, 4);
+    return value;
+}
+
+/*! \fn signed int UnpackSignedInt(char* buffer)
+    \brief reads a signed integer value from byte buffer
+
+    This is an overloaded function; it forwards to the \c const overload.
+
+    \param[in] buffer source byte buffer
+    \return the (signed int) value read from the buffer
+*/
+API_EXPORT inline signed int UnpackSignedInt(char* buffer)
+{
+    const char* readOnlyBuffer = buffer;
+    return UnpackSignedInt(readOnlyBuffer);
+}
+
+/*! \fn signed short UnpackSignedShort(const char* buffer)
+    \brief reads a signed short integer value from byte buffer
+
+    The first 2 bytes of \a buffer are copied, in memory order, into the
+    object representation of the result (no byte-order conversion).
+
+    \param[in] buffer source byte buffer (at least 2 bytes)
+    \return the (signed short) value read from the buffer
+*/
+API_EXPORT inline signed short UnpackSignedShort(const char* buffer)
+{
+    signed short value = 0;
+    memcpy(&value, buffer, 2);
+    return value;
+}
+
+/*! \fn signed short UnpackSignedShort(char* buffer)
+    \brief reads a signed short integer value from byte buffer
+
+    This is an overloaded function; it forwards to the \c const overload.
+
+    \param[in] buffer source byte buffer
+    \return the (signed short) value read from the buffer
+*/
+API_EXPORT inline signed short UnpackSignedShort(char* buffer)
+{
+    const char* readOnlyBuffer = buffer;
+    return UnpackSignedShort(readOnlyBuffer);
+}
+
+/*! \fn unsigned int UnpackUnsignedInt(const char* buffer)
+    \brief reads an unsigned integer value from byte buffer
+
+    The first 4 bytes of \a buffer are copied, in memory order, into the
+    object representation of the result (no byte-order conversion).
+
+    \param[in] buffer source byte buffer (at least 4 bytes)
+    \return the (unsigned int) value read from the buffer
+*/
+API_EXPORT inline unsigned int UnpackUnsignedInt(const char* buffer)
+{
+    unsigned int value = 0;
+    memcpy(&value, buffer, 4);
+    return value;
+}
+
+/*! \fn unsigned int UnpackUnsignedInt(char* buffer)
+    \brief reads an unsigned integer value from byte buffer
+
+    This is an overloaded function; it forwards to the \c const overload.
+
+    \param[in] buffer source byte buffer
+    \return the (unsigned int) value read from the buffer
+*/
+API_EXPORT inline unsigned int UnpackUnsignedInt(char* buffer)
+{
+    const char* readOnlyBuffer = buffer;
+    return UnpackUnsignedInt(readOnlyBuffer);
+}
+
+/*! \fn unsigned short UnpackUnsignedShort(const char* buffer)
+    \brief reads an unsigned short integer value from byte buffer
+
+    The two source bytes are arranged according to the host byte order
+    (reversed on big-endian hosts) before being copied into the result, so
+    the buffer is interpreted as least-significant byte first on either kind
+    of host.
+
+    \param[in] buffer source byte buffer (at least 2 bytes)
+    \return the (unsigned short) value read from the buffer
+*/
+API_EXPORT inline unsigned short UnpackUnsignedShort(const char* buffer)
+{
+    unsigned char hostOrdered[2];
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+    hostOrdered[0] = buffer[0];
+    hostOrdered[1] = buffer[1];
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    hostOrdered[0] = buffer[1];
+    hostOrdered[1] = buffer[0];
+#else
+#error "Unsupported hardware"
+#endif
+    unsigned short value = 0;
+    memcpy(&value, hostOrdered, 2);
+    return value;
+}
+
+/*! \fn unsigned short UnpackUnsignedShort(char* buffer)
+    \brief reads an unsigned short integer value from byte buffer
+
+    This is an overloaded function; it forwards to the \c const overload.
+
+    \param[in] buffer source byte buffer
+    \return the (unsigned short) value read from the buffer
+*/
+API_EXPORT inline unsigned short UnpackUnsignedShort(char* buffer)
+{
+    const char* readOnlyBuffer = buffer;
+    return UnpackUnsignedShort(readOnlyBuffer);
+}
+
+// ----------------------------------------------------------------
+// 'internal' helper structs
+
+/*! \struct RaiiBuffer
+    \internal
+
+    Owns a zero-initialized, heap-allocated char array of fixed size and
+    releases it on destruction. Copying is disabled: a copy would share the
+    same array and delete it twice.
+*/
+struct RaiiBuffer
+{
+
+    // data members
+    char* Buffer;                // owned array of NumBytes chars
+    const std::size_t NumBytes;  // size of Buffer, fixed at construction
+
+    // ctor & dtor
+    // allocates n bytes; the trailing "()" value-initializes them to zero
+    RaiiBuffer(const std::size_t n)
+        : Buffer(new char[n]())
+        , NumBytes(n)
+    {}
+
+    ~RaiiBuffer()
+    {
+        delete[] Buffer;
+    }
+
+    // add'l methods
+    // resets every byte of the buffer to zero
+    void Clear()
+    {
+        memset(Buffer, 0, NumBytes);
+    }
+
+private:
+    // non-copyable: declared but intentionally left undefined
+    RaiiBuffer(const RaiiBuffer&);
+    RaiiBuffer& operator=(const RaiiBuffer&);
+};
+
+} // namespace BamTools
+
+#endif // BAMAUX_H
diff --git a/src/api/BamConstants.h b/src/api/BamConstants.h
new file mode 100644
index 0000000..973c13b
--- /dev/null
+++ b/src/api/BamConstants.h
@@ -0,0 +1,323 @@
+// ***************************************************************************
+// BamConstants.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 16 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides basic constants for handling BAM files.
+// ***************************************************************************
+
+#ifndef BAM_CONSTANTS_H
+#define BAM_CONSTANTS_H
+
+#include <cassert>
+#include <string>
+#include "api/api_global.h"
+
+/*! \namespace BamTools::Constants
+ \brief Provides basic constants for handling BAM files.
+*/
+
+namespace BamTools {
+namespace Constants {
+
+// size in bytes used for integer fields
+const uint8_t BAM_SIZEOF_INT = 4;
+
+// header magic number
+const char* const BAM_HEADER_MAGIC = "BAM\1";
+const uint8_t BAM_HEADER_MAGIC_LENGTH = 4;
+
+// BAM alignment core size
+const uint8_t BAM_CORE_SIZE = 32;
+// NOTE(review): 8 * BAM_SIZEOF_INT == BAM_CORE_SIZE, so this is presumably the
+// number of 32-bit words in the alignment core - confirm against the reader.
+const uint8_t BAM_CORE_BUFFER_SIZE = 8;
+
+// BAM alignment flags
+// (distinct single-bit masks, meant to be combined/tested bitwise)
+const int BAM_ALIGNMENT_PAIRED = 0x0001;
+const int BAM_ALIGNMENT_PROPER_PAIR = 0x0002;
+const int BAM_ALIGNMENT_UNMAPPED = 0x0004;
+const int BAM_ALIGNMENT_MATE_UNMAPPED = 0x0008;
+const int BAM_ALIGNMENT_REVERSE_STRAND = 0x0010;
+const int BAM_ALIGNMENT_MATE_REVERSE_STRAND = 0x0020;
+const int BAM_ALIGNMENT_READ_1 = 0x0040;
+const int BAM_ALIGNMENT_READ_2 = 0x0080;
+const int BAM_ALIGNMENT_SECONDARY = 0x0100;
+const int BAM_ALIGNMENT_QC_FAILED = 0x0200;
+const int BAM_ALIGNMENT_DUPLICATE = 0x0400;
+
+// CIGAR constants
+// the numeric codes below are the indices of the matching characters in
+// BAM_CIGAR_LOOKUP
+const char* const BAM_CIGAR_LOOKUP = "MIDNSHP=X";
+const uint8_t BAM_CIGAR_MATCH = 0;
+const uint8_t BAM_CIGAR_INS = 1;
+const uint8_t BAM_CIGAR_DEL = 2;
+const uint8_t BAM_CIGAR_REFSKIP = 3;
+const uint8_t BAM_CIGAR_SOFTCLIP = 4;
+const uint8_t BAM_CIGAR_HARDCLIP = 5;
+const uint8_t BAM_CIGAR_PAD = 6;
+const uint8_t BAM_CIGAR_SEQMATCH = 7;
+const uint8_t BAM_CIGAR_MISMATCH = 8;
+
+const char BAM_CIGAR_MATCH_CHAR = 'M';
+const char BAM_CIGAR_INS_CHAR = 'I';
+const char BAM_CIGAR_DEL_CHAR = 'D';
+const char BAM_CIGAR_REFSKIP_CHAR = 'N';
+const char BAM_CIGAR_SOFTCLIP_CHAR = 'S';
+const char BAM_CIGAR_HARDCLIP_CHAR = 'H';
+const char BAM_CIGAR_PAD_CHAR = 'P';
+const char BAM_CIGAR_SEQMATCH_CHAR = '=';
+const char BAM_CIGAR_MISMATCH_CHAR = 'X';
+
+// BAM_CIGAR_MASK selects the low BAM_CIGAR_SHIFT bits (0x0F)
+// NOTE(review): presumably a packed CIGAR word keeps the op code in those low
+// bits and the length above them - confirm against the decoding code.
+const int BAM_CIGAR_SHIFT = 4;
+const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1);
+
+// BAM tag types & sizes
+const char BAM_TAG_TYPE_ASCII = 'A';
+const char BAM_TAG_TYPE_INT8 = 'c';
+const char BAM_TAG_TYPE_UINT8 = 'C';
+const char BAM_TAG_TYPE_INT16 = 's';
+const char BAM_TAG_TYPE_UINT16 = 'S';
+const char BAM_TAG_TYPE_INT32 = 'i';
+const char BAM_TAG_TYPE_UINT32 = 'I';
+const char BAM_TAG_TYPE_FLOAT = 'f';
+const char BAM_TAG_TYPE_STRING = 'Z';
+const char BAM_TAG_TYPE_HEX = 'H';
+const char BAM_TAG_TYPE_ARRAY = 'B';
+
+// sizes (in bytes) of the tag name, the type code, and the fixed header of
+// an array tag (2-byte name + array code + element code + 4-byte count)
+const uint8_t BAM_TAG_TAGSIZE = 2;
+const uint8_t BAM_TAG_TYPESIZE = 1;
+const uint8_t BAM_TAG_ARRAYBASE_SIZE = 8;
+
+// DNA bases
+// the numeric base codes below are the indices of the matching characters
+// in BAM_DNA_LOOKUP
+const char* const BAM_DNA_LOOKUP = "=ACMGRSVTWYHKDBN";
+const uint8_t BAM_BASECODE_EQUAL = 0;
+const uint8_t BAM_BASECODE_A = 1;
+const uint8_t BAM_BASECODE_C = 2;
+const uint8_t BAM_BASECODE_M = 3;
+const uint8_t BAM_BASECODE_G = 4;
+const uint8_t BAM_BASECODE_R = 5;
+const uint8_t BAM_BASECODE_S = 6;
+const uint8_t BAM_BASECODE_V = 7;
+const uint8_t BAM_BASECODE_T = 8;
+const uint8_t BAM_BASECODE_W = 9;
+const uint8_t BAM_BASECODE_Y = 10;
+const uint8_t BAM_BASECODE_H = 11;
+const uint8_t BAM_BASECODE_K = 12;
+const uint8_t BAM_BASECODE_D = 13;
+const uint8_t BAM_BASECODE_B = 14;
+const uint8_t BAM_BASECODE_N = 15;
+
+const char BAM_DNA_EQUAL = '=';
+const char BAM_DNA_A = 'A';
+const char BAM_DNA_C = 'C';
+const char BAM_DNA_M = 'M';
+const char BAM_DNA_G = 'G';
+const char BAM_DNA_R = 'R';
+const char BAM_DNA_S = 'S';
+const char BAM_DNA_V = 'V';
+const char BAM_DNA_T = 'T';
+const char BAM_DNA_W = 'W';
+const char BAM_DNA_Y = 'Y';
+const char BAM_DNA_H = 'H';
+const char BAM_DNA_K = 'K';
+const char BAM_DNA_D = 'D';
+const char BAM_DNA_B = 'B';
+const char BAM_DNA_N = 'N';
+const char BAM_DNA_DEL = '-';
+const char BAM_DNA_PAD = '*';
+
+// zlib & BGZF constants
+// (139 and 255 are written with an explicit cast because they lie outside
+// the range of a signed 8-bit char)
+const char GZIP_ID1 = 31;
+const char GZIP_ID2 = static_cast<char>(139);
+const char CM_DEFLATE = 8;
+const char FLG_FEXTRA = 4;
+const char OS_UNKNOWN = static_cast<char>(255);
+const char BGZF_XLEN = 6;
+const char BGZF_ID1 = 66;
+const char BGZF_ID2 = 67;
+const char BGZF_LEN = 2;
+
+const int8_t GZIP_WINDOW_BITS = -15;
+const int8_t Z_DEFAULT_MEM_LEVEL = 8;
+const uint8_t BGZF_BLOCK_HEADER_LENGTH = 18;
+const uint8_t BGZF_BLOCK_FOOTER_LENGTH = 8;
+const uint32_t BGZF_MAX_BLOCK_SIZE = 65536;
+const uint32_t BGZF_DEFAULT_BLOCK_SIZE = 65536;
+
+} // namespace Constants
+
+//! \cond
+// -------------------------
+// tag-type helper structs
+// -------------------------
+
+// fail on any types not specified below
+//
+// Primary template: used only for types that have no explicit specialization.
+// Every query trips an assert (when assertions are enabled) and otherwise
+// reports "not convertible" / a zero type code.
+template <typename T>
+struct TagTypeHelper
+{
+ // whether a stored tag with type code c may be read into a T
+ static bool CanConvertFrom(const char)
+ {
+ assert(false);
+ return false;
+ }
+ // whether a T may be stored under tag type code c
+ static bool CanConvertTo(const char)
+ {
+ assert(false);
+ return false;
+ }
+ // tag type code that corresponds to T itself
+ static char TypeCode()
+ {
+ assert(false);
+ return 0;
+ }
+};
+
+// tag-type rules for uint8_t
+template <>
+struct TagTypeHelper<uint8_t>
+{
+    // stored type codes that may be read into a uint8_t
+    static bool CanConvertFrom(const char c)
+    {
+        switch (c) {
+            case Constants::BAM_TAG_TYPE_ASCII:
+            case Constants::BAM_TAG_TYPE_UINT8:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    // type codes under which a uint8_t may be stored
+    static bool CanConvertTo(const char c)
+    {
+        switch (c) {
+            case Constants::BAM_TAG_TYPE_ASCII:
+            case Constants::BAM_TAG_TYPE_UINT8:
+            case Constants::BAM_TAG_TYPE_UINT16:
+            case Constants::BAM_TAG_TYPE_UINT32:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    // type code that corresponds to uint8_t itself
+    static char TypeCode()
+    {
+        return Constants::BAM_TAG_TYPE_UINT8;
+    }
+};
+
+// tag-type rules for int8_t
+template <>
+struct TagTypeHelper<int8_t>
+{
+    // stored type codes that may be read into an int8_t
+    static bool CanConvertFrom(const char c)
+    {
+        switch (c) {
+            case Constants::BAM_TAG_TYPE_ASCII:
+            case Constants::BAM_TAG_TYPE_INT8:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    // type codes under which an int8_t may be stored
+    static bool CanConvertTo(const char c)
+    {
+        switch (c) {
+            case Constants::BAM_TAG_TYPE_ASCII:
+            case Constants::BAM_TAG_TYPE_INT8:
+            case Constants::BAM_TAG_TYPE_INT16:
+            case Constants::BAM_TAG_TYPE_INT32:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    // type code that corresponds to int8_t itself
+    static char TypeCode()
+    {
+        return Constants::BAM_TAG_TYPE_INT8;
+    }
+};
+
+// tag-type rules for uint16_t
+template <>
+struct TagTypeHelper<uint16_t>
+{
+    // stored type codes that may be read into a uint16_t
+    static bool CanConvertFrom(const char c)
+    {
+        switch (c) {
+            case Constants::BAM_TAG_TYPE_ASCII:
+            case Constants::BAM_TAG_TYPE_UINT8:
+            case Constants::BAM_TAG_TYPE_UINT16:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    // type codes under which a uint16_t may be stored
+    static bool CanConvertTo(const char c)
+    {
+        switch (c) {
+            case Constants::BAM_TAG_TYPE_UINT16:
+            case Constants::BAM_TAG_TYPE_UINT32:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    // type code that corresponds to uint16_t itself
+    static char TypeCode()
+    {
+        return Constants::BAM_TAG_TYPE_UINT16;
+    }
+};
+
+// tag-type rules for int16_t
+template <>
+struct TagTypeHelper<int16_t>
+{
+    // stored type codes that may be read into an int16_t
+    static bool CanConvertFrom(const char c)
+    {
+        switch (c) {
+            case Constants::BAM_TAG_TYPE_ASCII:
+            case Constants::BAM_TAG_TYPE_INT8:
+            case Constants::BAM_TAG_TYPE_INT16:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    // type codes under which an int16_t may be stored
+    static bool CanConvertTo(const char c)
+    {
+        switch (c) {
+            case Constants::BAM_TAG_TYPE_INT16:
+            case Constants::BAM_TAG_TYPE_INT32:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    // type code that corresponds to int16_t itself
+    static char TypeCode()
+    {
+        return Constants::BAM_TAG_TYPE_INT16;
+    }
+};
+
+// tag-type rules for uint32_t
+template <>
+struct TagTypeHelper<uint32_t>
+{
+    // stored type codes that may be read into a uint32_t
+    static bool CanConvertFrom(const char c)
+    {
+        switch (c) {
+            case Constants::BAM_TAG_TYPE_ASCII:
+            case Constants::BAM_TAG_TYPE_UINT8:
+            case Constants::BAM_TAG_TYPE_UINT16:
+            case Constants::BAM_TAG_TYPE_UINT32:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    // a uint32_t may only be stored under its own type code
+    static bool CanConvertTo(const char c)
+    {
+        const bool isOwnCode = (c == Constants::BAM_TAG_TYPE_UINT32);
+        return isOwnCode;
+    }
+
+    // type code that corresponds to uint32_t itself
+    static char TypeCode()
+    {
+        return Constants::BAM_TAG_TYPE_UINT32;
+    }
+};
+
+// tag-type rules for int32_t
+template <>
+struct TagTypeHelper<int32_t>
+{
+    // stored type codes that may be read into an int32_t
+    static bool CanConvertFrom(const char c)
+    {
+        switch (c) {
+            case Constants::BAM_TAG_TYPE_ASCII:
+            case Constants::BAM_TAG_TYPE_INT8:
+            case Constants::BAM_TAG_TYPE_INT16:
+            case Constants::BAM_TAG_TYPE_INT32:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    // an int32_t may only be stored under its own type code
+    static bool CanConvertTo(const char c)
+    {
+        const bool isOwnCode = (c == Constants::BAM_TAG_TYPE_INT32);
+        return isOwnCode;
+    }
+
+    // type code that corresponds to int32_t itself
+    static char TypeCode()
+    {
+        return Constants::BAM_TAG_TYPE_INT32;
+    }
+};
+
+// tag-type rules for float
+template <>
+struct TagTypeHelper<float>
+{
+    // stored type codes that may be read into a float: every fixed-width
+    // numeric code plus the single-character code
+    static bool CanConvertFrom(const char c)
+    {
+        switch (c) {
+            case Constants::BAM_TAG_TYPE_ASCII:
+            case Constants::BAM_TAG_TYPE_UINT8:
+            case Constants::BAM_TAG_TYPE_INT8:
+            case Constants::BAM_TAG_TYPE_UINT16:
+            case Constants::BAM_TAG_TYPE_INT16:
+            case Constants::BAM_TAG_TYPE_UINT32:
+            case Constants::BAM_TAG_TYPE_INT32:
+            case Constants::BAM_TAG_TYPE_FLOAT:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    // a float may only be stored under its own type code
+    static bool CanConvertTo(const char c)
+    {
+        const bool isOwnCode = (c == Constants::BAM_TAG_TYPE_FLOAT);
+        return isOwnCode;
+    }
+
+    // type code that corresponds to float itself
+    static char TypeCode()
+    {
+        return Constants::BAM_TAG_TYPE_FLOAT;
+    }
+};
+
+// tag-type rules for std::string
+template <>
+struct TagTypeHelper<std::string>
+{
+    // stored type codes that may be read into a std::string
+    static bool CanConvertFrom(const char c)
+    {
+        switch (c) {
+            case Constants::BAM_TAG_TYPE_HEX:
+            case Constants::BAM_TAG_TYPE_STRING:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    // type codes under which a std::string may be stored
+    static bool CanConvertTo(const char c)
+    {
+        switch (c) {
+            case Constants::BAM_TAG_TYPE_HEX:
+            case Constants::BAM_TAG_TYPE_STRING:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    // type code used for a std::string by default
+    static char TypeCode()
+    {
+        return Constants::BAM_TAG_TYPE_STRING;
+    }
+};
+
+//! \endcond
+
+} // namespace BamTools
+
+#endif // BAM_CONSTANTS_H
diff --git a/src/api/BamIndex.h b/src/api/BamIndex.h
new file mode 100644
index 0000000..ccf3036
--- /dev/null
+++ b/src/api/BamIndex.h
@@ -0,0 +1,98 @@
+// ***************************************************************************
+// BamIndex.h (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides basic BAM index interface
+// ***************************************************************************
+
+#ifndef BAM_INDEX_H
+#define BAM_INDEX_H
+
+#include <string>
+#include "api/BamAux.h"
+#include "api/api_global.h"
+
+namespace BamTools {
+
+namespace Internal {
+class BamReaderPrivate;
+} // namespace Internal
+
+/*! \class BamTools::BamIndex
+ \brief Provides methods for generating & loading BAM index files.
+
+ This class straddles the line between public API and internal
+ implementation detail. Most client code should never have to use this
+ class directly.
+
+ It is exposed to the public API to allow advanced users to implement
+ their own custom indexing schemes.
+*/
+
+// Abstract interface for BAM index implementations. Derived classes supply
+// Create(), HasAlignments(), Jump(), Load() and Type(); this base class only
+// stores the associated reader pointer and the last error message.
+class API_EXPORT BamIndex
+{
+
+ // enums
+public:
+ // list of supported BamIndex types
+ enum IndexType
+ {
+ BAMTOOLS = 0,
+ STANDARD
+ };
+
+ // ctor & dtor
+public:
+ // stores the reader pointer as given; the reader is not owned by the index
+ BamIndex(Internal::BamReaderPrivate* reader)
+ : m_reader(reader)
+ {}
+ // virtual so that derived indexes can be destroyed through a BamIndex*
+ virtual ~BamIndex() {}
+
+ // index interface
+public:
+ // builds index from associated BAM file & writes out to index file
+ virtual bool Create() = 0;
+
+ // returns a human-readable description of the last error encountered
+ // (const, so it can be queried through a const BamIndex as well; the
+ // message itself is recorded by the const SetErrorString() below)
+ std::string GetErrorString() const
+ {
+ return m_errorString;
+ }
+
+ // returns whether reference has alignments or not
+ virtual bool HasAlignments(const int& referenceID) const = 0;
+
+ // attempts to use index data to jump to @region, returns success/fail
+ // a "successful" jump indicates no error, but not whether this region has data
+ // * thus, the method sets a flag to indicate whether there are alignments
+ // available after the jump position
+ virtual bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) = 0;
+
+ // loads existing data from file into memory
+ virtual bool Load(const std::string& filename) = 0;
+
+ // returns the 'type' enum for derived index format
+ virtual BamIndex::IndexType Type() const = 0;
+
+ //! \cond
+
+ // internal methods
+protected:
+ // records the last error as "<where>: <what>"; const (writing to the
+ // mutable m_errorString) so that const methods can report errors too
+ void SetErrorString(const std::string& where, const std::string& what) const
+ {
+ m_errorString = where + ": " + what;
+ }
+
+ // data members
+protected:
+ Internal::BamReaderPrivate* m_reader; // copy, not owned
+ mutable std::string m_errorString; // last error message, see SetErrorString()
+
+ //! \endcond
+};
+
+} // namespace BamTools
+
+#endif // BAM_INDEX_H
diff --git a/src/api/BamMultiReader.cpp b/src/api/BamMultiReader.cpp
new file mode 100644
index 0000000..82a98a5
--- /dev/null
+++ b/src/api/BamMultiReader.cpp
@@ -0,0 +1,442 @@
+// ***************************************************************************
+// BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 14 January 2013 (DB)
+// ---------------------------------------------------------------------------
+// Convenience class for reading multiple BAM files.
+//
+// This functionality allows applications to work on very large sets of files
+// without requiring intermediate merge, sort, and index steps for each file
+// subset. It also improves the performance of our merge system as it
+// precludes the need to sort merged files.
+// ***************************************************************************
+
+#include "api/BamMultiReader.h"
+#include "api/internal/bam/BamMultiReader_p.h"
+using namespace BamTools;
+
+#include <string>
+#include <vector>
+
+/*! \class BamTools::BamMultiReader
+ \brief Convenience class for reading multiple BAM files.
+*/
+/*! \enum BamMultiReader::MergeOrder
+ \brief Used to describe the merge strategy of the BamMultiReader.
+
+ The merge strategy determines which alignment is 'next' from across
+ all opened BAM files.
+*/
+/*! \var BamMultiReader::MergeOrder BamMultiReader::RoundRobinMerge
+ \brief Merge strategy when BAM files are unsorted, or their sorted status is either unknown or ignored
+*/
+/*! \var BamMultiReader::MergeOrder BamMultiReader::MergeByCoordinate
+ \brief Merge strategy when BAM files are sorted by position ('coordinate')
+*/
+/*! \var BamMultiReader::MergeOrder BamMultiReader::MergeByName
+ \brief Merge strategy when BAM files are sorted by read name ('queryname')
+*/
+
+/*! \fn BamMultiReader::BamMultiReader()
+ \brief constructor
+
+ Allocates the private implementation object that every other method
+ forwards to. The object is released in the destructor.
+*/
+BamMultiReader::BamMultiReader()
+ : d(new Internal::BamMultiReaderPrivate)
+{}
+
+/*! \fn BamMultiReader::~BamMultiReader()
+ \brief destructor
+
+ Deletes the private implementation object and clears the pointer to it.
+*/
+BamMultiReader::~BamMultiReader()
+{
+ delete d;
+ d = 0;
+}
+
+/*! \fn bool BamMultiReader::Close()
+ \brief Closes all open BAM files.
+
+ Also clears out all header and reference data.
+
+ \sa CloseFile(), HasOpenReaders(), Open(), BamReader::Close()
+*/
+bool BamMultiReader::Close()
+{
+ // delegates to Internal::BamMultiReaderPrivate::Close(); result returned as-is
+ return d->Close();
+}
+
+/*! \fn bool BamMultiReader::CloseFile(const std::string& filename)
+ \brief Closes requested BAM file.
+
+ Leaves any other file(s) open, along with header and reference data.
+
+ \param[in] filename name of specific BAM file to close
+
+ \sa Close(), HasOpenReaders(), Open(), BamReader::Close()
+*/
+bool BamMultiReader::CloseFile(const std::string& filename)
+{
+ // delegates to Internal::BamMultiReaderPrivate::CloseFile(); result returned as-is
+ return d->CloseFile(filename);
+}
+
+/*! \fn bool BamMultiReader::CreateIndexes(const BamIndex::IndexType& type)
+ \brief Creates index files for the current BAM files.
+
+ \param[in] type file format to create, see BamIndex::IndexType for available formats
+ \return \c true if index files created OK
+ \sa LocateIndexes(), OpenIndexes(), BamReader::CreateIndex()
+*/
+bool BamMultiReader::CreateIndexes(const BamIndex::IndexType& type)
+{
+ // delegates to Internal::BamMultiReaderPrivate::CreateIndexes()
+ return d->CreateIndexes(type);
+}
+
+/*! \fn const std::vector<std::string> BamMultiReader::Filenames() const
+ \brief Returns list of filenames for all open BAM files.
+
+ Retrieved filenames will contain whatever was passed via Open().
+ If you need full directory paths here, be sure to include them
+ when you open the BAM files.
+
+ \returns names of open BAM files. If no files are open, returns an empty vector.
+ \sa HasOpenReaders(), BamReader::GetFilename()
+*/
+const std::vector<std::string> BamMultiReader::Filenames() const
+{
+ // delegates to Internal::BamMultiReaderPrivate::Filenames();
+ // the list is returned by value, so the caller receives its own copy
+ return d->Filenames();
+}
+
+/*! \fn std::string BamMultiReader::GetErrorString() const
+ \brief Returns a human-readable description of the last error that occurred
+
+ This method allows elimination of STDERR pollution. Developers of client code
+ may choose how the messages are displayed to the user, if at all.
+
+ \return error description
+*/
+std::string BamMultiReader::GetErrorString() const
+{
+ // the error text is kept by the private implementation, not by this class
+ return d->GetErrorString();
+}
+
+/*! \fn SamHeader BamMultiReader::GetHeader() const
+ \brief Returns unified SAM-format header for all files
+
+ \note Modifying the retrieved text does NOT affect the current
+ BAM files. These files have been opened in a read-only mode. However,
+ your modified header text can be used in conjunction with BamWriter
+ to generate a new BAM file with the appropriate header information.
+
+ \returns header data wrapped in SamHeader object
+ \sa GetHeaderText(), BamReader::GetHeader()
+*/
+SamHeader BamMultiReader::GetHeader() const
+{
+ // delegates to Internal::BamMultiReaderPrivate::GetHeader();
+ // returned by value, so the caller may modify the result freely
+ return d->GetHeader();
+}
+
+/*! \fn std::string BamMultiReader::GetHeaderText() const
+ \brief Returns unified SAM-format header text for all files
+
+ \note Modifying the retrieved text does NOT affect the current
+ BAM files. These files have been opened in a read-only mode. However,
+ your modified header text can be used in conjunction with BamWriter
+ to generate a new BAM file with the appropriate header information.
+
+ \returns SAM-formatted header text
+ \sa GetHeader(), BamReader::GetHeaderText()
+*/
+std::string BamMultiReader::GetHeaderText() const
+{
+ // delegates to Internal::BamMultiReaderPrivate::GetHeaderText()
+ return d->GetHeaderText();
+}
+
+/*! \fn BamMultiReader::MergeOrder BamMultiReader::GetMergeOrder() const
+ \brief Returns current merge order strategy.
+
+ \returns current merge order enum value
+ \sa BamMultiReader::MergeOrder, SetExplicitMergeOrder()
+*/
+BamMultiReader::MergeOrder BamMultiReader::GetMergeOrder() const
+{
+ // the current merge order is tracked by the private implementation
+ return d->GetMergeOrder();
+}
+
+/*! \fn bool BamMultiReader::GetNextAlignment(BamAlignment& alignment)
+ \brief Retrieves next available alignment.
+
+ Equivalent to BamReader::GetNextAlignment() with respect to what is a valid
+ overlapping alignment and what data gets populated.
+
+ This method takes care of determining which alignment actually is 'next'
+ across multiple files, depending on their sort order.
+
+ \param[out] alignment destination for alignment record data
+ \returns \c true if a valid alignment was found
+ \sa GetNextAlignmentCore(), SetExplicitMergeOrder(), SetRegion(), BamReader::GetNextAlignment()
+*/
+bool BamMultiReader::GetNextAlignment(BamAlignment& alignment)
+{
+ // delegates to Internal::BamMultiReaderPrivate::GetNextAlignment()
+ return d->GetNextAlignment(alignment);
+}
+
+/*! \fn bool BamMultiReader::GetNextAlignmentCore(BamAlignment& alignment)
+ \brief Retrieves next available alignment.
+
+ Equivalent to BamReader::GetNextAlignmentCore() with respect to what is a valid
+ overlapping alignment and what data gets populated.
+
+ This method takes care of determining which alignment actually is 'next'
+ across multiple files, depending on their sort order.
+
+ \param[out] alignment destination for alignment record data
+ \returns \c true if a valid alignment was found
+ \sa GetNextAlignment(), SetExplicitMergeOrder(), SetRegion(), BamReader::GetNextAlignmentCore()
+*/
+bool BamMultiReader::GetNextAlignmentCore(BamAlignment& alignment)
+{
+ // delegates to Internal::BamMultiReaderPrivate::GetNextAlignmentCore()
+ return d->GetNextAlignmentCore(alignment);
+}
+
+/*! \fn int BamMultiReader::GetReferenceCount() const
+ \brief Returns number of reference sequences.
+ \sa BamReader::GetReferenceCount()
+*/
+int BamMultiReader::GetReferenceCount() const
+{
+ // delegates to Internal::BamMultiReaderPrivate::GetReferenceCount()
+ return d->GetReferenceCount();
+}
+
+/*! \fn const RefVector BamMultiReader::GetReferenceData() const
+ \brief Returns all reference sequence entries.
+ \sa RefData, BamReader::GetReferenceData()
+*/
+const BamTools::RefVector BamMultiReader::GetReferenceData() const
+{
+ // returned by value (a copy), unlike BamReader::GetReferenceData(),
+ // which returns a const reference
+ return d->GetReferenceData();
+}
+
+/*! \fn int BamMultiReader::GetReferenceID(const std::string& refName) const
+ \brief Returns the ID of the reference with this name.
+
+ If \a refName is not found, returns -1.
+
+ \param[in] refName name of reference to look up
+ \sa BamReader::GetReferenceID()
+*/
+int BamMultiReader::GetReferenceID(const std::string& refName) const
+{
+ // the name lookup is done by Internal::BamMultiReaderPrivate::GetReferenceID()
+ return d->GetReferenceID(refName);
+}
+
+/*! \fn bool BamMultiReader::HasIndexes() const
+ \brief Returns \c true if all BAM files have index data available.
+ \sa BamReader::HasIndex()
+*/
+bool BamMultiReader::HasIndexes() const
+{
+ // delegates to Internal::BamMultiReaderPrivate::HasIndexes()
+ return d->HasIndexes();
+}
+
+/*! \fn bool BamMultiReader::HasOpenReaders() const
+ \brief Returns \c true if there are any open BAM files.
+*/
+bool BamMultiReader::HasOpenReaders() const
+{
+ // delegates to Internal::BamMultiReaderPrivate::HasOpenReaders()
+ return d->HasOpenReaders();
+}
+
+/*! \fn bool BamMultiReader::Jump(int refID, int position)
+ \brief Performs a random-access jump within current BAM files.
+
+ This is a convenience method, equivalent to calling SetRegion()
+ with only a left boundary specified.
+
+ \param[in] refID ID of reference to jump to
+ \param[in] position (0-based) left boundary
+
+ \returns \c true if jump was successful
+ \sa HasIndexes(), BamReader::Jump()
+*/
+
+bool BamMultiReader::Jump(int refID, int position)
+{
+ // delegates to Internal::BamMultiReaderPrivate::Jump() with the same arguments
+ return d->Jump(refID, position);
+}
+
+/*! \fn bool BamMultiReader::LocateIndexes(const BamIndex::IndexType& preferredType)
+ \brief Looks for index files that match current BAM files.
+
+ Use this function when you need index files, and perhaps have a
+ preferred index format, but do not depend heavily on which indexes
+ actually get loaded at runtime.
+
+ For each BAM file, this function will defer to your \a preferredType
+ whenever possible. However, if an index file of \a preferredType can
+ not be found, then it will look for any other index file that matches
+ that BAM file.
+
+ An example case would look like this:
+ \code
+ BamMultiReader reader;
+
+ // do setup...
+
+ // ensure that all files have an index
+ if ( !reader.LocateIndexes() ) // opens any existing index files that match our BAM files
+ reader.CreateIndexes(); // creates index files for any BAM files that still lack one
+
+ // do interesting stuff using random-access...
+
+ \endcode
+
+ If you want precise control over which index files are loaded, use OpenIndexes()
+ with the desired index filenames. If that function returns false, you can use
+ CreateIndexes() to then build index files of the exact requested format.
+
+ \param[in] preferredType desired index file format, see BamIndex::IndexType for available formats
+ \returns \c true if index files could be found for \b ALL open BAM files
+ \sa BamReader::LocateIndex()
+*/
+bool BamMultiReader::LocateIndexes(const BamIndex::IndexType& preferredType)
+{
+ // the index search is done by Internal::BamMultiReaderPrivate::LocateIndexes()
+ return d->LocateIndexes(preferredType);
+}
+
+/*! \fn bool BamMultiReader::Open(const std::vector<std::string>& filenames)
+ \brief Opens BAM files.
+
+ \note Opening BAM files will invalidate any current region set on the multireader.
+ All file pointers will be returned to the beginning of the alignment data. Follow
+ this with Jump() or SetRegion() to establish a region of interest.
+
+ \param[in] filenames list of BAM filenames to open
+ \returns \c true if BAM files were opened successfully
+ \sa Close(), HasOpenReaders(), OpenFile(), OpenIndexes(), BamReader::Open()
+*/
+bool BamMultiReader::Open(const std::vector<std::string>& filenames)
+{
+ // delegates to Internal::BamMultiReaderPrivate::Open()
+ return d->Open(filenames);
+}
+
+/*! \fn bool BamMultiReader::OpenFile(const std::string& filename)
+ \brief Opens a single BAM file.
+
+ Adds another BAM file to multireader "on-the-fly".
+
+ \note Opening a BAM file will invalidate any current region set on the multireader.
+ All file pointers will be returned to the beginning of the alignment data. Follow
+ this with Jump() or SetRegion() to establish a region of interest.
+
+ \param[in] filename BAM filename to open
+ \returns \c true if BAM file was opened successfully
+ \sa Close(), HasOpenReaders(), Open(), OpenIndexes(), BamReader::Open()
+*/
+bool BamMultiReader::OpenFile(const std::string& filename)
+{
+ // delegates to Internal::BamMultiReaderPrivate::OpenFile()
+ return d->OpenFile(filename);
+}
+
+/*! \fn bool BamMultiReader::OpenIndexes(const std::vector<std::string>& indexFilenames)
+ \brief Opens index files for current BAM files.
+
+ \note Currently assumes that index filenames match the order (and number) of
+ BAM files passed to Open().
+
+ \param[in] indexFilenames list of BAM index file names
+ \returns \c true if BAM index file was opened & data loaded successfully
+ \sa LocateIndexes(), Open(), BamReader::OpenIndex(), BamReader::SetIndex()
+*/
+bool BamMultiReader::OpenIndexes(const std::vector<std::string>& indexFilenames)
+{
+ // delegates to Internal::BamMultiReaderPrivate::OpenIndexes()
+ return d->OpenIndexes(indexFilenames);
+}
+
+/*! \fn bool BamMultiReader::Rewind()
+ \brief Returns the internal file pointers to the beginning of alignment records.
+
+ Useful for performing multiple sequential passes through BAM files.
+ Calling this function clears any prior region that may have been set.
+
+ \returns \c true if rewind operation was successful
+ \sa Jump(), SetRegion(), BamReader::Rewind()
+*/
+bool BamMultiReader::Rewind()
+{
+ // delegates to Internal::BamMultiReaderPrivate::Rewind()
+ return d->Rewind();
+}
+
+/*! \fn bool BamMultiReader::SetExplicitMergeOrder(BamMultiReader::MergeOrder order)
+ \brief Sets an explicit merge order, regardless of the BAM files' SO header tag.
+
+ The default behavior of the BamMultiReader is to check the SO tag in the BAM files'
+ SAM header text to determine the merge strategy. The merge strategy is used to
+ determine from which BAM file the next alignment should come when either
+ GetNextAlignment() or GetNextAlignmentCore() is called. If files share a
+ 'coordinate' or 'queryname' value for this tag, then the merge strategy is
+ selected accordingly. If any of them do not match, or if any file is marked as
+ 'unsorted', then the merge strategy is simply a round-robin.
+
+ This method allows client code to explicitly override the lookup behavior. This
+ method can be useful when you know, for example, that your BAM files are sorted
+ by coordinate but upstream processes did not set the header tag properly.
+
+ \note This method should \b not be called while reading alignments via
+ GetNextAlignment() or GetNextAlignmentCore(). For proper results, you should
+ call this method before (or immediately after) opening files, rewinding,
+ jumping, etc. but \b not once alignment fetching has started. There is
+ nothing in the API to prevent you from doing so, but the results may be
+ unexpected.
+
+ \param[in] order merge strategy to apply, see BamMultiReader::MergeOrder for available values
+ \returns \c true if merge order could be successfully applied
+ \sa BamMultiReader::MergeOrder, GetMergeOrder(), GetNextAlignment(), GetNextAlignmentCore()
+*/
+bool BamMultiReader::SetExplicitMergeOrder(BamMultiReader::MergeOrder order)
+{
+ // delegates to Internal::BamMultiReaderPrivate::SetExplicitMergeOrder()
+ return d->SetExplicitMergeOrder(order);
+}
+
+/*! \fn bool BamMultiReader::SetRegion(const BamRegion& region)
+ \brief Sets a target region of interest
+
+ Equivalent to calling BamReader::SetRegion() on all open BAM files.
+
+ \warning BamRegion now represents a zero-based, HALF-OPEN interval.
+ In previous versions of BamTools (0.x & 1.x) all intervals were treated
+ as zero-based, CLOSED.
+
+ \param[in] region desired region-of-interest to activate
+ \returns \c true if ALL readers set the region successfully
+ \sa HasIndexes(), Jump(), BamReader::SetRegion()
+*/
+bool BamMultiReader::SetRegion(const BamRegion& region)
+{
+ // delegates to Internal::BamMultiReaderPrivate::SetRegion()
+ return d->SetRegion(region);
+}
+
+/*! \fn bool BamMultiReader::SetRegion(const int& leftRefID,
+ const int& leftPosition,
+ const int& rightRefID,
+ const int& rightPosition)
+ \brief Sets a target region of interest
+
+ This is an overloaded function. Equivalent to calling BamReader::SetRegion() on all open BAM files.
+
+ \warning This function now expects a zero-based, HALF-OPEN interval.
+ In previous versions of BamTools (0.x & 1.x) all intervals were treated
+ as zero-based, CLOSED.
+
+ \param[in] leftRefID reference ID of region's left boundary
+ \param[in] leftPosition position of region's left boundary
+ \param[in] rightRefID reference ID of region's right boundary
+ \param[in] rightPosition position of region's right boundary
+
+ \returns \c true if ALL readers set the region successfully
+ \sa HasIndexes(), Jump(), BamReader::SetRegion()
+*/
+bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID,
+ const int& rightPosition)
+{
+ // packs the four boundary values into a temporary BamRegion and forwards it
+ // to the same private-implementation call used by SetRegion(const BamRegion&)
+ return d->SetRegion(BamRegion(leftRefID, leftPosition, rightRefID, rightPosition));
+}
diff --git a/src/api/BamMultiReader.h b/src/api/BamMultiReader.h
new file mode 100644
index 0000000..dca1b1d
--- /dev/null
+++ b/src/api/BamMultiReader.h
@@ -0,0 +1,127 @@
+// ***************************************************************************
+// BamMultiReader.h (c) 2010 Erik Garrison, Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 14 January 2013 (DB)
+// ---------------------------------------------------------------------------
+// Convenience class for reading multiple BAM files.
+// ***************************************************************************
+
+#ifndef BAMMULTIREADER_H
+#define BAMMULTIREADER_H
+
+#include <map>
+#include <sstream>
+#include <string>
+#include <utility>
+#include "api/BamReader.h"
+#include "api/api_global.h"
+
+namespace BamTools {
+
+namespace Internal {
+class BamMultiReaderPrivate;
+} // namespace Internal
+
+// Public interface for reading multiple BAM files through one object. The
+// member functions defined in BamMultiReader.cpp forward to the private
+// implementation object held in 'd'.
+//
+// NOTE(review): 'd' is a raw owning pointer (allocated in the constructor,
+// deleted in the destructor) and no copy constructor or copy assignment
+// operator is declared here, so a copied BamMultiReader would share - and
+// later delete - the same implementation object. Treat instances as
+// non-copyable.
+class API_EXPORT BamMultiReader
+{
+
+ // enums
+public:
+ // possible merge order strategies
+ enum MergeOrder
+ {
+ RoundRobinMerge = 0,
+ MergeByCoordinate,
+ MergeByName
+ };
+
+ // constructor / destructor
+public:
+ BamMultiReader();
+ ~BamMultiReader();
+
+ // public interface
+public:
+ // ----------------------
+ // BAM file operations
+ // ----------------------
+
+ // closes all open BAM files
+ bool Close();
+ // close only the requested BAM file
+ bool CloseFile(const std::string& filename);
+ // returns list of filenames for all open BAM files
+ const std::vector<std::string> Filenames() const;
+ // returns current merge order strategy
+ BamMultiReader::MergeOrder GetMergeOrder() const;
+ // returns true if multireader has any open BAM files
+ bool HasOpenReaders() const;
+ // performs random-access jump within current BAM files
+ bool Jump(int refID, int position = 0);
+ // opens BAM files
+ bool Open(const std::vector<std::string>& filenames);
+ // opens a single BAM file, adding to any other current BAM files
+ bool OpenFile(const std::string& filename);
+ // returns file pointers to beginning of alignments
+ bool Rewind();
+ // sets an explicit merge order, regardless of the BAM files' SO header tag
+ bool SetExplicitMergeOrder(BamMultiReader::MergeOrder order);
+ // sets the target region of interest
+ bool SetRegion(const BamRegion& region);
+ // sets the target region of interest
+ bool SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID,
+ const int& rightPosition);
+
+ // ----------------------
+ // access alignment data
+ // ----------------------
+
+ // retrieves next available alignment
+ bool GetNextAlignment(BamAlignment& alignment);
+ // retrieves next available alignment (without populating the alignment's string data fields)
+ bool GetNextAlignmentCore(BamAlignment& alignment);
+
+ // ----------------------
+ // access auxiliary data
+ // ----------------------
+
+ // returns unified SAM header for all files
+ SamHeader GetHeader() const;
+ // returns unified SAM header text for all files
+ std::string GetHeaderText() const;
+ // returns number of reference sequences
+ int GetReferenceCount() const;
+ // returns all reference sequence entries (by value).
+ const BamTools::RefVector GetReferenceData() const;
+ // returns the ID of the reference with this name.
+ int GetReferenceID(const std::string& refName) const;
+
+ // ----------------------
+ // BAM index operations
+ // ----------------------
+
+ // creates index files for current BAM files
+ bool CreateIndexes(const BamIndex::IndexType& type = BamIndex::STANDARD);
+ // returns true if all BAM files have index data available
+ bool HasIndexes() const;
+ // looks for index files that match current BAM files
+ bool LocateIndexes(const BamIndex::IndexType& preferredType = BamIndex::STANDARD);
+ // opens index files for current BAM files.
+ bool OpenIndexes(const std::vector<std::string>& indexFilenames);
+
+ // ----------------------
+ // error handling
+ // ----------------------
+
+ // returns a human-readable description of the last error that occurred
+ std::string GetErrorString() const;
+
+ // private implementation
+private:
+ // owned: created in the constructor, deleted in the destructor
+ Internal::BamMultiReaderPrivate* d;
+};
+
+} // namespace BamTools
+
+#endif // BAMMULTIREADER_H
diff --git a/src/api/BamReader.cpp b/src/api/BamReader.cpp
new file mode 100644
index 0000000..56e6c39
--- /dev/null
+++ b/src/api/BamReader.cpp
@@ -0,0 +1,402 @@
+// ***************************************************************************
+// BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 29 July 2013 (DB)
+// ---------------------------------------------------------------------------
+// Provides read access to BAM files.
+// ***************************************************************************
+
+#include "api/BamReader.h"
+#include "api/internal/bam/BamReader_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <vector>
+
+/*! \class BamTools::BamReader
+ \brief Provides read access to BAM files.
+*/
+
+/*! \fn BamReader::BamReader()
+ \brief constructor
+
+ Allocates the private implementation object, handing it a pointer back to
+ this reader. The object is released in the destructor.
+*/
+BamReader::BamReader()
+ : d(new BamReaderPrivate(this))
+{}
+
+/*! \fn BamReader::~BamReader()
+ \brief destructor
+
+ Deletes the private implementation object and clears the pointer to it.
+*/
+BamReader::~BamReader()
+{
+ delete d;
+ d = 0;
+}
+
+/*! \fn bool BamReader::Close()
+ \brief Closes the current BAM file.
+
+ Also clears out all header and reference data.
+
+ \return \c true if file closed OK
+ \sa IsOpen(), Open()
+*/
+bool BamReader::Close()
+{
+ // delegates to BamReaderPrivate::Close(); result returned as-is
+ return d->Close();
+}
+
+/*! \fn bool BamReader::CreateIndex(const BamIndex::IndexType& type)
+ \brief Creates an index file for current BAM file.
+
+ \param[in] type file format to create, see BamIndex::IndexType for available formats
+ \return \c true if index created OK
+ \sa LocateIndex(), OpenIndex()
+*/
+bool BamReader::CreateIndex(const BamIndex::IndexType& type)
+{
+ // delegates to BamReaderPrivate::CreateIndex()
+ return d->CreateIndex(type);
+}
+
+/*! \fn const SamHeader& BamReader::GetConstSamHeader() const
+ \brief Returns const reference to SAM header data.
+
+ Allows for read-only queries of SAM header data.
+
+ If you do not need to modify the SAM header, use this method to avoid the
+ potentially expensive copy used by GetHeader().
+
+ \note The header is returned by reference, not as a copy; use GetHeader() if you need a copy to modify.
+ \returns const reference to header data object
+ \sa GetHeader(), GetHeaderText()
+*/
+const SamHeader& BamReader::GetConstSamHeader() const
+{
+ // passes on the reference returned by BamReaderPrivate::GetConstSamHeader();
+ // no SamHeader copy is made here
+ return d->GetConstSamHeader();
+}
+
+/*! \fn std::string BamReader::GetErrorString() const
+ \brief Returns a human-readable description of the last error that occurred
+
+ This method allows elimination of STDERR pollution. Developers of client code
+ may choose how the messages are displayed to the user, if at all.
+
+ \return error description
+*/
+std::string BamReader::GetErrorString() const
+{
+ // the error text is kept by the private implementation, not by this class
+ return d->GetErrorString();
+}
+
+/*! \fn const std::string BamReader::GetFilename() const
+ \brief Returns name of current BAM file.
+
+ Retrieved filename will contain whatever was passed via Open().
+ If you need full directory paths here, be sure to include them
+ when you open the BAM file.
+
+ \returns name of open BAM file. If no file is open, returns an empty string.
+ \sa IsOpen()
+*/
+const std::string BamReader::GetFilename() const
+{
+ // note the differing name on the private side: Filename(), not GetFilename()
+ return d->Filename();
+}
+
+/*! \fn SamHeader BamReader::GetHeader() const
+ \brief Returns SAM header data.
+
+ Header data is wrapped in a SamHeader object that can be conveniently queried and/or modified.
+ If you only need read access, consider using GetConstSamHeader() instead.
+
+ \note Modifying the retrieved SamHeader object does NOT affect the
+ current BAM file. This file has been opened in a read-only mode.
+ However, your modified SamHeader object can be used in conjunction with
+ BamWriter to generate a new BAM file with the appropriate header information.
+
+ \returns header data object
+ \sa GetConstSamHeader(), GetHeaderText()
+*/
+SamHeader BamReader::GetHeader() const
+{
+ // note the differing name on the private side: GetSamHeader();
+ // the header is returned by value, so the caller gets an independent copy
+ return d->GetSamHeader();
+}
+
+/*! \fn std::string BamReader::GetHeaderText() const
+ \brief Returns SAM header data, as SAM-formatted text.
+
+ \note Modifying the retrieved text does NOT affect the current
+ BAM file. This file has been opened in a read-only mode. However,
+ your modified header text can be used in conjunction with BamWriter
+ to generate a new BAM file with the appropriate header information.
+
+ \returns SAM-formatted header text
+ \sa GetHeader()
+*/
+std::string BamReader::GetHeaderText() const
+{
+ // delegates to BamReaderPrivate::GetHeaderText()
+ return d->GetHeaderText();
+}
+
+/*! \fn bool BamReader::GetNextAlignment(BamAlignment& alignment)
+ \brief Retrieves next available alignment.
+
+ Attempts to read the next alignment record from BAM file, and checks to see
+ if it overlaps the current region. If no region is currently set, then the
+ next alignment available is always considered valid.
+
+ If a region has been set, via Jump() or SetRegion(), an alignment is only
+ considered valid if it overlaps the region. If the actual 'next' alignment record
+ in the BAM file does not overlap this region, then this function will read sequentially
+ through the file until the next alignment that overlaps this region is found.
+ Once the region has been exhausted (i.e. the next alignment loaded is beyond the region),
+ the function aborts and returns \c false. In this case, there is no point to continue
+ reading, assuming properly sorted alignments.
+
+ This function fully populates all of the alignment's available data fields,
+ including the string data fields (read name, bases, qualities, tags, filename).
+ If only positional data (refID, position, CIGAR ops, alignment flags, etc.)
+ are required, consider using GetNextAlignmentCore() for a significant
+ performance boost.
+
+ \param[out] alignment destination for alignment record data
+ \returns \c true if a valid alignment was found
+*/
+bool BamReader::GetNextAlignment(BamAlignment& alignment)
+{
+ // delegates to BamReaderPrivate::GetNextAlignment(), which fills 'alignment'
+ return d->GetNextAlignment(alignment);
+}
+
+/*! \fn bool BamReader::GetNextAlignmentCore(BamAlignment& alignment)
+ \brief Retrieves next available alignment, without populating the alignment's string data fields.
+
+ Equivalent to GetNextAlignment() with respect to what is a valid overlapping alignment.
+
+ However, this method does NOT populate the alignment's string data fields
+ (read name, bases, qualities, tags, filename). This provides a boost in speed
+ when these fields are not required for every alignment. These fields, excluding filename,
+ can be populated 'lazily' (as needed) by calling BamAlignment::BuildCharData() later.
+
+ \param[out] alignment destination for alignment record data
+ \returns \c true if a valid alignment was found
+ \sa SetRegion()
+*/
+bool BamReader::GetNextAlignmentCore(BamAlignment& alignment)
+{
+ // delegates to BamReaderPrivate::GetNextAlignmentCore(), which fills 'alignment'
+ return d->GetNextAlignmentCore(alignment);
+}
+
+/*! \fn int BamReader::GetReferenceCount() const
+ \brief Returns number of reference sequences.
+*/
+int BamReader::GetReferenceCount() const
+{
+ // delegates to BamReaderPrivate::GetReferenceCount()
+ return d->GetReferenceCount();
+}
+
+/*! \fn const RefVector& BamReader::GetReferenceData() const
+ \brief Returns all reference sequence entries.
+ \sa RefData
+*/
+const RefVector& BamReader::GetReferenceData() const
+{
+ // passes on the reference returned by BamReaderPrivate::GetReferenceData();
+ // no copy of the vector is made here
+ return d->GetReferenceData();
+}
+
+/*! \fn int BamReader::GetReferenceID(const std::string& refName) const
+ \brief Returns the ID of the reference with this name.
+
+ If \a refName is not found, returns -1.
+
+ \param[in] refName name of reference to look up
+*/
+int BamReader::GetReferenceID(const std::string& refName) const
+{
+ // the name lookup is done by BamReaderPrivate::GetReferenceID()
+ return d->GetReferenceID(refName);
+}
+
+/*! \fn bool BamReader::HasIndex() const
+ \brief Returns \c true if index data is available.
+*/
+bool BamReader::HasIndex() const
+{
+ // delegates to BamReaderPrivate::HasIndex()
+ return d->HasIndex();
+}
+
+/*! \fn bool BamReader::IsOpen() const
+ \brief Returns \c true if a BAM file is open for reading.
+*/
+bool BamReader::IsOpen() const
+{
+ // delegates to BamReaderPrivate::IsOpen()
+ return d->IsOpen();
+}
+
+/*! \fn bool BamReader::Jump(int refID, int position)
+ \brief Performs a random-access jump within BAM file.
+
+ This is a convenience method, equivalent to calling SetRegion()
+ with only a left boundary specified.
+
+ \param[in] refID left-bound reference ID
+ \param[in] position left-bound position
+
+ \returns \c true if jump was successful
+ \sa HasIndex()
+*/
+bool BamReader::Jump(int refID, int position)
+{
+ // a jump is expressed as a region with only the left boundary supplied;
+ // the same private-implementation SetRegion() call does the work
+ return d->SetRegion(BamRegion(refID, position));
+}
+
+/*! \fn bool BamReader::LocateIndex(const BamIndex::IndexType& preferredType)
+ \brief Looks in BAM file's directory for a matching index file.
+
+ Use this function when you need an index file, and perhaps have a
+ preferred index format, but do not depend heavily on which format
+ actually gets loaded at runtime.
+
+ This function will defer to your \a preferredType whenever possible.
+ However, if an index file of \a preferredType can not be found, then
+ it will look for any other index file that corresponds to this BAM file.
+
+ If you want precise control over which index file is loaded, use OpenIndex()
+ with the desired index filename. If that function returns false, you can use
+ CreateIndex() to then build an index of the exact requested format.
+
+ \param[in] preferredType desired index file format, see BamIndex::IndexType for available formats
+
+ \returns \c true if (any) index file could be found
+*/
+bool BamReader::LocateIndex(const BamIndex::IndexType& preferredType)
+{
+ return d->LocateIndex(preferredType);
+}
+
+/*! \fn bool BamReader::Open(const std::string& filename)
+ \brief Opens a BAM file.
+
+ If BamReader is already opened on another file, this function closes
+ that file, then attempts to open requested \a filename.
+
+ \param[in] filename name of BAM file to open
+
+ \returns \c true if BAM file was opened successfully
+ \sa Close(), IsOpen(), OpenIndex()
+*/
+bool BamReader::Open(const std::string& filename)
+{
+ return d->Open(filename);
+}
+
+/*! \fn bool BamReader::OpenIndex(const std::string& indexFilename)
+ \brief Opens a BAM index file.
+
+ \param[in] indexFilename name of BAM index file to open
+
+ \returns \c true if BAM index file was opened & data loaded successfully
+ \sa LocateIndex(), Open(), SetIndex()
+*/
+bool BamReader::OpenIndex(const std::string& indexFilename)
+{
+ return d->OpenIndex(indexFilename);
+}
+
+/*! \fn bool BamReader::Rewind()
+ \brief Returns the internal file pointer to the first alignment record.
+
+ Useful for performing multiple sequential passes through a BAM file.
+ Calling this function clears any prior region that may have been set.
+
+ \note This function sets the file pointer to first alignment record
+ in the BAM file, NOT the beginning of the file.
+
+ \returns \c true if rewind operation was successful
+ \sa Jump(), SetRegion()
+*/
+bool BamReader::Rewind()
+{
+ return d->Rewind();
+}
+
+/*! \fn void BamReader::SetIndex(BamIndex* index)
+ \brief Sets a custom BamIndex on this reader.
+
+ Only necessary for custom BamIndex subclasses. Most clients should
+ never have to use this function.
+
+ Example:
+ \code
+ BamReader reader;
+ reader.SetIndex(new MyCustomBamIndex);
+ \endcode
+
+ \note BamReader takes ownership of \a index - i.e. the BamReader will
+ take care of deleting it when the reader is destructed, when the current
+ BAM file is closed, or when a new index is requested.
+
+ \param[in] index custom BamIndex subclass created by client
+ \sa CreateIndex(), LocateIndex(), OpenIndex()
+*/
+void BamReader::SetIndex(BamIndex* index)
+{
+ d->SetIndex(index);
+}
+
+/*! \fn bool BamReader::SetRegion(const BamRegion& region)
+ \brief Sets a target region of interest
+
+ Requires that index data be available. Attempts a random-access
+ jump in the BAM file, near \a region left boundary position.
+
+ Subsequent calls to GetNextAlignment() or GetNextAlignmentCore()
+ will only return \c true when alignments can be found that overlap
+ this \a region.
+
+ A \a region with no right boundary is considered open-ended, meaning
+ that all alignments that lie downstream of the left boundary are
+ considered valid, continuing to the end of the BAM file.
+
+ \warning BamRegion now represents a zero-based, HALF-OPEN interval.
+ In previous versions of BamTools (0.x & 1.x) all intervals were treated
+ as zero-based, CLOSED.
+
+ \param[in] region desired region-of-interest to activate
+
+ \returns \c true if reader was able to jump successfully to the region's left boundary
+ \sa HasIndex(), Jump()
+*/
+bool BamReader::SetRegion(const BamRegion& region)
+{
+ return d->SetRegion(region);
+}
+
+/*! \fn bool BamReader::SetRegion(const int& leftRefID,
+ const int& leftPosition,
+ const int& rightRefID,
+ const int& rightPosition)
+ \brief Sets a target region of interest.
+
+ This is an overloaded function; the four values are wrapped in a BamRegion.
+
+ \warning This function expects a zero-based, HALF-OPEN interval.
+ In previous versions of BamTools (0.x & 1.x) all intervals were treated
+ as zero-based, CLOSED.
+
+ \param[in] leftRefID reference ID of region's left boundary
+ \param[in] leftPosition position of region's left boundary
+ \param[in] rightRefID reference ID of region's right boundary
+ \param[in] rightPosition position of region's right boundary
+
+ \returns \c true if reader was able to jump successfully to the region's left boundary
+ \sa HasIndex(), Jump()
+*/
+bool BamReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID,
+ const int& rightPosition)
+{
+ return d->SetRegion(BamRegion(leftRefID, leftPosition, rightRefID, rightPosition));
+}
diff --git a/src/api/BamReader.h b/src/api/BamReader.h
new file mode 100644
index 0000000..1991a67
--- /dev/null
+++ b/src/api/BamReader.h
@@ -0,0 +1,117 @@
+// ***************************************************************************
+// BamReader.h (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 18 November 2012 (DB)
+// ---------------------------------------------------------------------------
+// Provides read access to BAM files.
+// ***************************************************************************
+
+#ifndef BAMREADER_H
+#define BAMREADER_H
+
+#include <string>
+#include "api/BamAlignment.h"
+#include "api/BamIndex.h"
+#include "api/SamHeader.h"
+#include "api/api_global.h"
+
+namespace BamTools {
+
+namespace Internal {
+class BamReaderPrivate;
+} // namespace Internal
+
+class API_EXPORT BamReader
+{
+
+ // constructor / destructor
+public:
+ BamReader();
+ ~BamReader();
+
+ // public interface
+public:
+ // ----------------------
+ // BAM file operations
+ // ----------------------
+
+ // closes the current BAM file
+ bool Close();
+ // returns filename of current BAM file
+ const std::string GetFilename() const;
+ // returns true if a BAM file is open for reading
+ bool IsOpen() const;
+ // performs random-access jump within BAM file
+ bool Jump(int refID, int position = 0);
+ // opens a BAM file
+ bool Open(const std::string& filename);
+ // returns internal file pointer to beginning of alignment data
+ bool Rewind();
+ // sets the target region of interest
+ bool SetRegion(const BamRegion& region);
+ // sets the target region of interest
+ bool SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID,
+ const int& rightPosition);
+
+ // ----------------------
+ // access alignment data
+ // ----------------------
+
+ // retrieves next available alignment
+ bool GetNextAlignment(BamAlignment& alignment);
+ // retrieves next available alignment (without populating the alignment's string data fields)
+ bool GetNextAlignmentCore(BamAlignment& alignment);
+
+ // ----------------------
+ // access header data
+ // ----------------------
+
+ // returns a read-only reference to SAM header data
+ const SamHeader& GetConstSamHeader() const;
+ // returns an editable copy of SAM header data
+ SamHeader GetHeader() const;
+ // returns SAM header data, as SAM-formatted text
+ std::string GetHeaderText() const;
+
+ // ----------------------
+ // access reference data
+ // ----------------------
+
+ // returns the number of reference sequences
+ int GetReferenceCount() const;
+ // returns all reference sequence entries
+ const RefVector& GetReferenceData() const;
+ // returns the ID of the reference with this name
+ int GetReferenceID(const std::string& refName) const;
+
+ // ----------------------
+ // BAM index operations
+ // ----------------------
+
+ // creates an index file for current BAM file, using the requested index type
+ bool CreateIndex(const BamIndex::IndexType& type = BamIndex::STANDARD);
+ // returns true if index data is available
+ bool HasIndex() const;
+ // looks in BAM file's directory for a matching index file
+ bool LocateIndex(const BamIndex::IndexType& preferredType = BamIndex::STANDARD);
+ // opens a BAM index file
+ bool OpenIndex(const std::string& indexFilename);
+ // sets a custom BamIndex on this reader
+ void SetIndex(BamIndex* index);
+
+ // ----------------------
+ // error handling
+ // ----------------------
+
+ // returns a human-readable description of the last error that occurred
+ std::string GetErrorString() const;
+
+ // private implementation
+private:
+ Internal::BamReaderPrivate* d;
+};
+
+} // namespace BamTools
+
+#endif // BAMREADER_H
diff --git a/src/api/BamWriter.cpp b/src/api/BamWriter.cpp
new file mode 100644
index 0000000..6f349ff
--- /dev/null
+++ b/src/api/BamWriter.cpp
@@ -0,0 +1,155 @@
+// ***************************************************************************
+// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#include "api/BamWriter.h"
+#include "api/BamAlignment.h"
+#include "api/SamHeader.h"
+#include "api/internal/bam/BamWriter_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+/*! \class BamTools::BamWriter
+ \brief Provides write access for generating BAM files.
+*/
+/*! \enum BamTools::BamWriter::CompressionMode
+ \brief This enum describes the compression behaviors for output BAM files.
+*/
+/*! \var BamWriter::CompressionMode BamWriter::Compressed
+ \brief Use normal BAM compression
+*/
+/*! \var BamWriter::CompressionMode BamWriter::Uncompressed
+ \brief Disable BAM compression
+
+ Useful in situations where the BAM data is streamed (e.g. piping).
+ It would be wasteful to compress, and then immediately decompress
+ the data.
+*/
+
+/*! \fn BamWriter::BamWriter()
+ \brief constructor
+*/
+BamWriter::BamWriter()
+ : d(new BamWriterPrivate)
+{}
+
+/*! \fn BamWriter::~BamWriter()
+ \brief destructor
+*/
+BamWriter::~BamWriter()
+{
+ delete d;
+ d = 0;
+}
+
+/*! \fn void BamWriter::Close()
+ \brief Closes the current BAM file.
+ \sa Open()
+*/
+void BamWriter::Close()
+{
+ d->Close();
+}
+
+/*! \fn std::string BamWriter::GetErrorString() const
+ \brief Returns a human-readable description of the last error that occurred
+
+ This method allows elimination of STDERR pollution. Developers of client code
+ may choose how the messages are displayed to the user, if at all.
+
+ \return error description
+*/
+std::string BamWriter::GetErrorString() const
+{
+ return d->GetErrorString();
+}
+
+/*! \fn bool BamWriter::IsOpen() const
+ \brief Returns \c true if BAM file is open for writing.
+ \sa Open()
+*/
+bool BamWriter::IsOpen() const
+{
+ return d->IsOpen();
+}
+
+/*! \fn bool BamWriter::Open(const std::string& filename,
+ const std::string& samHeaderText,
+ const RefVector& referenceSequences)
+ \brief Opens a BAM file for writing.
+
+ Will overwrite the BAM file if it already exists.
+
+ \param[in] filename name of output BAM file
+ \param[in] samHeaderText header data, as SAM-formatted string
+ \param[in] referenceSequences list of reference entries
+
+ \return \c true if opened successfully
+ \sa Close(), IsOpen(), BamReader::GetHeaderText(), BamReader::GetReferenceData()
+*/
+bool BamWriter::Open(const std::string& filename, const std::string& samHeaderText,
+ const RefVector& referenceSequences)
+{
+ return d->Open(filename, samHeaderText, referenceSequences);
+}
+
+/*! \fn bool BamWriter::Open(const std::string& filename,
+ const SamHeader& samHeader,
+ const RefVector& referenceSequences)
+ \brief Opens a BAM file for writing.
+
+ This is an overloaded function.
+
+ Will overwrite the BAM file if it already exists.
+
+ \param[in] filename name of output BAM file
+ \param[in] samHeader header data, wrapped in SamHeader object
+ \param[in] referenceSequences list of reference entries
+
+ \return \c true if opened successfully
+ \sa Close(), IsOpen(), BamReader::GetHeader(), BamReader::GetReferenceData()
+*/
+bool BamWriter::Open(const std::string& filename, const SamHeader& samHeader,
+ const RefVector& referenceSequences)
+{
+ return d->Open(filename, samHeader.ToString(), referenceSequences);
+}
+
+/*! \fn bool BamWriter::SaveAlignment(const BamAlignment& alignment)
+ \brief Saves an alignment to the BAM file.
+
+ \param[in] alignment BamAlignment record to save
+ \sa BamReader::GetNextAlignment(), BamReader::GetNextAlignmentCore()
+*/
+bool BamWriter::SaveAlignment(const BamAlignment& alignment)
+{
+ return d->SaveAlignment(alignment);
+}
+
+/*! \fn void BamWriter::SetCompressionMode(const BamWriter::CompressionMode& compressionMode)
+ \brief Sets the output compression mode.
+
+ Default mode is BamWriter::Compressed.
+
+ \note Changing the compression mode is disabled on open files (i.e. the request will
+ be ignored). Be sure to call this function before opening the BAM file.
+
+ \code
+ BamWriter writer;
+ writer.SetCompressionMode(BamWriter::Uncompressed);
+ writer.Open( ... );
+ // ...
+ \endcode
+
+ \param[in] compressionMode desired output compression behavior
+ \sa IsOpen(), Open()
+*/
+void BamWriter::SetCompressionMode(const BamWriter::CompressionMode& compressionMode)
+{
+ d->SetWriteCompressed(compressionMode == BamWriter::Compressed);
+}
diff --git a/src/api/BamWriter.h b/src/api/BamWriter.h
new file mode 100644
index 0000000..b4c01b5
--- /dev/null
+++ b/src/api/BamWriter.h
@@ -0,0 +1,70 @@
+// ***************************************************************************
+// BamWriter.h (c) 2009 Michael Strömberg, Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#ifndef BAMWRITER_H
+#define BAMWRITER_H
+
+#include <string>
+#include "api/BamAux.h"
+#include "api/api_global.h"
+
+namespace BamTools {
+
+class BamAlignment;
+struct SamHeader;
+
+//! \cond
+namespace Internal {
+class BamWriterPrivate;
+} // namespace Internal
+//! \endcond
+
+class API_EXPORT BamWriter
+{
+
+ // output compression behaviors (Compressed is the default mode)
+public:
+ enum CompressionMode
+ {
+ Compressed = 0,
+ Uncompressed
+ };
+
+ // ctor allocates the private implementation, dtor deletes it
+public:
+ BamWriter();
+ ~BamWriter();
+
+ // public interface - every call below is forwarded to the private implementation
+public:
+ // closes the current BAM file
+ void Close();
+ // returns a human-readable description of the last error that occurred
+ std::string GetErrorString() const;
+ // returns true if BAM file is open for writing
+ bool IsOpen() const;
+ // opens a BAM file for writing (header supplied as SAM-formatted text)
+ bool Open(const std::string& filename, const std::string& samHeaderText,
+ const RefVector& referenceSequences);
+ // opens a BAM file for writing (header supplied as SamHeader, converted via ToString())
+ bool Open(const std::string& filename, const SamHeader& samHeader,
+ const RefVector& referenceSequences);
+ // saves an alignment to the BAM file
+ bool SaveAlignment(const BamAlignment& alignment);
+ // sets the output compression mode (call before Open(); see SetCompressionMode() docs)
+ void SetCompressionMode(const BamWriter::CompressionMode& compressionMode);
+
+ // private implementation (owned: created in ctor, deleted in dtor)
+private:
+ Internal::BamWriterPrivate* d; // NOTE(review): no copy control declared; a copy would double-delete d
+};
+
+} // namespace BamTools
+
+#endif // BAMWRITER_H
diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt
new file mode 100644
index 0000000..1c2ab9a
--- /dev/null
+++ b/src/api/CMakeLists.txt
@@ -0,0 +1,77 @@
+# ==========================
+# BamTools CMakeLists.txt
+# (c) 2010 Derek Barnett
+#
+# src/api/
+# ==========================
+
+# list include paths
+include_directories( ${BamTools_SOURCE_DIR}/src )
+
+# add compiler definitions
+add_definitions( -DBAMTOOLS_API_LIBRARY ) # (for proper exporting of library symbols)
+
+# fetch all internal source files
+add_subdirectory( internal )
+
+# make list of all API source files
+set( BamToolsAPISources
+ BamAlignment.cpp
+ BamMultiReader.cpp
+ BamReader.cpp
+ BamWriter.cpp
+ SamHeader.cpp
+ SamProgram.cpp
+ SamProgramChain.cpp
+ SamReadGroup.cpp
+ SamReadGroupDictionary.cpp
+ SamSequence.cpp
+ SamSequenceDictionary.cpp
+ ${InternalSources}
+)
+
+# link libraries automatically with zlib (and Winsock2, if applicable)
+find_package( ZLIB REQUIRED )
+
+if( WIN32 )
+ set( WIN32_LIBRARIES wsock32 ws2_32 )
+endif()
+
+# create main BamTools API library
+add_library( BamTools ${BamToolsAPISources} )
+# The SONAME is bumped on every version increment
+# as Bamtools does not yet guarantee a stable ABI
+set_target_properties( BamTools PROPERTIES
+ SOVERSION "${BamTools_VERSION}"
+ OUTPUT_NAME "bamtools" )
+target_include_directories( BamTools PRIVATE "${ZLIB_INCLUDE_DIRS}" )
+target_link_libraries( BamTools PRIVATE "${ZLIB_LIBRARIES}" "${WIN32_LIBRARIES}" )
+install( TARGETS BamTools
+ ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+ LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+ RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" )
+
+# export API headers
+include(../ExportHeader.cmake)
+set(ApiIncludeDir "api")
+ExportHeader(APIHeaders api_global.h ${ApiIncludeDir})
+ExportHeader(APIHeaders BamAlgorithms.h ${ApiIncludeDir})
+ExportHeader(APIHeaders BamAlignment.h ${ApiIncludeDir})
+ExportHeader(APIHeaders BamAux.h ${ApiIncludeDir})
+ExportHeader(APIHeaders BamConstants.h ${ApiIncludeDir})
+ExportHeader(APIHeaders BamIndex.h ${ApiIncludeDir})
+ExportHeader(APIHeaders BamMultiReader.h ${ApiIncludeDir})
+ExportHeader(APIHeaders BamReader.h ${ApiIncludeDir})
+ExportHeader(APIHeaders BamWriter.h ${ApiIncludeDir})
+ExportHeader(APIHeaders IBamIODevice.h ${ApiIncludeDir})
+ExportHeader(APIHeaders SamConstants.h ${ApiIncludeDir})
+ExportHeader(APIHeaders SamHeader.h ${ApiIncludeDir})
+ExportHeader(APIHeaders SamProgram.h ${ApiIncludeDir})
+ExportHeader(APIHeaders SamProgramChain.h ${ApiIncludeDir})
+ExportHeader(APIHeaders SamReadGroup.h ${ApiIncludeDir})
+ExportHeader(APIHeaders SamReadGroupDictionary.h ${ApiIncludeDir})
+ExportHeader(APIHeaders SamSequence.h ${ApiIncludeDir})
+ExportHeader(APIHeaders SamSequenceDictionary.h ${ApiIncludeDir})
+
+set( AlgorithmsIncludeDir "api/algorithms" )
+ExportHeader( AlgorithmsHeaders algorithms/Sort.h ${AlgorithmsIncludeDir} )
diff --git a/src/api/IBamIODevice.h b/src/api/IBamIODevice.h
new file mode 100644
index 0000000..6de8968
--- /dev/null
+++ b/src/api/IBamIODevice.h
@@ -0,0 +1,100 @@
+// ***************************************************************************
+// IBamIODevice.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Base class for all BAM I/O devices (e.g. local file, pipe, HTTP, FTP, etc.)
+//
+// Derived classes should provide protocol-specific implementations for
+// reading/writing plain bytes, as well as other I/O-related behaviors.
+//
+// Since IBamIODevices may be defined in client code, the internal
+// BamExceptions are NOT allowed to be thrown from devices, including the
+// built-in ones. This keeps a consistent interface at the BgzfStream for
+// handling any device type. Use the error string for relaying error messages.
+// ***************************************************************************
+
+#ifndef IBAMIODEVICE_H
+#define IBAMIODEVICE_H
+
+#include <cstdio>
+#include <string>
+#include "api/api_global.h"
+
+namespace BamTools {
+
+class API_EXPORT IBamIODevice
+{
+
+ // enums
+public:
+ enum OpenMode
+ {
+ NotOpen = 0x0000,
+ ReadOnly = 0x0001,
+ WriteOnly = 0x0002,
+ ReadWrite = ReadOnly | WriteOnly
+ };
+
+ // ctor & dtor
+public:
+ virtual ~IBamIODevice() {}
+
+ // IBamIODevice interface
+public:
+ // TODO: add seek(pos, *from*)
+
+ // pure virtuals
+ virtual void Close() = 0;
+ virtual bool IsRandomAccess() const = 0;
+ virtual bool Open(const OpenMode mode) = 0;
+ virtual int64_t Read(char* data, const unsigned int numBytes) = 0;
+ virtual bool Seek(const int64_t& position, const int origin = SEEK_SET) = 0;
+ virtual int64_t Tell() const = 0;
+ virtual int64_t Write(const char* data, const unsigned int numBytes) = 0;
+
+ // default implementation provided
+ virtual std::string GetErrorString();
+ virtual bool IsOpen() const;
+ virtual OpenMode Mode() const;
+
+ // internal methods
+protected:
+ IBamIODevice(); // hidden ctor
+ void SetErrorString(const std::string& where, const std::string& what);
+
+ // data members
+protected:
+ OpenMode m_mode;
+ std::string m_errorString;
+};
+
+inline IBamIODevice::IBamIODevice()
+ : m_mode(IBamIODevice::NotOpen)
+{}
+
+inline std::string IBamIODevice::GetErrorString()
+{
+ return m_errorString;
+}
+
+inline bool IBamIODevice::IsOpen() const
+{
+ return (m_mode != IBamIODevice::NotOpen);
+}
+
+inline IBamIODevice::OpenMode IBamIODevice::Mode() const
+{
+ return m_mode;
+}
+
+inline void IBamIODevice::SetErrorString(const std::string& where, const std::string& what)
+{
+    // stored message has the form "<where>: <what>"
+    m_errorString = where + ": " + what;
+}
+
+} // namespace BamTools
+
+#endif // IBAMIODEVICE_H
diff --git a/src/api/SamConstants.h b/src/api/SamConstants.h
new file mode 100644
index 0000000..6a1a275
--- /dev/null
+++ b/src/api/SamConstants.h
@@ -0,0 +1,97 @@
+// ***************************************************************************
+// SamConstants.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 27 March 2012 (DB)
+// ---------------------------------------------------------------------------
+// Provides constants for SAM header
+// ***************************************************************************
+
+#ifndef SAM_CONSTANTS_H
+#define SAM_CONSTANTS_H
+
+#include <string>
+#include "api/api_global.h"
+
+namespace BamTools {
+namespace Constants {
+
+// basic char constants used in SAM format
+const char SAM_COLON = ':';
+const char SAM_EQUAL = '=';
+const char SAM_PERIOD = '.';
+const char SAM_STAR = '*';
+const char SAM_TAB = '\t';
+const std::string SAM_DIGITS = "0123456789";
+
+const std::string SAM_CURRENT_VERSION = "1.4";
+
+// HD entries
+const std::string SAM_HD_BEGIN_TOKEN = "@HD";
+const std::string SAM_HD_VERSION_TAG = "VN";
+const std::string SAM_HD_SORTORDER_TAG = "SO";
+const std::string SAM_HD_GROUPORDER_TAG = "GO";
+
+// SQ entries
+const std::string SAM_SQ_BEGIN_TOKEN = "@SQ";
+const std::string SAM_SQ_ASSEMBLYID_TAG = "AS";
+const std::string SAM_SQ_CHECKSUM_TAG = "M5";
+const std::string SAM_SQ_LENGTH_TAG = "LN";
+const std::string SAM_SQ_NAME_TAG = "SN";
+const std::string SAM_SQ_SPECIES_TAG = "SP";
+const std::string SAM_SQ_URI_TAG = "UR";
+
+// RG entries
+const std::string SAM_RG_BEGIN_TOKEN = "@RG";
+const std::string SAM_RG_DESCRIPTION_TAG = "DS";
+const std::string SAM_RG_FLOWORDER_TAG = "FO";
+const std::string SAM_RG_ID_TAG = "ID";
+const std::string SAM_RG_KEYSEQUENCE_TAG = "KS";
+const std::string SAM_RG_LIBRARY_TAG = "LB";
+const std::string SAM_RG_PLATFORMUNIT_TAG = "PU";
+const std::string SAM_RG_PREDICTEDINSERTSIZE_TAG = "PI";
+const std::string SAM_RG_PRODUCTIONDATE_TAG = "DT";
+const std::string SAM_RG_PROGRAM_TAG = "PG";
+const std::string SAM_RG_SAMPLE_TAG = "SM";
+const std::string SAM_RG_SEQCENTER_TAG = "CN";
+const std::string SAM_RG_SEQTECHNOLOGY_TAG = "PL";
+
+// PG entries
+const std::string SAM_PG_BEGIN_TOKEN = "@PG";
+const std::string SAM_PG_COMMANDLINE_TAG = "CL";
+const std::string SAM_PG_ID_TAG = "ID";
+const std::string SAM_PG_NAME_TAG = "PN";
+const std::string SAM_PG_PREVIOUSPROGRAM_TAG = "PP";
+const std::string SAM_PG_VERSION_TAG = "VN";
+
+// CO entries
+const std::string SAM_CO_BEGIN_TOKEN = "@CO";
+
+// HD:SO values
+const std::string SAM_HD_SORTORDER_COORDINATE = "coordinate";
+const std::string SAM_HD_SORTORDER_QUERYNAME = "queryname";
+const std::string SAM_HD_SORTORDER_UNKNOWN = "unknown";
+const std::string SAM_HD_SORTORDER_UNSORTED = "unsorted";
+
+// HD:GO values
+const std::string SAM_HD_GROUPORDER_NONE = "none";
+const std::string SAM_HD_GROUPORDER_QUERY = "query";
+const std::string SAM_HD_GROUPORDER_REFERENCE = "reference";
+
+// SQ:LN values
+const unsigned int SAM_SQ_LENGTH_MIN = 1;
+const unsigned int SAM_SQ_LENGTH_MAX = 536870911; // 2^29 - 1
+
+// RG:PL values
+const std::string SAM_RG_SEQTECHNOLOGY_CAPILLARY = "CAPILLARY";
+const std::string SAM_RG_SEQTECHNOLOGY_HELICOS = "HELICOS";
+const std::string SAM_RG_SEQTECHNOLOGY_ILLUMINA = "ILLUMINA";
+const std::string SAM_RG_SEQTECHNOLOGY_IONTORRENT = "IONTORRENT";
+const std::string SAM_RG_SEQTECHNOLOGY_LS454 = "LS454";
+const std::string SAM_RG_SEQTECHNOLOGY_PACBIO = "PACBIO";
+const std::string SAM_RG_SEQTECHNOLOGY_SOLID = "SOLID";
+
+} // namespace Constants
+} // namespace BamTools
+
+#endif // SAM_CONSTANTS_H
diff --git a/src/api/SamHeader.cpp b/src/api/SamHeader.cpp
new file mode 100644
index 0000000..9429e81
--- /dev/null
+++ b/src/api/SamHeader.cpp
@@ -0,0 +1,246 @@
+// ***************************************************************************
+// SamHeader.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides direct read/write access to the SAM header data fields.
+// ***************************************************************************
+
+#include "api/SamHeader.h"
+#include "api/SamConstants.h"
+#include "api/internal/sam/SamFormatParser_p.h"
+#include "api/internal/sam/SamFormatPrinter_p.h"
+#include "api/internal/sam/SamHeaderValidator_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+/*! \struct BamTools::SamHeader
+ \brief Represents the SAM-formatted text header that is part of the BAM file header.
+
+ Provides direct read/write access to the SAM header data fields.
+
+ \sa \samSpecURL
+*/
+/*! \var SamHeader::Version
+ \brief corresponds to \@HD VN:\<Version\>
+
+ Required for valid SAM header, if \@HD record is present.
+*/
+/*! \var SamHeader::SortOrder
+ \brief corresponds to \@HD SO:\<SortOrder\>
+*/
+/*! \var SamHeader::GroupOrder
+ \brief corresponds to \@HD GO:\<GroupOrder\>
+*/
+/*! \var SamHeader::Sequences
+ \brief corresponds to \@SQ entries
+ \sa SamSequence, SamSequenceDictionary
+*/
+/*! \var SamHeader::ReadGroups
+ \brief corresponds to \@RG entries
+ \sa SamReadGroup, SamReadGroupDictionary
+*/
+/*! \var SamHeader::Programs
+ \brief corresponds to \@PG entries
+ \sa SamProgram, SamProgramChain
+*/
+/*! \var SamHeader::Comments
+ \brief corresponds to \@CO entries
+*/
+
+/*! \fn SamHeader::SamHeader(const std::string& headerText = std::string())
+ \brief constructor
+*/
+SamHeader::SamHeader(const std::string& headerText)
+ : SortOrder(Constants::SAM_HD_SORTORDER_UNKNOWN)
+{
+ SetHeaderText(headerText); // NOTE(review): starts with Clear(), which empties SortOrder again
+}
+
+/*! \fn SamHeader::SamHeader(const SamHeader& other)
+ \brief copy constructor
+*/
+SamHeader::SamHeader(const SamHeader& other)
+ : Version(other.Version)
+ , SortOrder(other.SortOrder)
+ , GroupOrder(other.GroupOrder)
+ , CustomTags(other.CustomTags)
+ , Sequences(other.Sequences)
+ , ReadGroups(other.ReadGroups)
+ , Programs(other.Programs)
+ , Comments(other.Comments)
+ , m_errorString(other.GetErrorString())
+{}
+
+/*! \fn SamHeader::~SamHeader()
+ \brief destructor
+*/
+SamHeader::~SamHeader() {}
+
+/*! \fn void SamHeader::Clear()
+ \brief Clears all header contents.
+*/
+void SamHeader::Clear()
+{
+
+ // clear SAM header components
+ Version.clear();
+ SortOrder.clear();
+ GroupOrder.clear();
+ CustomTags.clear();
+ Sequences.Clear();
+ ReadGroups.Clear();
+ Programs.Clear();
+ Comments.clear();
+
+ // clear error string
+ m_errorString.clear();
+}
+
+/*! \fn std::string SamHeader::GetErrorString() const
+ \brief Returns a human-readable description of the last error that occurred
+
+ This method allows elimination of STDERR pollution. Developers of client code
+ may choose how the messages are displayed to the user, if at all.
+
+ \return error description
+*/
+std::string SamHeader::GetErrorString() const
+{
+ return m_errorString;
+}
+
+/*! \fn bool SamHeader::HasError() const
+ \brief Returns \c true if header encountered an error
+*/
+bool SamHeader::HasError() const
+{
+ return (!m_errorString.empty());
+}
+
+/*! \fn bool SamHeader::HasVersion() const
+ \brief Returns \c true if header contains \@HD VN:\<Version\>
+*/
+bool SamHeader::HasVersion() const
+{
+ return (!Version.empty());
+}
+
+/*! \fn bool SamHeader::HasSortOrder() const
+ \brief Returns \c true if header contains \@HD SO:\<SortOrder\>
+*/
+bool SamHeader::HasSortOrder() const
+{
+ return (!SortOrder.empty());
+}
+
+/*! \fn bool SamHeader::HasGroupOrder() const
+ \brief Returns \c true if header contains \@HD GO:\<GroupOrder\>
+*/
+bool SamHeader::HasGroupOrder() const
+{
+ return (!GroupOrder.empty());
+}
+
+/*! \fn bool SamHeader::HasSequences() const
+ \brief Returns \c true if header contains any \@SQ entries
+*/
+bool SamHeader::HasSequences() const
+{
+ return (!Sequences.IsEmpty());
+}
+
+/*! \fn bool SamHeader::HasReadGroups() const
+ \brief Returns \c true if header contains any \@RG entries
+*/
+bool SamHeader::HasReadGroups() const
+{
+ return (!ReadGroups.IsEmpty());
+}
+
+/*! \fn bool SamHeader::HasPrograms() const
+ \brief Returns \c true if header contains any \@PG entries
+*/
+bool SamHeader::HasPrograms() const
+{
+ return (!Programs.IsEmpty());
+}
+
+/*! \fn bool SamHeader::HasComments() const
+ \brief Returns \c true if header contains any \@CO entries
+*/
+bool SamHeader::HasComments() const
+{
+ return (!Comments.empty());
+}
+
+/*! \fn bool SamHeader::IsValid(bool verbose = false) const
+ \brief Checks header contents for required data and proper formatting.
+
+ \param[in] verbose If set to true, validation errors & warnings will be printed to stderr.
+ Otherwise, messages are available through SamHeader::GetErrorString().
+ \return \c true if SAM header is well-formed
+*/
+bool SamHeader::IsValid(bool verbose) const
+{
+
+    SamHeaderValidator validator(*this);
+
+    // a header that passes validation needs no reporting
+    const bool isWellFormed = validator.Validate();
+    if (isWellFormed) return true;
+
+    // verbose: write the validation messages straight to stderr
+    if (verbose) {
+        validator.PrintMessages(std::cerr);
+        return false;
+    }
+
+    // otherwise capture the messages in the local error string,
+    // where callers can read them via GetErrorString()
+    std::stringstream errorStream;
+    validator.PrintMessages(errorStream);
+    m_errorString = errorStream.str();
+
+    return false;
+}
+
+/*! \fn void SamHeader::SetHeaderText(const std::string& headerText)
+ \brief Replaces header contents with \a headerText.
+
+ \param[in] headerText SAM formatted-text that will be parsed into data fields
+*/
+void SamHeader::SetHeaderText(const std::string& headerText)
+{
+
+ // clear prior data
+ Clear();
+
+ try {
+ SamFormatParser parser(*this);
+ parser.Parse(headerText);
+ } catch (BamException& e) {
+
+ // clear anything parsed so far
+ // no telling what's valid and what's partially parsed
+ Clear();
+
+ // set error string
+ m_errorString = e.what();
+ }
+}
+
+/*! \fn std::string SamHeader::ToString() const
+ \brief Converts data fields to SAM-formatted text.
+
+ Applies any local modifications made since creating this object or calling SetHeaderText().
+
+ \return SAM-formatted header text
+*/
+std::string SamHeader::ToString() const
+{
+ SamFormatPrinter printer(*this);
+ return printer.ToString();
+}
diff --git a/src/api/SamHeader.h b/src/api/SamHeader.h
new file mode 100644
index 0000000..23534b1
--- /dev/null
+++ b/src/api/SamHeader.h
@@ -0,0 +1,78 @@
+// ***************************************************************************
+// SamHeader.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides direct read/write access to the SAM header data fields.
+// ***************************************************************************
+
+#ifndef SAM_HEADER_H
+#define SAM_HEADER_H
+
+#include <string>
+#include <vector>
+#include "api/BamAux.h"
+#include "api/SamProgramChain.h"
+#include "api/SamReadGroupDictionary.h"
+#include "api/SamSequenceDictionary.h"
+#include "api/api_global.h"
+
+namespace BamTools {
+
+struct API_EXPORT SamHeader
+{
+
+    // construction & destruction
+    SamHeader(const std::string& headerText = std::string());
+    SamHeader(const SamHeader& other);
+    ~SamHeader();
+
+    // ----------------------------
+    // whole-header operations
+    // ----------------------------
+
+    // clears all header contents
+    void Clear();
+
+    // error reporting
+    std::string GetErrorString() const;
+    bool HasError() const;
+
+    // returns true if SAM header is well-formed
+    bool IsValid(bool verbose = false) const;
+
+    // replaces data fields with contents of SAM-formatted text
+    void SetHeaderText(const std::string& headerText);
+
+    // returns the printable, SAM-formatted header text
+    std::string ToString() const;
+
+    // ----------------------------
+    // convenience query methods
+    // ----------------------------
+
+    bool HasVersion() const;     // true if header contains format version entry
+    bool HasSortOrder() const;   // true if header contains sort order entry
+    bool HasGroupOrder() const;  // true if header contains group order entry
+    bool HasSequences() const;   // true if header contains any sequence entries
+    bool HasReadGroups() const;  // true if header contains any read group entries
+    bool HasPrograms() const;    // true if header contains any program record entries
+    bool HasComments() const;    // true if header contains comments
+
+    // ----------------------------
+    // data members
+    // ----------------------------
+
+    // @HD line
+    std::string Version;                      // VN:<Version> *Required, if @HD record is present*
+    std::string SortOrder;                    // SO:<SortOrder>
+    std::string GroupOrder;                   // GO:<GroupOrder>
+    std::vector<CustomHeaderTag> CustomTags;  // optional custom tags on @HD line
+
+    // @SQ entries
+    SamSequenceDictionary Sequences;
+
+    // @RG entries
+    SamReadGroupDictionary ReadGroups;
+
+    // @PG entries
+    SamProgramChain Programs;
+
+    // @CO entries
+    std::vector<std::string> Comments;
+
+    // internal data; mutable so that const members (e.g. IsValid) can record errors
+private:
+    mutable std::string m_errorString;
+};
+
+} // namespace BamTools
+
+#endif // SAM_HEADER_H
diff --git a/src/api/SamProgram.cpp b/src/api/SamProgram.cpp
new file mode 100644
index 0000000..0c23f11
--- /dev/null
+++ b/src/api/SamProgram.cpp
@@ -0,0 +1,134 @@
+// ***************************************************************************
+// SamProgram.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides direct read/write access to the SAM header program records.
+// ***************************************************************************
+
+#include "api/SamProgram.h"
+using namespace BamTools;
+
+/*! \struct BamTools::SamProgram
+ \brief Represents a SAM program record.
+
+ Provides direct read/write access to the SAM header program records.
+
+ \sa \samSpecURL
+*/
+/*! \var SamProgram::CommandLine
+ \brief corresponds to \@PG CL:\<CommandLine\>
+*/
+/*! \var SamProgram::ID
+ \brief corresponds to \@PG ID:\<ID\>
+
+ Required for valid SAM header.
+*/
+/*! \var SamProgram::Name
+ \brief corresponds to \@PG PN:\<Name\>
+*/
+/*! \var SamProgram::PreviousProgramID
+ \brief corresponds to \@PG PP:\<PreviousProgramID\>
+*/
+/*! \var SamProgram::Version
+ \brief corresponds to \@PG VN:\<Version\>
+*/
+/*! \var SamProgram::NextProgramID
+ \internal
+ Holds ID of the "next" program record in a SamProgramChain
+*/
+
+/*! \fn SamProgram::SamProgram()
+    \brief default constructor; every field starts out empty
+*/
+SamProgram::SamProgram()
+    : CommandLine()
+    , ID()
+    , Name()
+    , PreviousProgramID()
+    , Version()
+    , CustomTags()
+    , NextProgramID()
+{}
+
+/*! \fn SamProgram::SamProgram(const std::string& id)
+    \brief constructs a program record whose ID is \a id; all other fields are empty
+
+    \param id desired program record ID
+*/
+SamProgram::SamProgram(const std::string& id)
+    : CommandLine()
+    , ID(id)
+    , Name()
+    , PreviousProgramID()
+    , Version()
+    , CustomTags()
+    , NextProgramID()
+{}
+
+/*! \fn SamProgram::SamProgram(const SamProgram& other)
+    \brief copy constructor; duplicates every field of \a other, including the
+           internal NextProgramID link
+*/
+SamProgram::SamProgram(const SamProgram& other)
+    : CommandLine(other.CommandLine),
+      ID(other.ID),
+      Name(other.Name),
+      PreviousProgramID(other.PreviousProgramID),
+      Version(other.Version),
+      CustomTags(other.CustomTags),
+      NextProgramID(other.NextProgramID)
+{
+}
+
+/*! \fn SamProgram::~SamProgram()
+    \brief destructor
+*/
+SamProgram::~SamProgram()
+{
+    // nothing to do here: the members clean up after themselves
+}
+
+/*! \fn void SamProgram::Clear()
+    \brief Clears all data fields.
+
+    This covers the standard \@PG fields, any custom tags, and the internal
+    NextProgramID link.
+*/
+void SamProgram::Clear()
+{
+    CommandLine.clear();
+    ID.clear();
+    Name.clear();
+    PreviousProgramID.clear();
+    Version.clear();
+
+    // custom tags are data fields too; leaving them behind would let stale
+    // tags survive a Clear() (SamReadGroup::Clear() clears them as well)
+    CustomTags.clear();
+
+    NextProgramID.clear();
+}
+
+/*! \fn bool SamProgram::HasCommandLine() const
+    \brief Returns \c true if program record contains \@PG CL:\<CommandLine\>
+*/
+bool SamProgram::HasCommandLine() const
+{
+    // an empty string means the CL tag is absent
+    if (CommandLine.empty()) return false;
+    return true;
+}
+
+/*! \fn bool SamProgram::HasID() const
+    \brief Returns \c true if program record contains \@PG ID:\<ID\>
+*/
+bool SamProgram::HasID() const
+{
+    // an empty string means the ID tag is absent
+    if (ID.empty()) return false;
+    return true;
+}
+
+/*! \fn bool SamProgram::HasName() const
+    \brief Returns \c true if program record contains \@PG PN:\<Name\>
+*/
+bool SamProgram::HasName() const
+{
+    // an empty string means the PN tag is absent
+    if (Name.empty()) return false;
+    return true;
+}
+
+/*! \fn bool SamProgram::HasNextProgramID() const
+    \internal
+    \return \c true if the internal NextProgramID link is non-empty, i.e. the
+            program has a "next" record in a SamProgramChain
+*/
+bool SamProgram::HasNextProgramID() const
+{
+    const bool linkMissing = NextProgramID.empty();
+    return !linkMissing;
+}
+
+/*! \fn bool SamProgram::HasPreviousProgramID() const
+    \brief Returns \c true if program record contains \@PG PP:\<PreviousProgramID\>
+*/
+bool SamProgram::HasPreviousProgramID() const
+{
+    const bool tagMissing = PreviousProgramID.empty();
+    return !tagMissing;
+}
+
+/*! \fn bool SamProgram::HasVersion() const
+    \brief Returns \c true if program record contains \@PG VN:\<Version\>
+*/
+bool SamProgram::HasVersion() const
+{
+    const bool tagMissing = Version.empty();
+    return !tagMissing;
+}
diff --git a/src/api/SamProgram.h b/src/api/SamProgram.h
new file mode 100644
index 0000000..b6e3017
--- /dev/null
+++ b/src/api/SamProgram.h
@@ -0,0 +1,66 @@
+// ***************************************************************************
+// SamProgram.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides direct read/write access to the SAM header program records.
+// ***************************************************************************
+
+#ifndef SAM_PROGRAM_H
+#define SAM_PROGRAM_H
+
+#include <string>
+#include "api/BamAux.h"
+#include "api/api_global.h"
+
+namespace BamTools {
+
+class SamProgramChain;
+
+struct API_EXPORT SamProgram
+{
+
+    // construction & destruction
+    SamProgram();
+    SamProgram(const std::string& id);
+    SamProgram(const SamProgram& other);
+    ~SamProgram();
+
+    // clears the record's data fields
+    void Clear();
+
+    // presence checks: each returns true when the matching field is non-empty
+    bool HasCommandLine() const;        // command line entry
+    bool HasID() const;                 // ID
+    bool HasName() const;               // name
+    bool HasPreviousProgramID() const;  // 'previous program ID'
+    bool HasVersion() const;            // version
+
+    // @PG fields
+    std::string CommandLine;                  // CL:<CommandLine>
+    std::string ID;                           // ID:<ID> *Required for valid SAM header*
+    std::string Name;                         // PN:<Name>
+    std::string PreviousProgramID;            // PP:<PreviousProgramID>
+    std::string Version;                      // VN:<Version>
+    std::vector<CustomHeaderTag> CustomTags;  // optional custom tags
+
+    // internal (non-standard) chain bookkeeping, reachable only by the
+    // befriended SamProgramChain
+private:
+    bool HasNextProgramID() const;
+    std::string NextProgramID;
+    friend class BamTools::SamProgramChain;
+};
+
+/*! \fn bool operator==(const SamProgram& lhs, const SamProgram& rhs)
+    \brief two program records compare equal when their IDs match; no other
+           field takes part in the comparison
+*/
+API_EXPORT inline bool operator==(const SamProgram& lhs, const SamProgram& rhs)
+{
+    const std::string& leftId = lhs.ID;
+    const std::string& rightId = rhs.ID;
+    return leftId == rightId;
+}
+} // namespace BamTools
+
+#endif // SAM_PROGRAM_H
diff --git a/src/api/SamProgramChain.cpp b/src/api/SamProgramChain.cpp
new file mode 100644
index 0000000..d796def
--- /dev/null
+++ b/src/api/SamProgramChain.cpp
@@ -0,0 +1,363 @@
+// ***************************************************************************
+// SamProgramChain.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides methods for operating on a SamProgram record "chain"
+// ***************************************************************************
+
+#include "api/SamProgramChain.h"
+using namespace BamTools;
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+
+/*! \class BamTools::SamProgramChain
+    \brief Container ("chain") of SamProgram records.
+
+    Provides methods for operating on a collection of SamProgram records.
+
+    \note Underlying container is *NOT* ordered by linkage, but by order of
+          appearance in SamHeader and subsequent Add() calls. Using the current
+          iterators will not allow you to step through the header's program history.
+          Instead use First()/Last() to access oldest/newest records, respectively.
+*/
+
+/*! \fn SamProgramChain::SamProgramChain()
+    \brief constructor; creates an empty chain
+*/
+SamProgramChain::SamProgramChain()
+    : m_data()
+{}
+
+/*! \fn SamProgramChain::SamProgramChain(const SamProgramChain& other)
+    \brief copy constructor; copies the records stored in \a other
+*/
+SamProgramChain::SamProgramChain(const SamProgramChain& other)
+    : m_data(other.m_data)
+{
+    // the record container is the only state to copy
+}
+
+/*! \fn SamProgramChain::~SamProgramChain()
+    \brief destructor
+*/
+SamProgramChain::~SamProgramChain()
+{
+    // nothing to do here: m_data cleans up after itself
+}
+
+/*! \fn void SamProgramChain::Add(SamProgram& program)
+    \brief Appends a program to program chain.
+
+    Duplicate entries (matching on ID) are silently discarded.
+
+    The internal "next" links are maintained in both directions: \a program is
+    linked to an already-stored record that names it as predecessor, and an
+    already-stored predecessor of \a program is linked to \a program. The
+    latter is what lets Last() find the newest record when records are added
+    oldest-first.
+
+    \note Underlying container is *NOT* ordered by linkage, but by order of
+          appearance in SamHeader and subsequent Add() calls. Using the current
+          iterators will not allow you to step through the header's program history.
+          Instead use First()/Last() to access oldest/newest records, respectively.
+
+    \param[in,out] program entry to be appended; its internal NextProgramID
+                           link may be updated before it is copied into the chain
+*/
+void SamProgramChain::Add(SamProgram& program)
+{
+
+    // ignore duplicated records
+    if (Contains(program)) return;
+
+    if (!IsEmpty()) {
+
+        // forward link: another stored record may already name @program as its
+        // predecessor (its PP matches @program's ID)
+        program.NextProgramID = NextIdFor(program.ID);
+
+        // backward link: if @program's predecessor is already stored, record
+        // @program as its successor; an existing link is left untouched
+        if (program.HasPreviousProgramID()) {
+            const int previousIndex = IndexOf(program.PreviousProgramID);
+            if (previousIndex != static_cast<int>(m_data.size())) {
+                SamProgram& previous = m_data[previousIndex];
+                if (!previous.HasNextProgramID()) previous.NextProgramID = program.ID;
+            }
+        }
+    }
+
+    // store program record
+    m_data.push_back(program);
+}
+
+/*! \fn void SamProgramChain::Add(std::vector<SamProgram>& programs)
+    \brief Appends a batch of programs to the end of the chain.
+
+    This is an overloaded function. Each element is passed, in order, to the
+    single-record Add().
+
+    \param[in] programs batch of program records to append
+    \sa Add()
+*/
+void SamProgramChain::Add(std::vector<SamProgram>& programs)
+{
+    typedef std::vector<SamProgram>::size_type Index;
+    const Index count = programs.size();
+    for (Index i = 0; i < count; ++i) {
+        Add(programs[i]);
+    }
+}
+
+/*! \fn SamProgramIterator SamProgramChain::Begin()
+    \return an STL iterator pointing to the first stored program record
+    \sa ConstBegin(), End(), First()
+*/
+SamProgramIterator SamProgramChain::Begin()
+{
+    SamProgramIterator firstRecord = m_data.begin();
+    return firstRecord;
+}
+
+/*! \fn SamProgramConstIterator SamProgramChain::Begin() const
+    \return an STL const_iterator pointing to the first stored program record
+
+    This is an overloaded function.
+
+    \sa ConstBegin(), End(), First()
+*/
+SamProgramConstIterator SamProgramChain::Begin() const
+{
+    SamProgramConstIterator firstRecord = m_data.begin();
+    return firstRecord;
+}
+
+/*! \fn void SamProgramChain::Clear()
+    \brief Removes every program record from the chain.
+*/
+void SamProgramChain::Clear()
+{
+    // the record container is the chain's only state
+    m_data.clear();
+}
+
+/*! \fn SamProgramConstIterator SamProgramChain::ConstBegin() const
+    \return an STL const_iterator pointing to the first stored program record
+    \sa Begin(), ConstEnd(), First()
+*/
+SamProgramConstIterator SamProgramChain::ConstBegin() const
+{
+    SamProgramConstIterator firstRecord = m_data.begin();
+    return firstRecord;
+}
+
+/*! \fn SamProgramConstIterator SamProgramChain::ConstEnd() const
+    \return an STL const_iterator pointing to the imaginary entry after the last stored program record
+    \sa ConstBegin(), End(), Last()
+*/
+SamProgramConstIterator SamProgramChain::ConstEnd() const
+{
+    SamProgramConstIterator pastLastRecord = m_data.end();
+    return pastLastRecord;
+}
+
+/*! \fn bool SamProgramChain::Contains(const SamProgram& program) const
+    \brief Returns true if chain has this program record (matching on ID).
+
+    This is an overloaded function. Only \a program's ID is used for the lookup.
+
+    \param[in] program SamProgram to search for
+    \return \c true if chain contains program (matching on ID)
+*/
+bool SamProgramChain::Contains(const SamProgram& program) const
+{
+    const std::string& wantedId = program.ID;
+    return Contains(wantedId);
+}
+
+/*! \fn bool SamProgramChain::Contains(const std::string& programId) const
+    \brief Returns true if chain has a program record with this ID
+
+    \param[in] programId search for program matching this ID
+    \return \c true if chain contains a program record with this ID
+*/
+bool SamProgramChain::Contains(const std::string& programId) const
+{
+    // IndexOf() reports "not found" as the container size
+    const int notFound = static_cast<int>(m_data.size());
+    const int position = IndexOf(programId);
+    return position != notFound;
+}
+
+/*! \fn SamProgramIterator SamProgramChain::End()
+    \return an STL iterator pointing to the imaginary entry after the last stored program record
+    \sa Begin(), ConstEnd(), Last()
+*/
+SamProgramIterator SamProgramChain::End()
+{
+    SamProgramIterator pastLastRecord = m_data.end();
+    return pastLastRecord;
+}
+
+/*! \fn SamProgramConstIterator SamProgramChain::End() const
+    \return an STL const_iterator pointing to the imaginary entry after the last stored program record
+
+    This is an overloaded function.
+
+    \sa Begin(), ConstEnd(), Last()
+*/
+SamProgramConstIterator SamProgramChain::End() const
+{
+    SamProgramConstIterator pastLastRecord = m_data.end();
+    return pastLastRecord;
+}
+
+/*! \fn SamProgram& SamProgramChain::First()
+    \brief Fetches first (oldest) record in the chain.
+
+    The first stored record without a PreviousProgramID is returned.
+
+    \warning If no such record exists (e.g. the chain is empty), this function
+             prints an error and terminates the program. If this is possible,
+             check the result of IsEmpty() before calling this function.
+
+    \return a modifiable reference to the first (oldest) program entry
+    \sa Begin(), Last()
+*/
+SamProgram& SamProgramChain::First()
+{
+
+    // a record without a PP tag has no predecessor, so it starts the chain
+    typedef SamProgramContainer::size_type Index;
+    for (Index i = 0; i < m_data.size(); ++i) {
+        if (m_data[i].HasPreviousProgramID()) continue;
+        return m_data[i];
+    }
+
+    // every record (if any) names a predecessor: give up
+    std::cerr << "SamProgramChain::First: could not find any record without a PP tag" << std::endl;
+    std::exit(EXIT_FAILURE);
+}
+
+/*! \fn const SamProgram& SamProgramChain::First() const
+    \brief Fetches first (oldest) record in the chain.
+
+    This is an overloaded function. The first stored record without a
+    PreviousProgramID is returned.
+
+    \warning If no such record exists (e.g. the chain is empty), this function
+             prints an error and terminates the program. If this is possible,
+             check the result of IsEmpty() before calling this function.
+
+    \return a read-only reference to the first (oldest) program entry
+    \sa Begin(), ConstBegin(), Last()
+*/
+const SamProgram& SamProgramChain::First() const
+{
+
+    // a record without a PP tag has no predecessor, so it starts the chain
+    typedef SamProgramContainer::size_type Index;
+    for (Index i = 0; i < m_data.size(); ++i) {
+        if (m_data[i].HasPreviousProgramID()) continue;
+        return m_data[i];
+    }
+
+    // every record (if any) names a predecessor: give up
+    std::cerr << "SamProgramChain::First: could not find any record without a PP tag" << std::endl;
+    std::exit(EXIT_FAILURE);
+}
+
+/*! \fn int SamProgramChain::IndexOf(const std::string& programId) const
+    \internal
+    \return index of the first program record whose ID equals \a programId.
+            Otherwise, returns vector::size() (invalid index).
+*/
+int SamProgramChain::IndexOf(const std::string& programId) const
+{
+    // count positions explicitly rather than calling an unqualified distance(),
+    // which only resolves through argument-dependent lookup on the iterator type
+    int index = 0;
+    SamProgramConstIterator iter = ConstBegin();
+    const SamProgramConstIterator end = ConstEnd();
+    for (; iter != end; ++iter, ++index) {
+        if (iter->ID == programId) break;
+    }
+
+    // equals the container size when no record matched
+    return index;
+}
+
+/*! \fn bool SamProgramChain::IsEmpty() const
+    \brief Returns \c true if chain contains no records
+    \sa Size()
+*/
+bool SamProgramChain::IsEmpty() const
+{
+    const bool hasNoRecords = m_data.empty();
+    return hasNoRecords;
+}
+
+/*! \fn SamProgram& SamProgramChain::Last()
+    \brief Fetches last (newest) record in the chain.
+
+    The first stored record without an internal NextProgramID link is returned.
+
+    \warning If no such record exists (e.g. the chain is empty), this function
+             prints an error and terminates the program. If this is possible,
+             check the result of IsEmpty() before calling this function.
+
+    \return a modifiable reference to the last (newest) program entry
+    \sa End(), First()
+*/
+SamProgram& SamProgramChain::Last()
+{
+    // a record without a "next" link has no successor, so it ends the chain
+    typedef SamProgramContainer::size_type Index;
+    for (Index i = 0; i < m_data.size(); ++i) {
+        if (m_data[i].HasNextProgramID()) continue;
+        return m_data[i];
+    }
+
+    // every record (if any) has a successor: give up
+    std::cerr << "SamProgramChain::Last: could not determine last record" << std::endl;
+    std::exit(EXIT_FAILURE);
+}
+
+/*! \fn const SamProgram& SamProgramChain::Last() const
+    \brief Fetches last (newest) record in the chain.
+
+    This is an overloaded function. The first stored record without an
+    internal NextProgramID link is returned.
+
+    \warning If no such record exists (e.g. the chain is empty), this function
+             prints an error and terminates the program. If this is possible,
+             check the result of IsEmpty() before calling this function.
+
+    \return a read-only reference to the last (newest) program entry
+    \sa End(), ConstEnd(), First()
+*/
+const SamProgram& SamProgramChain::Last() const
+{
+    // a record without a "next" link has no successor, so it ends the chain
+    typedef SamProgramContainer::size_type Index;
+    for (Index i = 0; i < m_data.size(); ++i) {
+        if (m_data[i].HasNextProgramID()) continue;
+        return m_data[i];
+    }
+
+    // every record (if any) has a successor: give up
+    std::cerr << "SamProgramChain::Last: could not determine last record" << std::endl;
+    std::exit(EXIT_FAILURE);
+}
+
+/*! \fn const std::string SamProgramChain::NextIdFor(const std::string& programId) const
+    \internal
+
+    \return ID of the first stored program record whose PreviousProgramID
+            matches \a programId. Otherwise, returns empty string if none found.
+*/
+const std::string SamProgramChain::NextIdFor(const std::string& programId) const
+{
+
+    // find first record in container whose PreviousProgramID matches @programId
+    SamProgramConstIterator iter = ConstBegin();
+    SamProgramConstIterator end = ConstEnd();
+    for (; iter != end; ++iter) {
+        const SamProgram& current = (*iter);
+
+        // only records that actually carry a PP tag can name a predecessor;
+        // requiring the tag to be absent would make the comparison match
+        // nothing but an empty @programId
+        if (current.HasPreviousProgramID() && current.PreviousProgramID == programId) {
+            return current.ID;
+        }
+    }
+
+    // none found
+    return std::string();
+}
+
+/*! \fn int SamProgramChain::Size() const
+    \brief Returns number of program records in the chain.
+    \sa IsEmpty()
+*/
+int SamProgramChain::Size() const
+{
+    // the container's size is narrowed to the int this interface returns
+    return static_cast<int>(m_data.size());
+}
+
+/*! \fn SamProgram& SamProgramChain::operator[](const std::string& programId)
+    \brief Retrieves the modifiable SamProgram record that matches \a programId.
+
+    \warning If the chain contains no program record matching this ID, this function
+             will print an error and terminate. Check the return value of Contains()
+             if this may be possible.
+
+    \param[in] programId ID of program record to retrieve
+    \return a modifiable reference to the SamProgram associated with the ID
+*/
+SamProgram& SamProgramChain::operator[](const std::string& programId)
+{
+
+    // IndexOf() reports "not found" as the container size
+    const int position = IndexOf(programId);
+    const int notFound = static_cast<int>(m_data.size());
+
+    // unknown ID: report and bail out
+    if (position == notFound) {
+        std::cerr << "SamProgramChain::operator[] - unknown programId: " << programId << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+
+    // known ID: hand back the stored record
+    SamProgram& match = m_data.at(position);
+    return match;
+}
diff --git a/src/api/SamProgramChain.h b/src/api/SamProgramChain.h
new file mode 100644
index 0000000..9e61857
--- /dev/null
+++ b/src/api/SamProgramChain.h
@@ -0,0 +1,86 @@
+// ***************************************************************************
+// SamProgramChain.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides methods for operating on a SamProgram record "chain"
+// ***************************************************************************
+
+#ifndef SAM_PROGRAMCHAIN_H
+#define SAM_PROGRAMCHAIN_H
+
+#include <string>
+#include <vector>
+#include "api/SamProgram.h"
+#include "api/api_global.h"
+
+namespace BamTools {
+
+// the container keeps insertion order; it is *NOT* sorted by program linkage
+// use First()/Last() to retrieve oldest/newest programs, respectively
+typedef std::vector<SamProgram> SamProgramContainer;
+typedef SamProgramContainer::iterator SamProgramIterator;
+typedef SamProgramContainer::const_iterator SamProgramConstIterator;
+
+class API_EXPORT SamProgramChain
+{
+
+    // construction & destruction
+public:
+    SamProgramChain();
+    SamProgramChain(const SamProgramChain& other);
+    ~SamProgramChain();
+
+    // query/modify program data
+public:
+    // append one program record, or a batch of them, to the chain
+    void Add(SamProgram& program);
+    void Add(std::vector<SamProgram>& programs);
+
+    // clears all program records
+    void Clear();
+
+    // true if chain contains this program record (matches on ID)
+    bool Contains(const SamProgram& program) const;
+    bool Contains(const std::string& programId) const;
+
+    // first (oldest) program in the chain
+    SamProgram& First();
+    const SamProgram& First() const;
+
+    // true if chain is empty
+    bool IsEmpty() const;
+
+    // last (most recent) program in the chain
+    SamProgram& Last();
+    const SamProgram& Last() const;
+
+    // number of program records in the chain
+    int Size() const;
+
+    // modifiable reference to the SamProgram object associated with this ID
+    SamProgram& operator[](const std::string& programId);
+
+    // STL-compatible iterators
+public:
+    SamProgramIterator Begin();                  // iterator to begin()
+    SamProgramConstIterator Begin() const;       // const_iterator to begin()
+    SamProgramConstIterator ConstBegin() const;  // const_iterator to begin()
+    SamProgramIterator End();                    // iterator to end()
+    SamProgramConstIterator End() const;         // const_iterator to end()
+    SamProgramConstIterator ConstEnd() const;    // const_iterator to end()
+
+    // internal lookup helpers
+private:
+    int IndexOf(const std::string& programId) const;
+    const std::string NextIdFor(const std::string& programId) const;
+
+    // stored program records
+private:
+    SamProgramContainer m_data;
+};
+
+} // namespace BamTools
+
+#endif // SAM_PROGRAMCHAIN_H
diff --git a/src/api/SamReadGroup.cpp b/src/api/SamReadGroup.cpp
new file mode 100644
index 0000000..259c6ba
--- /dev/null
+++ b/src/api/SamReadGroup.cpp
@@ -0,0 +1,211 @@
+// ***************************************************************************
+// SamReadGroup.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides direct read/write access to the SAM read group data fields.
+// ***************************************************************************
+
+#include "api/SamReadGroup.h"
+using namespace BamTools;
+
+/*! \struct BamTools::SamReadGroup
+ \brief Represents a SAM read group entry.
+
+ Provides direct read/write access to the SAM read group data fields.
+
+ \sa \samSpecURL
+*/
+/*! \var SamReadGroup::Description
+ \brief corresponds to \@RG DS:\<Description\>
+*/
+/*! \var SamReadGroup::FlowOrder
+ \brief corresponds to \@RG FO:\<FlowOrder\>
+*/
+/*! \var SamReadGroup::ID
+ \brief corresponds to \@RG ID:\<ID\>
+
+ Required for valid SAM header.
+*/
+/*! \var SamReadGroup::KeySequence
+ \brief corresponds to \@RG KS:\<KeySequence\>
+*/
+/*! \var SamReadGroup::Library
+ \brief corresponds to \@RG LB:\<Library\>
+*/
+/*! \var SamReadGroup::PlatformUnit
+ \brief corresponds to \@RG PU:\<PlatformUnit\>
+*/
+/*! \var SamReadGroup::PredictedInsertSize
+ \brief corresponds to \@RG PI:\<PredictedInsertSize\>
+*/
+/*! \var SamReadGroup::ProductionDate
+ \brief corresponds to \@RG DT:\<ProductionDate\>
+*/
+/*! \var SamReadGroup::Program
+ \brief corresponds to \@RG PG:\<Program\>
+*/
+/*! \var SamReadGroup::Sample
+ \brief corresponds to \@RG SM:\<Sample\>
+*/
+/*! \var SamReadGroup::SequencingCenter
+ \brief corresponds to \@RG CN:\<SequencingCenter\>
+*/
+/*! \var SamReadGroup::SequencingTechnology
+ \brief corresponds to \@RG PL:\<SequencingTechnology\>
+*/
+
+/*! \fn SamReadGroup::SamReadGroup()
+    \brief default constructor; every field starts out empty
+*/
+SamReadGroup::SamReadGroup()
+{
+    // all members are default-constructed
+}
+
+/*! \fn SamReadGroup::SamReadGroup(const std::string& id)
+    \brief constructs a read group whose ID is \a id; all other fields are empty
+
+    \param id desired read group ID
+*/
+SamReadGroup::SamReadGroup(const std::string& id)
+    : ID(id)
+{
+    // the remaining members are default-constructed
+}
+
+/*! \fn SamReadGroup::SamReadGroup(const SamReadGroup& other)
+    \brief copy constructor; duplicates every field of \a other
+*/
+SamReadGroup::SamReadGroup(const SamReadGroup& other)
+    : Description(other.Description),
+      FlowOrder(other.FlowOrder),
+      ID(other.ID),
+      KeySequence(other.KeySequence),
+      Library(other.Library),
+      PlatformUnit(other.PlatformUnit),
+      PredictedInsertSize(other.PredictedInsertSize),
+      ProductionDate(other.ProductionDate),
+      Program(other.Program),
+      Sample(other.Sample),
+      SequencingCenter(other.SequencingCenter),
+      SequencingTechnology(other.SequencingTechnology),
+      CustomTags(other.CustomTags)
+{
+}
+
+/*! \fn SamReadGroup::~SamReadGroup()
+    \brief destructor
+*/
+SamReadGroup::~SamReadGroup()
+{
+    // nothing to do here: the members clean up after themselves
+}
+
+/*! \fn void SamReadGroup::Clear()
+    \brief Clears all data fields, including any custom tags.
+*/
+void SamReadGroup::Clear()
+{
+    // standard @RG fields
+    Description.clear();
+    FlowOrder.clear();
+    ID.clear();
+    KeySequence.clear();
+    Library.clear();
+    PlatformUnit.clear();
+    PredictedInsertSize.clear();
+    ProductionDate.clear();
+    Program.clear();
+    Sample.clear();
+    SequencingCenter.clear();
+    SequencingTechnology.clear();
+
+    // optional custom tags
+    CustomTags.clear();
+}
+
+/*! \fn bool SamReadGroup::HasDescription() const
+    \brief Returns \c true if read group contains \@RG DS:\<Description\>
+*/
+bool SamReadGroup::HasDescription() const
+{
+    // an empty string means the DS tag is absent
+    if (Description.empty()) return false;
+    return true;
+}
+
+/*! \fn bool SamReadGroup::HasFlowOrder() const
+    \brief Returns \c true if read group contains \@RG FO:\<FlowOrder\>
+*/
+bool SamReadGroup::HasFlowOrder() const
+{
+    // an empty string means the FO tag is absent
+    if (FlowOrder.empty()) return false;
+    return true;
+}
+
+/*! \fn bool SamReadGroup::HasID() const
+    \brief Returns \c true if read group contains \@RG ID:\<ID\>
+*/
+bool SamReadGroup::HasID() const
+{
+    // an empty string means the ID tag is absent
+    if (ID.empty()) return false;
+    return true;
+}
+
+/*! \fn bool SamReadGroup::HasKeySequence() const
+    \brief Returns \c true if read group contains \@RG KS:\<KeySequence\>
+*/
+bool SamReadGroup::HasKeySequence() const
+{
+    // an empty string means the KS tag is absent
+    if (KeySequence.empty()) return false;
+    return true;
+}
+
+/*! \fn bool SamReadGroup::HasLibrary() const
+    \brief Returns \c true if read group contains \@RG LB:\<Library\>
+*/
+bool SamReadGroup::HasLibrary() const
+{
+    // an empty string means the LB tag is absent
+    if (Library.empty()) return false;
+    return true;
+}
+
+/*! \fn bool SamReadGroup::HasPlatformUnit() const
+    \brief Returns \c true if read group contains \@RG PU:\<PlatformUnit\>
+*/
+bool SamReadGroup::HasPlatformUnit() const
+{
+    // an empty string means the PU tag is absent
+    if (PlatformUnit.empty()) return false;
+    return true;
+}
+
+/*! \fn bool SamReadGroup::HasPredictedInsertSize() const
+    \brief Returns \c true if read group contains \@RG PI:\<PredictedInsertSize\>
+*/
+bool SamReadGroup::HasPredictedInsertSize() const
+{
+    const bool tagMissing = PredictedInsertSize.empty();
+    return !tagMissing;
+}
+
+/*! \fn bool SamReadGroup::HasProductionDate() const
+    \brief Returns \c true if read group contains \@RG DT:\<ProductionDate\>
+*/
+bool SamReadGroup::HasProductionDate() const
+{
+    const bool tagMissing = ProductionDate.empty();
+    return !tagMissing;
+}
+
+/*! \fn bool SamReadGroup::HasProgram() const
+    \brief Returns \c true if read group contains \@RG PG:\<Program\>
+*/
+bool SamReadGroup::HasProgram() const
+{
+    const bool tagMissing = Program.empty();
+    return !tagMissing;
+}
+
+/*! \fn bool SamReadGroup::HasSample() const
+    \brief Returns \c true if read group contains \@RG SM:\<Sample\>
+*/
+bool SamReadGroup::HasSample() const
+{
+    const bool tagMissing = Sample.empty();
+    return !tagMissing;
+}
+
+/*! \fn bool SamReadGroup::HasSequencingCenter() const
+    \brief Returns \c true if read group contains \@RG CN:\<SequencingCenter\>
+*/
+bool SamReadGroup::HasSequencingCenter() const
+{
+    const bool tagMissing = SequencingCenter.empty();
+    return !tagMissing;
+}
+
+/*! \fn bool SamReadGroup::HasSequencingTechnology() const
+    \brief Returns \c true if read group contains \@RG PL:\<SequencingTechnology\>
+*/
+bool SamReadGroup::HasSequencingTechnology() const
+{
+    const bool tagMissing = SequencingTechnology.empty();
+    return !tagMissing;
+}
diff --git a/src/api/SamReadGroup.h b/src/api/SamReadGroup.h
new file mode 100644
index 0000000..96896e5
--- /dev/null
+++ b/src/api/SamReadGroup.h
@@ -0,0 +1,73 @@
+// ***************************************************************************
+// SamReadGroup.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides direct read/write access to the SAM read group data fields.
+// ***************************************************************************
+
+#ifndef SAM_READGROUP_H
+#define SAM_READGROUP_H
+
+#include <string>
+#include <vector>
+#include "api/BamAux.h"
+#include "api/api_global.h"
+
+namespace BamTools {
+
+struct API_EXPORT SamReadGroup
+{
+
+    // construction & destruction
+    SamReadGroup();
+    SamReadGroup(const std::string& id);
+    SamReadGroup(const SamReadGroup& other);
+    ~SamReadGroup();
+
+    // clears all data fields
+    void Clear();
+
+    // presence checks: each returns true when the matching field is non-empty
+    bool HasDescription() const;           // description
+    bool HasFlowOrder() const;             // flow order entry
+    bool HasID() const;                    // group ID
+    bool HasKeySequence() const;           // key sequence
+    bool HasLibrary() const;               // library name
+    bool HasPlatformUnit() const;          // platform unit ID
+    bool HasPredictedInsertSize() const;   // predicted insert size
+    bool HasProductionDate() const;        // production date
+    bool HasProgram() const;               // program entry
+    bool HasSample() const;                // sample name
+    bool HasSequencingCenter() const;      // sequencing center ID
+    bool HasSequencingTechnology() const;  // sequencing technology ID
+
+    // @RG fields
+    std::string Description;                  // DS:<Description>
+    std::string FlowOrder;                    // FO:<FlowOrder>
+    std::string ID;                           // ID:<ID> *Required for valid SAM header*
+    std::string KeySequence;                  // KS:<KeySequence>
+    std::string Library;                      // LB:<Library>
+    std::string PlatformUnit;                 // PU:<PlatformUnit>
+    std::string PredictedInsertSize;          // PI:<PredictedInsertSize>
+    std::string ProductionDate;               // DT:<ProductionDate>
+    std::string Program;                      // PG:<Program>
+    std::string Sample;                       // SM:<Sample>
+    std::string SequencingCenter;             // CN:<SequencingCenter>
+    std::string SequencingTechnology;         // PL:<SequencingTechnology>
+    std::vector<CustomHeaderTag> CustomTags;  // optional custom tags
+};
+
+/*! \fn bool operator==(const SamReadGroup& lhs, const SamReadGroup& rhs)
+    \brief two read groups compare equal when their IDs match; no other field
+           takes part in the comparison
+*/
+API_EXPORT inline bool operator==(const SamReadGroup& lhs, const SamReadGroup& rhs)
+{
+    const std::string& leftId = lhs.ID;
+    const std::string& rightId = rhs.ID;
+    return leftId == rightId;
+}
+
+} // namespace BamTools
+
+#endif // SAM_READGROUP_H
diff --git a/src/api/SamReadGroupDictionary.cpp b/src/api/SamReadGroupDictionary.cpp
new file mode 100644
index 0000000..ec88031
--- /dev/null
+++ b/src/api/SamReadGroupDictionary.cpp
@@ -0,0 +1,317 @@
+// ***************************************************************************
+// SamReadGroupDictionary.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 16 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides methods for operating on a collection of SamReadGroup entries.
+// ***************************************************************************
+
+#include "api/SamReadGroupDictionary.h"
+using namespace BamTools;
+
+#include <cstddef>
+#include <iostream>
+
+/*! \class BamTools::SamReadGroupDictionary
+ \brief Container of SamReadGroup entries.
+
+ Provides methods for operating on a collection of SamReadGroup entries.
+*/
+
+/*! \fn SamReadGroupDictionary::SamReadGroupDictionary()
+ \brief constructor
+*/
+SamReadGroupDictionary::SamReadGroupDictionary()
+    : m_data()
+    , m_lookupData()
+{
+    // starts out empty: no entries, no lookup records
+}
+
+/*! \fn SamReadGroupDictionary::SamReadGroupDictionary(const SamReadGroupDictionary& other)
+ \brief copy constructor
+*/
+SamReadGroupDictionary::SamReadGroupDictionary(const SamReadGroupDictionary& other)
+    : m_data(other.m_data),             // the stored read groups
+      m_lookupData(other.m_lookupData)  // ID => position in m_data
+{
+}
+
+/*! \fn SamReadGroupDictionary::~SamReadGroupDictionary()
+ \brief destructor
+*/
+SamReadGroupDictionary::~SamReadGroupDictionary()
+{
+    // nothing to release explicitly; the member containers clean up after themselves
+}
+
+/*! \fn void SamReadGroupDictionary::Add(const SamReadGroup& readGroup)
+ \brief Appends a read group to the dictionary.
+
+ Duplicate entries are silently discarded.
+
+ \param[in] readGroup entry to be added
+*/
+void SamReadGroupDictionary::Add(const SamReadGroup& readGroup)
+{
+    // an entry whose ID is already known is ignored
+    if (!IsEmpty() && Contains(readGroup)) return;
+
+    // the new entry lands at the current end of the container
+    const std::size_t newIndex = m_data.size();
+    m_data.push_back(readGroup);
+    m_lookupData[readGroup.ID] = newIndex;
+}
+
+/*! \fn void SamReadGroupDictionary::Add(const std::string& readGroupId)
+ \brief Appends a read group to the dictionary.
+
+ This is an overloaded function.
+
+ \param[in] readGroupId ID of read group to be added
+ \sa Add()
+*/
+void SamReadGroupDictionary::Add(const std::string& readGroupId)
+{
+    // build a read group from the ID and hand it to the single-entry overload
+    const SamReadGroup readGroup(readGroupId);
+    Add(readGroup);
+}
+
+/*! \fn void SamReadGroupDictionary::Add(const SamReadGroupDictionary& readGroups)
+ \brief Appends another read group dictionary to this one.
+
+ This is an overloaded function.
+
+ \param[in] readGroups entries to be added
+ \sa Add()
+*/
+void SamReadGroupDictionary::Add(const SamReadGroupDictionary& readGroups)
+{
+    // walk the other dictionary in order, adding each entry individually
+    SamReadGroupConstIterator current = readGroups.ConstBegin();
+    const SamReadGroupConstIterator last = readGroups.ConstEnd();
+    while (current != last) {
+        Add(*current);
+        ++current;
+    }
+}
+
+/*! \fn void SamReadGroupDictionary::Add(const std::vector<SamReadGroup>& readGroups)
+ \brief Appends multiple read groups to the dictionary.
+
+ This is an overloaded function.
+
+ \param[in] readGroups entries to be added
+ \sa Add()
+*/
+void SamReadGroupDictionary::Add(const std::vector<SamReadGroup>& readGroups)
+{
+    // add the entries one by one, in the order given
+    std::vector<SamReadGroup>::const_iterator current = readGroups.begin();
+    const std::vector<SamReadGroup>::const_iterator last = readGroups.end();
+    while (current != last) {
+        Add(*current);
+        ++current;
+    }
+}
+
+/*! \fn void SamReadGroupDictionary::Add(const std::vector<std::string>& readGroupIds)
+ \brief Appends multiple read groups to the dictionary.
+
+ This is an overloaded function.
+
+ \param[in] readGroupIds IDs of read groups to be added
+ \sa Add()
+*/
+void SamReadGroupDictionary::Add(const std::vector<std::string>& readGroupIds)
+{
+    // each ID goes through the single-ID overload, in the order given
+    std::vector<std::string>::const_iterator current = readGroupIds.begin();
+    const std::vector<std::string>::const_iterator last = readGroupIds.end();
+    while (current != last) {
+        Add(*current);
+        ++current;
+    }
+}
+
+/*! \fn SamReadGroupIterator SamReadGroupDictionary::Begin()
+ \return an STL iterator pointing to the first read group
+ \sa ConstBegin(), End()
+*/
+SamReadGroupIterator SamReadGroupDictionary::Begin()
+{
+    // mutable iterator to the first stored read group
+    SamReadGroupIterator first = m_data.begin();
+    return first;
+}
+
+/*! \fn SamReadGroupConstIterator SamReadGroupDictionary::Begin() const
+ \return an STL const_iterator pointing to the first read group
+
+ This is an overloaded function.
+
+ \sa ConstBegin(), End()
+*/
+SamReadGroupConstIterator SamReadGroupDictionary::Begin() const
+{
+    // const overload: read-only iterator to the first stored read group
+    SamReadGroupConstIterator first = m_data.begin();
+    return first;
+}
+
+/*! \fn void SamReadGroupDictionary::Clear()
+ \brief Clears all read group entries.
+*/
+void SamReadGroupDictionary::Clear()
+{
+    // empty both the lookup table and the entries it indexes
+    m_lookupData.clear();
+    m_data.clear();
+}
+
+/*! \fn SamReadGroupConstIterator SamReadGroupDictionary::ConstBegin() const
+ \return an STL const_iterator pointing to the first read group
+ \sa Begin(), ConstEnd()
+*/
+SamReadGroupConstIterator SamReadGroupDictionary::ConstBegin() const
+{
+    // read-only iterator to the first stored read group
+    SamReadGroupConstIterator first = m_data.begin();
+    return first;
+}
+
+/*! \fn SamReadGroupConstIterator SamReadGroupDictionary::ConstEnd() const
+ \return an STL const_iterator pointing to the imaginary entry after the last read group
+ \sa ConstBegin(), End()
+*/
+SamReadGroupConstIterator SamReadGroupDictionary::ConstEnd() const
+{
+    // read-only past-the-end iterator
+    SamReadGroupConstIterator pastLast = m_data.end();
+    return pastLast;
+}
+
+/*! \fn bool SamReadGroupDictionary::Contains(const std::string& readGroupId) const
+ \brief Returns true if dictionary contains read group.
+
+ \param[in] readGroupId search for read group matching this ID
+ \return \c true if dictionary contains a read group with this ID
+*/
+bool SamReadGroupDictionary::Contains(const std::string& readGroupId) const
+{
+    // the lookup table holds one record per known ID
+    return m_lookupData.count(readGroupId) != 0;
+}
+
+/*! \fn bool SamReadGroupDictionary::Contains(const SamReadGroup& readGroup) const
+ \brief Returns true if dictionary contains read group (matching on ID).
+
+ This is an overloaded function.
+
+ \param[in] readGroup search for this read group
+ \return \c true if dictionary contains read group (matching on ID).
+*/
+bool SamReadGroupDictionary::Contains(const SamReadGroup& readGroup) const
+{
+    // only the ID takes part in the match
+    const std::string& id = readGroup.ID;
+    return Contains(id);
+}
+
+/*! \fn SamReadGroupIterator SamReadGroupDictionary::End()
+ \return an STL iterator pointing to the imaginary entry after the last read group
+ \sa Begin(), ConstEnd()
+*/
+SamReadGroupIterator SamReadGroupDictionary::End()
+{
+    // mutable past-the-end iterator
+    SamReadGroupIterator pastLast = m_data.end();
+    return pastLast;
+}
+
+/*! \fn SamReadGroupConstIterator SamReadGroupDictionary::End() const
+ \return an STL const_iterator pointing to the imaginary entry after the last read group
+
+ This is an overloaded function.
+
+ \sa Begin(), ConstEnd()
+*/
+SamReadGroupConstIterator SamReadGroupDictionary::End() const
+{
+    // const overload: read-only past-the-end iterator
+    SamReadGroupConstIterator pastLast = m_data.end();
+    return pastLast;
+}
+
+/*! \fn bool SamReadGroupDictionary::IsEmpty() const
+ \brief Returns \c true if dictionary contains no read groups
+ \sa Size()
+*/
+bool SamReadGroupDictionary::IsEmpty() const
+{
+    // empty means no read group is stored
+    return m_data.size() == 0;
+}
+
+/*! \fn void SamReadGroupDictionary::Remove(const SamReadGroup& readGroup)
+ \brief Removes read group from dictionary, if found (matching on ID).
+
+ This is an overloaded function.
+
+ \param[in] readGroup read group to remove (matches on ID)
+*/
+void SamReadGroupDictionary::Remove(const SamReadGroup& readGroup)
+{
+    // removal is keyed on the ID alone; delegate to the ID overload
+    const std::string& id = readGroup.ID;
+    Remove(id);
+}
+
+/*! \fn void SamReadGroupDictionary::Remove(const std::string& readGroupId)
+    \brief Removes read group from dictionary, if found.
+
+    Does nothing if no entry with this ID is known.
+
+    It is safe for \a readGroupId to refer to the ID of an entry stored in this
+    dictionary (e.g. when removing via an iterator into the dictionary): the
+    lookup record is dropped before the underlying container is modified.
+
+    \param[in] readGroupId ID of read group to remove
+    \sa Remove()
+*/
+void SamReadGroupDictionary::Remove(const std::string& readGroupId)
+{
+
+    // skip if ID unknown (the lookup table has one record per stored entry,
+    // so this also covers the empty dictionary)
+    const std::map<std::string, std::size_t>::iterator lookupIter = m_lookupData.find(readGroupId);
+    if (lookupIter == m_lookupData.end()) return;
+
+    // drop the lookup record FIRST: @readGroupId may be a reference to the ID
+    // held inside m_data, and erasing from the vector overwrites or destroys
+    // that string, which would make a later erase-by-key hit the wrong record
+    const std::size_t indexToRemove = lookupIter->second;
+    m_lookupData.erase(lookupIter);
+
+    // update 'lookup index' for every entry after the removed one
+    const std::size_t numEntries = m_data.size();
+    for (std::size_t i = indexToRemove + 1; i < numEntries; ++i) {
+        const SamReadGroup& rg = m_data.at(i);
+        --m_lookupData[rg.ID];
+    }
+
+    // finally erase the entry itself
+    m_data.erase(m_data.begin() + indexToRemove);
+}
+
+/*! \fn void SamReadGroupDictionary::Remove(const std::vector<SamReadGroup>& readGroups)
+ \brief Removes multiple read groups from dictionary (matching on ID).
+
+ This is an overloaded function.
+
+ \param[in] readGroups read groups to remove
+ \sa Remove()
+*/
+void SamReadGroupDictionary::Remove(const std::vector<SamReadGroup>& readGroups)
+{
+    // remove the entries one by one, in the order given
+    std::vector<SamReadGroup>::const_iterator current = readGroups.begin();
+    const std::vector<SamReadGroup>::const_iterator last = readGroups.end();
+    while (current != last) {
+        Remove(*current);
+        ++current;
+    }
+}
+
+/*! \fn void SamReadGroupDictionary::Remove(const std::vector<std::string>& readGroupIds)
+ \brief Removes multiple read groups from dictionary.
+
+ This is an overloaded function.
+
+ \param[in] readGroupIds IDs of the read groups to remove
+ \sa Remove()
+*/
+void SamReadGroupDictionary::Remove(const std::vector<std::string>& readGroupIds)
+{
+    // each ID goes through the single-ID overload, in the order given
+    std::vector<std::string>::const_iterator current = readGroupIds.begin();
+    const std::vector<std::string>::const_iterator last = readGroupIds.end();
+    while (current != last) {
+        Remove(*current);
+        ++current;
+    }
+}
+
+/*! \fn int SamReadGroupDictionary::Size() const
+ \brief Returns number of read groups in dictionary.
+ \sa IsEmpty()
+*/
+int SamReadGroupDictionary::Size() const
+{
+    // the container reports an unsigned count; the interface exposes it as int
+    return static_cast<int>(m_data.size());
+}
+
+/*! \fn SamReadGroup& SamReadGroupDictionary::operator[](const std::string& readGroupId)
+ \brief Retrieves the modifiable SamReadGroup that matches \a readGroupId.
+
+ \note If the dictionary contains no read group matching this ID, this function inserts
+ a new one with this ID, and returns a reference to it. If you want to avoid this insertion
+ behavior, check the result of Contains() before using this operator.
+
+ \param[in] readGroupId ID of read group to retrieve
+ \return a modifiable reference to the SamReadGroup associated with the ID
+*/
+SamReadGroup& SamReadGroupDictionary::operator[](const std::string& readGroupId)
+{
+
+    // unknown ID: append a fresh read group carrying only this ID and index it
+    if (m_lookupData.find(readGroupId) == m_lookupData.end()) {
+        const std::size_t newIndex = m_data.size();
+        m_data.push_back(SamReadGroup(readGroupId));
+        m_lookupData[readGroupId] = newIndex;
+    }
+
+    // the ID is now guaranteed to be indexed; hand back the stored entry
+    return m_data.at(m_lookupData[readGroupId]);
+}
diff --git a/src/api/SamReadGroupDictionary.h b/src/api/SamReadGroupDictionary.h
new file mode 100644
index 0000000..79df6ca
--- /dev/null
+++ b/src/api/SamReadGroupDictionary.h
@@ -0,0 +1,87 @@
+// ***************************************************************************
+// SamReadGroupDictionary.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 16 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides methods for operating on a collection of SamReadGroup entries.
+// ***************************************************************************
+
+#ifndef SAM_READGROUP_DICTIONARY_H
+#define SAM_READGROUP_DICTIONARY_H
+
+#include <cstddef>
+#include <map>
+#include <string>
+#include <vector>
+#include "api/SamReadGroup.h"
+#include "api/api_global.h"
+
+namespace BamTools {
+
+// Storage type for read group entries, plus the iterator types handed out
+// by SamReadGroupDictionary's Begin()/End() family.
+typedef std::vector<SamReadGroup> SamReadGroupContainer;
+typedef SamReadGroupContainer::iterator SamReadGroupIterator;
+typedef SamReadGroupContainer::const_iterator SamReadGroupConstIterator;
+
+class API_EXPORT SamReadGroupDictionary
+{
+
+public:
+    // ----- construction, copy & destruction -----
+    SamReadGroupDictionary();
+    SamReadGroupDictionary(const SamReadGroupDictionary& other);
+    ~SamReadGroupDictionary();
+
+    // ----- query/modify read group data -----
+
+    // add one read group, given either the entry itself or just its ID
+    void Add(const SamReadGroup& readGroup);
+    void Add(const std::string& readGroupId);
+
+    // add several read groups at once
+    void Add(const SamReadGroupDictionary& readGroups);
+    void Add(const std::vector<SamReadGroup>& readGroups);
+    void Add(const std::vector<std::string>& readGroupIds);
+
+    // drop every entry
+    void Clear();
+
+    // is there an entry with this ID?
+    bool Contains(const SamReadGroup& readGroup) const;
+    bool Contains(const std::string& readGroupId) const;
+
+    // is the dictionary empty?
+    bool IsEmpty() const;
+
+    // remove one read group (no effect when it is not present)
+    void Remove(const SamReadGroup& readGroup);
+    void Remove(const std::string& readGroupId);
+
+    // remove several read groups at once
+    void Remove(const std::vector<SamReadGroup>& readGroups);
+    void Remove(const std::vector<std::string>& readGroupIds);
+
+    // number of stored read groups
+    int Size() const;
+
+    // modifiable access to the SamReadGroup registered under this ID
+    SamReadGroup& operator[](const std::string& readGroupId);
+
+    // ----- STL-compatible iteration -----
+    SamReadGroupIterator Begin();                  // iterator to first entry
+    SamReadGroupConstIterator Begin() const;       // const_iterator to first entry
+    SamReadGroupConstIterator ConstBegin() const;  // const_iterator to first entry
+    SamReadGroupIterator End();                    // iterator past last entry
+    SamReadGroupConstIterator End() const;         // const_iterator past last entry
+    SamReadGroupConstIterator ConstEnd() const;    // const_iterator past last entry
+
+private:
+    // ----- data members -----
+    SamReadGroupContainer m_data;                     // the entries themselves
+    std::map<std::string, std::size_t> m_lookupData;  // lookup table keyed by ID
+};
+
+} // namespace BamTools
+
+#endif // SAM_READGROUP_DICTIONARY_H
diff --git a/src/api/SamSequence.cpp b/src/api/SamSequence.cpp
new file mode 100644
index 0000000..8b4bcfa
--- /dev/null
+++ b/src/api/SamSequence.cpp
@@ -0,0 +1,152 @@
+// ***************************************************************************
+// SamSequence.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides direct read/write access to the SAM sequence data fields.
+// ***************************************************************************
+
+#include "api/SamSequence.h"
+#include <sstream>
+using namespace BamTools;
+
+/*! \struct BamTools::SamSequence
+ \brief Represents a SAM sequence entry.
+
+ Provides direct read/write access to the SAM sequence data fields.
+
+ \sa \samSpecURL
+*/
+/*! \var SamSequence::AssemblyID
+ \brief corresponds to \@SQ AS:\<AssemblyID\>
+*/
+/*! \var SamSequence::Checksum
+ \brief corresponds to \@SQ M5:\<Checksum\>
+*/
+/*! \var SamSequence::Length
+ \brief corresponds to \@SQ LN:\<Length\>
+
+ Required for valid SAM header.
+*/
+/*! \var SamSequence::Name
+ \brief corresponds to \@SQ SN:\<Name\>
+
+ Required for valid SAM header.
+*/
+/*! \var SamSequence::Species
+ \brief corresponds to \@SQ SP:\<Species\>
+*/
+/*! \var SamSequence::URI
+ \brief corresponds to \@SQ UR:\<URI\>
+*/
+
+/*! \fn SamSequence::SamSequence()
+ \brief default constructor
+*/
+SamSequence::SamSequence()
+{
+    // every field starts out empty
+}
+
+/*! \fn SamSequence::SamSequence(const std::string& name, const int& length)
+ \brief constructs sequence with \a name and \a length
+
+ \param name desired sequence name
+ \param length desired sequence length (numeric value)
+*/
+SamSequence::SamSequence(const std::string& name, const int& length)
+    : Name(name)
+{
+    // Length is stored as text, so render the number through a stream
+    std::ostringstream converter;
+    converter << length;
+    Length = converter.str();
+}
+
+/*! \fn SamSequence::SamSequence(const std::string& name, const std::string& length)
+ \brief constructs sequence with \a name and \a length
+
+ \param name desired sequence name
+ \param length desired sequence length (string value)
+*/
+SamSequence::SamSequence(const std::string& name, const std::string& length)
+    : Length(length),  // stored verbatim, no numeric validation
+      Name(name)
+{
+}
+
+/*! \fn SamSequence::SamSequence(const SamSequence& other)
+ \brief copy constructor
+*/
+SamSequence::SamSequence(const SamSequence& other)
+    : AssemblyID(other.AssemblyID),
+      Checksum(other.Checksum),
+      Length(other.Length),
+      Name(other.Name),
+      Species(other.Species),
+      URI(other.URI),
+      CustomTags(other.CustomTags)
+{
+    // plain member-by-member copy
+}
+
+/*! \fn SamSequence::~SamSequence()
+ \brief destructor
+*/
+SamSequence::~SamSequence()
+{
+    // nothing to release explicitly
+}
+
+/*! \fn void SamSequence::Clear()
+ \brief Clears all data fields.
+*/
+void SamSequence::Clear()
+{
+    // the two fields marked as required for a valid header
+    Name.clear();
+    Length.clear();
+
+    // the remaining standard fields
+    AssemblyID.clear();
+    Checksum.clear();
+    Species.clear();
+    URI.clear();
+
+    // any user-supplied extras
+    CustomTags.clear();
+}
+
+/*! \fn bool SamSequence::HasAssemblyID() const
+ \brief Returns \c true if sequence contains \@SQ AS:\<AssemblyID\>
+*/
+bool SamSequence::HasAssemblyID() const
+{
+    // present means the stored text is non-empty
+    return AssemblyID.size() != 0;
+}
+
+/*! \fn bool SamSequence::HasChecksum() const
+ \brief Returns \c true if sequence contains \@SQ M5:\<Checksum\>
+*/
+bool SamSequence::HasChecksum() const
+{
+    // present means the stored text is non-empty
+    return Checksum.size() != 0;
+}
+
+/*! \fn bool SamSequence::HasLength() const
+ \brief Returns \c true if sequence contains \@SQ LN:\<Length\>
+*/
+bool SamSequence::HasLength() const
+{
+    // present means the stored text is non-empty
+    return Length.size() != 0;
+}
+
+/*! \fn bool SamSequence::HasName() const
+ \brief Returns \c true if sequence contains \@SQ SN:\<Name\>
+*/
+bool SamSequence::HasName() const
+{
+    // present means the stored text is non-empty
+    return Name.size() != 0;
+}
+
+/*! \fn bool SamSequence::HasSpecies() const
+ \brief Returns \c true if sequence contains \@SQ SP:\<Species\>
+*/
+bool SamSequence::HasSpecies() const
+{
+    // present means the stored text is non-empty
+    return Species.size() != 0;
+}
+
+/*! \fn bool SamSequence::HasURI() const
+ \brief Returns \c true if sequence contains \@SQ UR:\<URI\>
+*/
+bool SamSequence::HasURI() const
+{
+    // present means the stored text is non-empty
+    return URI.size() != 0;
+}
diff --git a/src/api/SamSequence.h b/src/api/SamSequence.h
new file mode 100644
index 0000000..c94a755
--- /dev/null
+++ b/src/api/SamSequence.h
@@ -0,0 +1,66 @@
+// ***************************************************************************
+// SamSequence.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides direct read/write access to the SAM sequence data fields.
+// ***************************************************************************
+
+#ifndef SAM_SEQUENCE_H
+#define SAM_SEQUENCE_H
+
+#include <string>
+#include <vector>
+#include "api/BamAux.h"
+#include "api/api_global.h"
+
+namespace BamTools {
+
+struct API_EXPORT SamSequence
+{
+
+    // ----- construction, copy & destruction -----
+    SamSequence();
+    SamSequence(const std::string& name, const int& length);          // length given as a number
+    SamSequence(const std::string& name, const std::string& length);  // length given as text
+    SamSequence(const SamSequence& other);
+    ~SamSequence();
+
+    // ----- whole-entry operations -----
+    void Clear();  // empties every field, custom tags included
+
+    // ----- presence checks (true when the field is non-empty) -----
+    bool HasAssemblyID() const;  // assembly ID set?
+    bool HasChecksum() const;    // MD5 checksum set?
+    bool HasLength() const;      // length set?
+    bool HasName() const;        // name set?
+    bool HasSpecies() const;     // species ID set?
+    bool HasURI() const;         // URI set?
+
+    // ----- field values, with the header tag each one maps to -----
+    std::string AssemblyID;                   // AS:<AssemblyID>
+    std::string Checksum;                     // M5:<Checksum>
+    std::string Length;                       // LN:<Length>  *Required for valid SAM header*
+    std::string Name;                         // SN:<Name>    *Required for valid SAM header*
+    std::string Species;                      // SP:<Species>
+    std::string URI;                          // UR:<URI>
+    std::vector<CustomHeaderTag> CustomTags;  // optional custom tags
+};
+
+/*! \fn bool operator==(const SamSequence& lhs, const SamSequence& rhs)
+ \brief tests equality by comparing sequence names, lengths, & checksums (if available)
+*/
+API_EXPORT inline bool operator==(const SamSequence& lhs, const SamSequence& rhs)
+{
+    // name and length must always agree
+    if (lhs.Name != rhs.Name || lhs.Length != rhs.Length) return false;
+
+    // checksums only take part when both sides actually carry one
+    const bool bothHaveChecksum = lhs.HasChecksum() && rhs.HasChecksum();
+    return !bothHaveChecksum || (lhs.Checksum == rhs.Checksum);
+}
+
+} // namespace BamTools
+
+#endif // SAM_SEQUENCE_H
diff --git a/src/api/SamSequenceDictionary.cpp b/src/api/SamSequenceDictionary.cpp
new file mode 100644
index 0000000..e38b7d3
--- /dev/null
+++ b/src/api/SamSequenceDictionary.cpp
@@ -0,0 +1,321 @@
+// ***************************************************************************
+// SamSequenceDictionary.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 16 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides methods for operating on a collection of SamSequence entries.
+// ***************************************************************************
+
+#include "api/SamSequenceDictionary.h"
+using namespace BamTools;
+
+#include <cstddef>
+#include <iostream>
+
+/*! \class BamTools::SamSequenceDictionary
+ \brief Container of SamSequence entries.
+
+ Provides methods for operating on a collection of SamSequence entries.
+*/
+
+/*! \fn SamSequenceDictionary::SamSequenceDictionary()
+ \brief constructor
+*/
+SamSequenceDictionary::SamSequenceDictionary()
+    : m_data()
+    , m_lookupData()
+{
+    // starts out empty: no entries, no lookup records
+}
+
+/*! \fn SamSequenceDictionary::SamSequenceDictionary(const SamSequenceDictionary& other)
+ \brief copy constructor
+*/
+SamSequenceDictionary::SamSequenceDictionary(const SamSequenceDictionary& other)
+    : m_data(other.m_data),             // the stored sequences
+      m_lookupData(other.m_lookupData)  // name => position in m_data
+{
+}
+
+/*! \fn SamSequenceDictionary::~SamSequenceDictionary()
+ \brief destructor
+*/
+SamSequenceDictionary::~SamSequenceDictionary()
+{
+    // nothing to release explicitly; the member containers clean up after themselves
+}
+
+/*! \fn void SamSequenceDictionary::Add(const SamSequence& sequence)
+ \brief Appends a sequence to the dictionary.
+
+ Duplicate entries are silently discarded.
+
+ \param[in] sequence entry to be added
+*/
+void SamSequenceDictionary::Add(const SamSequence& sequence)
+{
+    // an entry whose name is already known is ignored
+    if (!IsEmpty() && Contains(sequence)) return;
+
+    // the new entry lands at the current end of the container
+    const std::size_t newIndex = m_data.size();
+    m_data.push_back(sequence);
+    m_lookupData[sequence.Name] = newIndex;
+}
+
+/*! \fn void SamSequenceDictionary::Add(const std::string& name, const int& length)
+ \brief Appends a sequence to the dictionary.
+
+ This is an overloaded function.
+
+ \param[in] name name of sequence entry to be added
+ \param[in] length length of sequence entry to be added
+ \sa Add()
+*/
+void SamSequenceDictionary::Add(const std::string& name, const int& length)
+{
+    // build the entry from its two parts and hand it to the single-entry overload
+    const SamSequence sequence(name, length);
+    Add(sequence);
+}
+
+/*! \fn void SamSequenceDictionary::Add(const SamSequenceDictionary& sequences)
+ \brief Appends another sequence dictionary to this one
+
+ This is an overloaded function.
+
+ \param[in] sequences sequence dictionary to be appended
+ \sa Add()
+*/
+void SamSequenceDictionary::Add(const SamSequenceDictionary& sequences)
+{
+    // walk the other dictionary in order, adding each entry individually
+    SamSequenceConstIterator current = sequences.ConstBegin();
+    const SamSequenceConstIterator last = sequences.ConstEnd();
+    while (current != last) {
+        Add(*current);
+        ++current;
+    }
+}
+
+/*! \fn void SamSequenceDictionary::Add(const std::vector<SamSequence>& sequences)
+ \brief Appends multiple sequences to the dictionary.
+
+ This is an overloaded function.
+
+ \param[in] sequences entries to be added
+ \sa Add()
+*/
+void SamSequenceDictionary::Add(const std::vector<SamSequence>& sequences)
+{
+    // add the entries one by one, in the order given
+    std::vector<SamSequence>::const_iterator current = sequences.begin();
+    const std::vector<SamSequence>::const_iterator last = sequences.end();
+    while (current != last) {
+        Add(*current);
+        ++current;
+    }
+}
+
+/*! \fn void SamSequenceDictionary::Add(const std::map<std::string, int>& sequenceMap)
+ \brief Appends multiple sequences to the dictionary.
+
+ This is an overloaded function.
+
+ \param[in] sequenceMap map of sequence entries (name => length) to be added
+ \sa Add()
+*/
+void SamSequenceDictionary::Add(const std::map<std::string, int>& sequenceMap)
+{
+    // each map record is (name => length); visit them in the map's key order
+    std::map<std::string, int>::const_iterator entry = sequenceMap.begin();
+    const std::map<std::string, int>::const_iterator last = sequenceMap.end();
+    while (entry != last) {
+        Add(SamSequence(entry->first, entry->second));
+        ++entry;
+    }
+}
+
+/*! \fn SamSequenceIterator SamSequenceDictionary::Begin()
+ \return an STL iterator pointing to the first sequence
+ \sa ConstBegin(), End()
+*/
+SamSequenceIterator SamSequenceDictionary::Begin()
+{
+    // mutable iterator to the first stored sequence
+    SamSequenceIterator first = m_data.begin();
+    return first;
+}
+
+/*! \fn SamSequenceConstIterator SamSequenceDictionary::Begin() const
+ \return an STL const_iterator pointing to the first sequence
+
+ This is an overloaded function.
+
+ \sa ConstBegin(), End()
+*/
+SamSequenceConstIterator SamSequenceDictionary::Begin() const
+{
+    // const overload: read-only iterator to the first stored sequence
+    SamSequenceConstIterator first = m_data.begin();
+    return first;
+}
+
+/*! \fn void SamSequenceDictionary::Clear()
+ \brief Clears all sequence entries.
+*/
+void SamSequenceDictionary::Clear()
+{
+    // empty both the lookup table and the entries it indexes
+    m_lookupData.clear();
+    m_data.clear();
+}
+
+/*! \fn SamSequenceConstIterator SamSequenceDictionary::ConstBegin() const
+ \return an STL const_iterator pointing to the first sequence
+ \sa Begin(), ConstEnd()
+*/
+SamSequenceConstIterator SamSequenceDictionary::ConstBegin() const
+{
+    // read-only iterator to the first stored sequence
+    SamSequenceConstIterator first = m_data.begin();
+    return first;
+}
+
+/*! \fn SamSequenceConstIterator SamSequenceDictionary::ConstEnd() const
+ \return an STL const_iterator pointing to the imaginary entry after the last sequence
+ \sa End(), ConstBegin()
+*/
+SamSequenceConstIterator SamSequenceDictionary::ConstEnd() const
+{
+    // read-only past-the-end iterator
+    SamSequenceConstIterator pastLast = m_data.end();
+    return pastLast;
+}
+
+/*! \fn bool SamSequenceDictionary::Contains(const std::string& sequenceName) const
+ \brief Returns true if dictionary contains sequence.
+
+ \param[in] sequenceName search for sequence matching this name
+ \return \c true if dictionary contains a sequence with this name
+*/
+bool SamSequenceDictionary::Contains(const std::string& sequenceName) const
+{
+    // the lookup table holds one record per known name
+    return m_lookupData.count(sequenceName) != 0;
+}
+
+/*! \fn bool SamSequenceDictionary::Contains(const SamSequence& sequence) const
+ \brief Returns true if dictionary contains sequence (matches on name).
+
+ This is an overloaded function.
+
+ \param[in] sequence search for this sequence
+ \return \c true if dictionary contains sequence (matching on name)
+*/
+bool SamSequenceDictionary::Contains(const SamSequence& sequence) const
+{
+    // only the name takes part in the match
+    const std::string& name = sequence.Name;
+    return Contains(name);
+}
+
+/*! \fn SamSequenceIterator SamSequenceDictionary::End()
+ \return an STL iterator pointing to the imaginary entry after the last sequence
+ \sa Begin(), ConstEnd()
+*/
+SamSequenceIterator SamSequenceDictionary::End()
+{
+    // mutable past-the-end iterator
+    SamSequenceIterator pastLast = m_data.end();
+    return pastLast;
+}
+
+/*! \fn SamSequenceConstIterator SamSequenceDictionary::End() const
+ \return an STL const_iterator pointing to the imaginary entry after the last sequence
+
+ This is an overloaded function.
+
+ \sa Begin(), ConstEnd()
+*/
+SamSequenceConstIterator SamSequenceDictionary::End() const
+{
+    // const overload: read-only past-the-end iterator
+    SamSequenceConstIterator pastLast = m_data.end();
+    return pastLast;
+}
+
+/*! \fn bool SamSequenceDictionary::IsEmpty() const
+ \brief Returns \c true if dictionary contains no sequences
+ \sa Size()
+*/
+bool SamSequenceDictionary::IsEmpty() const
+{
+    // empty means no sequence is stored
+    return m_data.size() == 0;
+}
+
+/*! \fn void SamSequenceDictionary::Remove(const SamSequence& sequence)
+ \brief Removes sequence from dictionary, if found (matches on name).
+
+ This is an overloaded function.
+
+ \param[in] sequence SamSequence to remove (matching on name)
+*/
+void SamSequenceDictionary::Remove(const SamSequence& sequence)
+{
+    // removal is keyed on the name alone; delegate to the name overload
+    const std::string& name = sequence.Name;
+    Remove(name);
+}
+
+/*! \fn void SamSequenceDictionary::Remove(const std::string& sequenceName)
+    \brief Removes sequence from dictionary, if found.
+
+    Does nothing if no entry with this name is known.
+
+    It is safe for \a sequenceName to refer to the name of an entry stored in
+    this dictionary (e.g. when removing via an iterator into the dictionary):
+    the lookup record is dropped before the underlying container is modified.
+
+    \param[in] sequenceName name of sequence to remove
+    \sa Remove()
+*/
+void SamSequenceDictionary::Remove(const std::string& sequenceName)
+{
+
+    // skip if name unknown (the lookup table has one record per stored entry,
+    // so this also covers the empty dictionary)
+    const std::map<std::string, std::size_t>::iterator lookupIter = m_lookupData.find(sequenceName);
+    if (lookupIter == m_lookupData.end()) return;
+
+    // drop the lookup record FIRST: @sequenceName may be a reference to the name
+    // held inside m_data, and erasing from the vector overwrites or destroys
+    // that string, which would make a later erase-by-key hit the wrong record
+    const std::size_t indexToRemove = lookupIter->second;
+    m_lookupData.erase(lookupIter);
+
+    // update 'lookup index' for every entry after the removed one
+    const std::size_t numEntries = m_data.size();
+    for (std::size_t i = indexToRemove + 1; i < numEntries; ++i) {
+        const SamSequence& sq = m_data.at(i);
+        --m_lookupData[sq.Name];
+    }
+
+    // finally erase the entry itself
+    m_data.erase(m_data.begin() + indexToRemove);
+}
+
+/*! \fn void SamSequenceDictionary::Remove(const std::vector<SamSequence>& sequences)
+ \brief Removes multiple sequences from dictionary.
+
+ This is an overloaded function.
+
+ \param[in] sequences sequences to remove
+ \sa Remove()
+*/
+void SamSequenceDictionary::Remove(const std::vector<SamSequence>& sequences)
+{
+    // remove the entries one by one, in the order given
+    std::vector<SamSequence>::const_iterator seqIter = sequences.begin();
+    const std::vector<SamSequence>::const_iterator seqEnd = sequences.end();
+    while (seqIter != seqEnd) {
+        Remove(*seqIter);
+        ++seqIter;
+    }
+}
+
+/*! \fn void SamSequenceDictionary::Remove(const std::vector<std::string>& sequenceNames)
+ \brief Removes multiple sequences from dictionary.
+
+ This is an overloaded function.
+
+ \param[in] sequenceNames names of the sequences to remove
+ \sa Remove()
+*/
+void SamSequenceDictionary::Remove(const std::vector<std::string>& sequenceNames)
+{
+    // each name goes through the single-name overload, in the order given
+    std::vector<std::string>::const_iterator nameIter = sequenceNames.begin();
+    const std::vector<std::string>::const_iterator nameEnd = sequenceNames.end();
+    while (nameIter != nameEnd) {
+        Remove(*nameIter);
+        ++nameIter;
+    }
+}
+
+/*! \fn int SamSequenceDictionary::Size() const
+ \brief Returns number of sequences in dictionary.
+ \sa IsEmpty()
+*/
+int SamSequenceDictionary::Size() const
+{
+    // the container reports an unsigned count; the interface exposes it as int
+    return static_cast<int>(m_data.size());
+}
+
+/*! \fn SamSequence& SamSequenceDictionary::operator[](const std::string& sequenceName)
+ \brief Retrieves the modifiable SamSequence that matches \a sequenceName.
+
+ \note If the dictionary contains no sequence matching this name, this function inserts
+ a new one with this name (length:0), and returns a reference to it. If you want to avoid
+ this insertion behavior, check the result of Contains() before using this operator.
+
+ \param[in] sequenceName name of sequence to retrieve
+ \return a modifiable reference to the SamSequence associated with the name
+*/
+SamSequence& SamSequenceDictionary::operator[](const std::string& sequenceName)
+{
+
+    // unknown name: append a placeholder entry (length 0) and index it
+    if (m_lookupData.find(sequenceName) == m_lookupData.end()) {
+        const std::size_t newIndex = m_data.size();
+        m_data.push_back(SamSequence(sequenceName, 0));
+        m_lookupData[sequenceName] = newIndex;
+    }
+
+    // the name is now guaranteed to be indexed; hand back the stored entry
+    return m_data.at(m_lookupData[sequenceName]);
+}
diff --git a/src/api/SamSequenceDictionary.h b/src/api/SamSequenceDictionary.h
new file mode 100644
index 0000000..12375e5
--- /dev/null
+++ b/src/api/SamSequenceDictionary.h
@@ -0,0 +1,87 @@
+// ***************************************************************************
+// SamSequenceDictionary.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 16 October 2011
+// ---------------------------------------------------------------------------
+// Provides methods for operating on a collection of SamSequence entries.
+// ***************************************************************************
+
+#ifndef SAM_SEQUENCE_DICTIONARY_H
+#define SAM_SEQUENCE_DICTIONARY_H
+
+#include <cstddef>
+#include <map>
+#include <string>
+#include <vector>
+#include "api/SamSequence.h"
+#include "api/api_global.h"
+
+namespace BamTools {
+
+// Storage type for sequence entries, plus the iterator types handed out
+// by SamSequenceDictionary's Begin()/End() family.
+typedef std::vector<SamSequence> SamSequenceContainer;
+typedef SamSequenceContainer::iterator SamSequenceIterator;
+typedef SamSequenceContainer::const_iterator SamSequenceConstIterator;
+
+class API_EXPORT SamSequenceDictionary
+{
+
+public:
+    // ----- construction, copy & destruction -----
+    SamSequenceDictionary();
+    SamSequenceDictionary(const SamSequenceDictionary& other);
+    ~SamSequenceDictionary();
+
+    // ----- query/modify sequence data -----
+
+    // add one sequence, given either the entry itself or its name and length
+    void Add(const SamSequence& sequence);
+    void Add(const std::string& name, const int& length);
+
+    // add several sequences at once
+    void Add(const SamSequenceDictionary& sequences);
+    void Add(const std::vector<SamSequence>& sequences);
+    void Add(const std::map<std::string, int>& sequenceMap);
+
+    // drop every entry
+    void Clear();
+
+    // is there an entry with this name?
+    bool Contains(const SamSequence& sequence) const;
+    bool Contains(const std::string& sequenceName) const;
+
+    // is the dictionary empty?
+    bool IsEmpty() const;
+
+    // remove one sequence (no effect when it is not present)
+    void Remove(const SamSequence& sequence);
+    void Remove(const std::string& sequenceName);
+
+    // remove several sequences at once
+    void Remove(const std::vector<SamSequence>& sequences);
+    void Remove(const std::vector<std::string>& sequenceNames);
+
+    // number of stored sequences
+    int Size() const;
+
+    // modifiable access to the SamSequence registered under this name
+    SamSequence& operator[](const std::string& sequenceName);
+
+    // ----- STL-compatible iteration -----
+    SamSequenceIterator Begin();                  // iterator to first entry
+    SamSequenceConstIterator Begin() const;       // const_iterator to first entry
+    SamSequenceConstIterator ConstBegin() const;  // const_iterator to first entry
+    SamSequenceIterator End();                    // iterator past last entry
+    SamSequenceConstIterator End() const;         // const_iterator past last entry
+    SamSequenceConstIterator ConstEnd() const;    // const_iterator past last entry
+
+private:
+    // ----- data members -----
+    SamSequenceContainer m_data;                      // the entries themselves
+    std::map<std::string, std::size_t> m_lookupData;  // lookup table keyed by name
+};
+
+} // namespace BamTools
+
+#endif // SAM_SEQUENCE_DICTIONARY_H
diff --git a/src/api/algorithms/Sort.h b/src/api/algorithms/Sort.h
new file mode 100644
index 0000000..e9017cb
--- /dev/null
+++ b/src/api/algorithms/Sort.h
@@ -0,0 +1,364 @@
+// ***************************************************************************
+// Sort.h (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 4 April 2012 (DB)
+// ---------------------------------------------------------------------------
+// Provides sorting functionality.
+// ***************************************************************************
+
+#ifndef ALGORITHMS_SORT_H
+#define ALGORITHMS_SORT_H
+
+#include <algorithm>
+#include <cassert>
+#include <functional>
+#include <string>
+#include <vector>
+#include "api/BamAlignment.h"
+#include "api/BamMultiReader.h"
+#include "api/BamReader.h"
+#include "api/api_global.h"
+
+namespace BamTools {
+namespace Algorithms {
+
+/*! \struct BamTools::Algorithms::Sort
+ \brief Provides classes & methods related to sorting BamAlignments
+*/
+struct API_EXPORT Sort
+{
+
+ //! Provides explicit values for specifying desired sort ordering
+ enum Order
+ {
+ AscendingOrder = 0,
+ DescendingOrder
+ };
+
+    /*! \fn template<typename ElemType> static inline bool sort_helper(const Sort::Order& order, const ElemType& lhs, const ElemType& rhs)
+        \internal
+
+        Compares \a lhs against \a rhs with the STL function object that matches the
+        requested Sort::Order: std::less for AscendingOrder, std::greater for DescendingOrder.
+    */
+    template <typename ElemType>
+    static inline bool sort_helper(const Sort::Order& order, const ElemType& lhs,
+                                   const ElemType& rhs)
+    {
+        if (order == Sort::AscendingOrder) return std::less<ElemType>()(lhs, rhs);
+        if (order == Sort::DescendingOrder) return std::greater<ElemType>()(lhs, rhs);
+
+        // any other value is not a valid Sort::Order
+        BT_ASSERT_UNREACHABLE;
+        return false;  // <-- unreachable
+    }
+
+ //! Base class for our sorting function objects
+ typedef std::binary_function<BamAlignment, BamAlignment, bool> AlignmentSortBase;
+
+ /*! \struct BamTools::Algorithms::Sort::ByName
+ \brief Function object for comparing alignments by name
+
+ Default sort order is Sort::AscendingOrder.
+
+ \code
+ std::vector<BamAlignment> a;
+
+ // sort by name, in ascending order (the following two lines are equivalent):
+ std::sort( a.begin(), a.end(), Sort::ByName() );
+ std::sort( a.begin(), a.end(), Sort::ByName(Sort::AscendingOrder) );
+
+ // OR sort in descending order
+ std::sort( a.begin(), a.end(), Sort::ByName(Sort::DescendingOrder) );
+ \endcode
+ */
+    struct ByName : public AlignmentSortBase
+    {
+
+        // ctor: stores the requested ordering (ascending unless told otherwise)
+        ByName(const Sort::Order& order = Sort::AscendingOrder)
+            : m_order(order)
+        {}
+
+        // comparison function: orders two alignments by their Name field
+        // N.B. - const-qualified so the functor can be invoked through a const object,
+        //        which ordered containers (std::set, std::multiset, ...) are allowed to do
+        bool operator()(const BamTools::BamAlignment& lhs, const BamTools::BamAlignment& rhs) const
+        {
+            return sort_helper(m_order, lhs.Name, rhs.Name);
+        }
+
+        // used by BamMultiReader internals
+        static inline bool UsesCharData()
+        {
+            return true;
+        }
+
+        // data members
+    private:
+        const Sort::Order m_order;
+    };
+
+ /*! \struct BamTools::Algorithms::Sort::ByPosition
+ \brief Function object for comparing alignments by position
+
+ Default sort order is Sort::AscendingOrder.
+
+ \code
+ std::vector<BamAlignment> a;
+
+ // sort by position, in ascending order (the following two lines are equivalent):
+ std::sort( a.begin(), a.end(), Sort::ByPosition() );
+ std::sort( a.begin(), a.end(), Sort::ByPosition(Sort::AscendingOrder) );
+
+ // OR sort in descending order
+ std::sort( a.begin(), a.end(), Sort::ByPosition(Sort::DescendingOrder) );
+ \endcode
+ */
+    struct ByPosition : public AlignmentSortBase
+    {
+
+        // ctor: stores the requested ordering (ascending unless told otherwise)
+        ByPosition(const Sort::Order& order = Sort::AscendingOrder)
+            : m_order(order)
+        {}
+
+        // comparison function: orders two alignments by (RefID, Position)
+        // N.B. - const-qualified so the functor can be invoked through a const object,
+        //        which ordered containers (std::set, std::multiset, ...) are allowed to do
+        bool operator()(const BamTools::BamAlignment& lhs, const BamTools::BamAlignment& rhs) const
+        {
+
+            // force unmapped alignments (RefID of -1) to end, whatever the sort order
+            if (lhs.RefID == -1) return false;
+            if (rhs.RefID == -1) return true;
+
+            // if on same reference, sort on position
+            if (lhs.RefID == rhs.RefID) return sort_helper(m_order, lhs.Position, rhs.Position);
+
+            // otherwise sort on reference ID
+            return sort_helper(m_order, lhs.RefID, rhs.RefID);
+        }
+
+        // used by BamMultiReader internals
+        static inline bool UsesCharData()
+        {
+            return false;
+        }
+
+        // data members
+    private:
+        const Sort::Order m_order;
+    };
+
+ /*! \struct BamTools::Algorithms::Sort::ByTag
+ \brief Function object for comparing alignments by tag value
+
+ Default sort order is Sort::AscendingOrder.
+
+ \code
+ std::vector<BamAlignment> a;
+
+ // sort by edit distance, in ascending order (the following two lines are equivalent):
+ std::sort( a.begin(), a.end(), Sort::ByTag<int>("NM") );
+ std::sort( a.begin(), a.end(), Sort::ByTag<int>("NM", Sort::AscendingOrder) );
+
+ // OR sort in descending order
+ std::sort( a.begin(), a.end(), Sort::ByTag<int>("NM", Sort::DescendingOrder) );
+ \endcode
+ */
+    template <typename T>
+    struct ByTag : public AlignmentSortBase
+    {
+
+        // ctor: stores the tag to compare on & the requested ordering
+        ByTag(const std::string& tag, const Sort::Order& order = Sort::AscendingOrder)
+            : m_tag(tag)
+            , m_order(order)
+        {}
+
+        // comparison function: orders two alignments by the value stored under m_tag
+        // N.B. - const-qualified so the functor can be invoked through a const object,
+        //        which ordered containers (std::set, std::multiset, ...) are allowed to do
+        bool operator()(const BamTools::BamAlignment& lhs, const BamTools::BamAlignment& rhs) const
+        {
+
+            // force alignments without tag to end, whatever the sort order
+            // (rhs is only queried once lhs is known to carry the tag)
+            T lhsTagValue;
+            T rhsTagValue;
+            if (!lhs.GetTag(m_tag, lhsTagValue)) return false;
+            if (!rhs.GetTag(m_tag, rhsTagValue)) return true;
+
+            // otherwise compare on tag values
+            return sort_helper(m_order, lhsTagValue, rhsTagValue);
+        }
+
+        // used by BamMultiReader internals
+        static inline bool UsesCharData()
+        {
+            return true;
+        }
+
+        // data members
+    private:
+        const std::string m_tag;
+        const Sort::Order m_order;
+    };
+
+ /*! \struct BamTools::Algorithms::Sort::Unsorted
+ \brief Placeholder function object
+
+ This function object exists purely to allow for dropping a "do not care" ordering
+ into methods, containers, etc that are designed to work with the other sorting objects.
+
+ \code
+ std::set<BamAlignment, Sort::ByName>; // STL set, ordered on alignment name
+ std::set<BamAlignment, Sort::Unsorted>; // STL set, unsorted (but probably insertion order)
+ \endcode
+ */
+    struct Unsorted : public AlignmentSortBase
+    {
+
+        // comparison function: never reports one alignment as ordered before another
+        // N.B. - const-qualified so the functor can be invoked through a const object,
+        //        which ordered containers (std::set, std::multiset, ...) are allowed to do
+        inline bool operator()(const BamTools::BamAlignment&, const BamTools::BamAlignment&) const
+        {
+            return false;  // returning false tends to retain insertion order
+        }
+
+        // used by BamMultiReader internals
+        static inline bool UsesCharData()
+        {
+            return false;
+        }
+    };
+
+ /*! Sorts a std::vector of alignments (in-place), using the provided compare function.
+
+ \code
+ std::vector<BamAlignment> a;
+ // populate data
+
+ // sort our alignment list by edit distance
+ Sort::SortAlignments(a, Sort::ByTag<int>("NM"));
+ \endcode
+
+ \param[in,out] data vector of alignments to be sorted
+ \param[in] comp comparison function object
+ */
+    template <typename Compare>
+    static inline void SortAlignments(std::vector<BamAlignment>& data,
+                                      const Compare& comp = Compare())
+    {
+        // in-place sort over the whole container
+        std::vector<BamAlignment>::iterator first = data.begin();
+        std::vector<BamAlignment>::iterator last = data.end();
+        std::sort(first, last, comp);
+    }
+
+ /*! Returns a sorted copy of the input alignments, using the provided compare function.
+
+ \code
+ std::vector<BamAlignment> a;
+ // populate data
+
+ // get a copy of our original data, sorted by edit distance (descending order)
+ std::vector<BamAlignment> sortedData;
+ sortedData = Sort::SortAlignments(a, Sort::ByTag<int>("NM", Sort::DescendingOrder));
+ \endcode
+
+ \param[in] input vector of alignments to be sorted
+ \param[in] comp comparison function object
+ \return sorted copy of the input data
+ */
+    template <typename Compare>
+    static inline std::vector<BamAlignment> SortAlignments(const std::vector<BamAlignment>& input,
+                                                           const Compare& comp = Compare())
+    {
+        // work on a private copy so the caller's data is left untouched
+        std::vector<BamAlignment> sorted = input;
+        std::sort(sorted.begin(), sorted.end(), comp);
+        return sorted;
+    }
+
+ /*! Reads a region of alignments from a position-sorted BAM file,
+ then sorts by the provided compare function
+
+ \code
+ BamReader reader;
+ // open BAM file & index file
+
+ BamRegion region;
+ // define a region of interest (i.e. a exon or some other feature)
+
+ // get all alignments covering that region, sorted by read group name
+ std::vector<BamAlignment> a;
+ a = Sort::GetSortedRegion(reader, region, Sort::ByTag<std::string>("RG"));
+ \endcode
+
+ \param[in] reader BamReader opened on desired BAM file
+ \param[in] region desired region-of-interest
+ \param[in] comp comparison function object
+ \return sorted vector of the region's alignments
+ */
+    template <typename Compare>
+    static std::vector<BamAlignment> GetSortedRegion(BamReader& reader, const BamRegion& region,
+                                                     const Compare& comp = Compare())
+    {
+        // return empty container if unable to find region
+        if (!reader.IsOpen()) return std::vector<BamAlignment>();
+        if (!reader.SetRegion(region)) return std::vector<BamAlignment>();
+
+        // iterate through region, grabbing alignments
+        // N.B. - comparators such as ByName & ByTag report UsesCharData() == true, i.e. they
+        //        need each alignment's character data; the BamMultiReader merger calls
+        //        BuildCharData() for them before comparing. GetNextAlignmentCore() presumably
+        //        leaves that data unparsed (TODO confirm against BamReader), so it is built
+        //        here for every alignment; otherwise name/tag sorts see empty fields.
+        BamAlignment al;
+        std::vector<BamAlignment> results;
+        while (reader.GetNextAlignmentCore(al)) {
+            al.BuildCharData();
+            results.push_back(al);
+        }
+
+        // sort & return alignments
+        SortAlignments(results, comp);
+        return results;
+    }
+
+ /*! Reads a region of alignments from position-sorted BAM files,
+ then sorts by the provided compare function
+
+ \code
+ BamMultiReader reader;
+ // open BAM files & index files
+
+ BamRegion region;
+ // define a region of interest (i.e. a exon or some other feature)
+
+ // get all alignments covering that region, sorted by read group name
+ std::vector<BamAlignment> a;
+ a = Sort::GetSortedRegion(reader, region, Sort::ByTag<std::string>("RG"));
+ \endcode
+
+ \param[in] reader BamMultiReader opened on desired BAM files
+ \param[in] region desired region-of-interest
+ \param[in] comp comparison function object
+ \return sorted vector of the region's alignments
+ */
+    template <typename Compare>
+    static std::vector<BamAlignment> GetSortedRegion(BamMultiReader& reader,
+                                                     const BamRegion& region,
+                                                     const Compare& comp = Compare())
+    {
+        // return empty container if unable to find region
+        if (!reader.HasOpenReaders()) return std::vector<BamAlignment>();
+        if (!reader.SetRegion(region)) return std::vector<BamAlignment>();
+
+        // iterate through region, grabbing alignments
+        // N.B. - comparators such as ByName & ByTag report UsesCharData() == true, i.e. they
+        //        need each alignment's character data; the BamMultiReader merger calls
+        //        BuildCharData() for them before comparing. GetNextAlignmentCore() presumably
+        //        leaves that data unparsed (TODO confirm against BamMultiReader), so it is
+        //        built here for every alignment; otherwise name/tag sorts see empty fields.
+        BamAlignment al;
+        std::vector<BamAlignment> results;
+        while (reader.GetNextAlignmentCore(al)) {
+            al.BuildCharData();
+            results.push_back(al);
+        }
+
+        // sort & return alignments
+        SortAlignments(results, comp);
+        return results;
+    }
+};
+
+} // namespace Algorithms
+} // namespace BamTools
+
+#endif // ALGORITHMS_SORT_H
diff --git a/src/api/api_global.h b/src/api/api_global.h
new file mode 100644
index 0000000..889f050
--- /dev/null
+++ b/src/api/api_global.h
@@ -0,0 +1,21 @@
+// ***************************************************************************
+// api_global.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides macros for exporting & importing BamTools API library symbols
+// ***************************************************************************
+
+#ifndef API_GLOBAL_H
+#define API_GLOBAL_H
+
+#include "shared/bamtools_global.h"
+
+#ifdef BAMTOOLS_API_LIBRARY
+#define API_EXPORT BAMTOOLS_LIBRARY_EXPORT
+#else
+#define API_EXPORT BAMTOOLS_LIBRARY_IMPORT
+#endif
+
+#endif // API_GLOBAL_H
diff --git a/src/api/internal/CMakeLists.txt b/src/api/internal/CMakeLists.txt
new file mode 100644
index 0000000..a96cd6f
--- /dev/null
+++ b/src/api/internal/CMakeLists.txt
@@ -0,0 +1,25 @@
+# ==========================
+# BamTools CMakeLists.txt
+# (c) 2011 Derek Barnett
+#
+# src/api/internal
+# ==========================
+
+set( InternalDir "internal" )
+
+add_subdirectory( bam )
+add_subdirectory( index )
+add_subdirectory( io )
+add_subdirectory( sam )
+add_subdirectory( utils )
+
+set( InternalSources
+ ${InternalBamSources}
+ ${InternalIndexSources}
+ ${InternalIOSources}
+ ${InternalSamSources}
+ ${InternalUtilsSources}
+
+ PARENT_SCOPE # <-- leave this last
+ )
+
diff --git a/src/api/internal/bam/BamHeader_p.cpp b/src/api/internal/bam/BamHeader_p.cpp
new file mode 100644
index 0000000..b97e565
--- /dev/null
+++ b/src/api/internal/bam/BamHeader_p.cpp
@@ -0,0 +1,132 @@
+// ***************************************************************************
+// BamHeader_p.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 18 November 2012 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for handling BAM headers.
+// ***************************************************************************
+
+#include "api/internal/bam/BamHeader_p.h"
+#include "api/BamAux.h"
+#include "api/BamConstants.h"
+#include "api/internal/io/BgzfStream_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+
+// ------------------------
+// static utility methods
+// ------------------------
+
+static inline bool isValidMagicNumber(const char* buffer)
+{
+    // valid only when the leading bytes of the buffer match the BAM magic constant
+    const int mismatch =
+        strncmp(buffer, Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH);
+    return (mismatch == 0);
+}
+
+// --------------------------
+// BamHeader implementation
+// --------------------------
+
+// ctor
+BamHeader::BamHeader() {}
+
+// dtor
+BamHeader::~BamHeader() {}
+
+// reads magic number from BGZF stream & validates it
+// throws BamException if the magic number cannot be read or does not match
+void BamHeader::CheckMagicNumber(BgzfStream* stream)
+{
+    // number of bytes a complete magic number occupies
+    const std::size_t expectedBytes =
+        static_cast<std::size_t>(Constants::BAM_HEADER_MAGIC_LENGTH);
+
+    // try to read magic number
+    char magic[Constants::BAM_HEADER_MAGIC_LENGTH];
+    const std::size_t bytesRead = stream->Read(magic, Constants::BAM_HEADER_MAGIC_LENGTH);
+    if (bytesRead != expectedBytes)
+        throw BamException("BamHeader::CheckMagicNumber", "could not read magic number");
+
+    // validate magic number
+    if (!isValidMagicNumber(magic))
+        throw BamException("BamHeader::CheckMagicNumber", "invalid magic number");
+}
+
+// clear SamHeader data
+void BamHeader::Clear()
+{
+ m_header.Clear();
+}
+
+// return true if SamHeader data is valid
+bool BamHeader::IsValid() const
+{
+ return m_header.IsValid();
+}
+
+// load BAM header ('magic number' and SAM header text) from BGZF stream
+void BamHeader::Load(BgzfStream* stream)
+{
+    // the stream must begin with a valid magic number (throws otherwise)
+    CheckMagicNumber(stream);
+
+    // the header text follows as a 32-bit length, then that many bytes of text
+    uint32_t textLength = 0;
+    ReadHeaderLength(stream, textLength);
+    ReadHeaderText(stream, textLength);
+}
+
+// reads SAM header text length from BGZF stream, stores it in @length
+// throws BamException if the full length field cannot be read
+void BamHeader::ReadHeaderLength(BgzfStream* stream, uint32_t& length)
+{
+    // the length field occupies exactly sizeof(uint32_t) raw bytes
+    char rawLength[sizeof(uint32_t)];
+    const std::size_t bytesRead = stream->Read(rawLength, sizeof(uint32_t));
+    if (bytesRead != sizeof(uint32_t))
+        throw BamException("BamHeader::ReadHeaderLength", "could not read header length");
+
+    // unpack the raw bytes, swapping byte order on big-endian systems
+    length = BamTools::UnpackUnsignedInt(rawLength);
+    if (BamTools::SystemIsBigEndian()) BamTools::SwapEndian_32(length);
+}
+
+// reads SAM header text from BGZF stream, stores in SamHeader object
+// throws BamException if fewer than @length bytes could be read
+void BamHeader::ReadHeaderText(BgzfStream* stream, const uint32_t& length)
+{
+
+    // N.B. - @length is read straight from the file, so treat it as untrusted.
+    //        The buffer is owned by a std::string rather than calloc()/free() because:
+    //          - 'length + 1' was evaluated in 32 bits and wrapped to 0 when
+    //            length == 0xFFFFFFFF, leaving a buffer far smaller than the read size
+    //          - a failed allocation now throws instead of passing a null pointer to Read()
+    //          - the buffer is released even if Read() or SetHeaderText() throws
+    std::string headerText(static_cast<std::string::size_type>(length), '\0');
+
+    // read header text (Read() is always handed a valid pointer, even when length is 0)
+    char emptyBuffer = '\0';
+    char* destination = (headerText.empty() ? &emptyBuffer : &headerText[0]);
+    const std::size_t bytesRead = stream->Read(destination, length);
+
+    // if error reading, throw (the buffer cleans up after itself)
+    if (bytesRead != length)
+        throw BamException("BamHeader::ReadHeaderText", "could not read header text");
+
+    // otherwise, text was read OK
+    // store only the text up to the first null character, as the C-string conversion did
+    m_header.SetHeaderText(std::string(headerText.c_str()));
+}
+
+// returns const-reference to SamHeader data object
+const SamHeader& BamHeader::ToConstSamHeader() const
+{
+ return m_header;
+}
+
+// returns *copy* of SamHeader data object
+SamHeader BamHeader::ToSamHeader() const
+{
+ return m_header;
+}
+
+// returns SAM-formatted string of header data
+std::string BamHeader::ToString() const
+{
+ return m_header.ToString();
+}
diff --git a/src/api/internal/bam/BamHeader_p.h b/src/api/internal/bam/BamHeader_p.h
new file mode 100644
index 0000000..eed576e
--- /dev/null
+++ b/src/api/internal/bam/BamHeader_p.h
@@ -0,0 +1,72 @@
+// ***************************************************************************
+// BamHeader_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 18 November 2012 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for handling BAM headers.
+// ***************************************************************************
+
+#ifndef BAMHEADER_P_H
+#define BAMHEADER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <string>
+#include "api/SamHeader.h"
+
+namespace BamTools {
+namespace Internal {
+
+class BgzfStream;
+
+class BamHeader
+{
+
+ // ctor & dtor
+public:
+ BamHeader();
+ ~BamHeader();
+
+ // BamHeader interface
+public:
+ // clear SamHeader data
+ void Clear();
+ // return true if SamHeader data is valid
+ bool IsValid() const;
+ // load BAM header ('magic number' and SAM header text) from BGZF stream
+ // (throws BamException if the magic number is invalid or any part cannot be read)
+ void Load(BgzfStream* stream);
+ // returns (read-only) reference to SamHeader data object
+ const SamHeader& ToConstSamHeader() const;
+ // returns (editable) copy of SamHeader data object
+ SamHeader ToSamHeader() const;
+ // returns SAM-formatted string of header data
+ std::string ToString() const;
+
+ // internal methods
+private:
+ // reads magic number from BGZF stream
+ void CheckMagicNumber(BgzfStream* stream);
+ // reads SAM header length from BGZF stream, stores it in @length
+ void ReadHeaderLength(BgzfStream* stream, uint32_t& length);
+ // reads SAM header text from BGZF stream, stores in SamHeader object
+ void ReadHeaderText(BgzfStream* stream, const uint32_t& length);
+
+ // data members
+private:
+ SamHeader m_header;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMHEADER_P_H
diff --git a/src/api/internal/bam/BamMultiMerger_p.h b/src/api/internal/bam/BamMultiMerger_p.h
new file mode 100644
index 0000000..9835559
--- /dev/null
+++ b/src/api/internal/bam/BamMultiMerger_p.h
@@ -0,0 +1,278 @@
+// ***************************************************************************
+// BamMultiMerger_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides merging functionality for BamMultiReader. At this point, supports
+// sorting results by (refId, position) or by read name.
+// ***************************************************************************
+
+#ifndef BAMMULTIMERGER_P_H
+#define BAMMULTIMERGER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <deque>
+#include <functional>
+#include <set>
+#include <string>
+#include "api/BamAlignment.h"
+#include "api/BamReader.h"
+#include "api/algorithms/Sort.h"
+
+namespace BamTools {
+namespace Internal {
+
+// pairs a reader with the alignment currently held for it
+struct MergeItem
+{
+
+    // data members
+    // N.B. - plain pointers; nothing in MergeItem itself deletes them
+    BamReader* Reader;
+    BamAlignment* Alignment;
+
+    // ctor
+    MergeItem(BamReader* reader = 0, BamAlignment* alignment = 0)
+        : Reader(reader)
+        , Alignment(alignment)
+    {}
+
+    // copy ctor, copy assignment & dtor are left to the compiler: the generated versions
+    // copy both pointers member-wise, exactly as the hand-written copy ctor did, and
+    // declaring only some of them deprecates the implicit copy assignment that
+    // containers of MergeItem rely on (e.g. when erasing from a std::vector)
+};
+
+// adapts an alignment comparator so that it can order MergeItems
+// (compares the alignments the items point to; both pointers must be non-null)
+template <typename Compare>
+struct MergeItemSorter : public std::binary_function<MergeItem, MergeItem, bool>
+{
+
+public:
+    MergeItemSorter(const Compare& comp = Compare())
+        : m_comp(comp)
+    {}
+
+    // N.B. - const-qualified because this type is used as the comparator of a
+    //        std::multiset, which may invoke its comparator through a const object
+    bool operator()(const MergeItem& lhs, const MergeItem& rhs) const
+    {
+        const BamAlignment& l = *lhs.Alignment;
+        const BamAlignment& r = *rhs.Alignment;
+        return m_comp(l, r);
+    }
+
+private:
+    // mutable so that a Compare whose own operator() is not const-qualified
+    // can still be invoked from the const call operator above
+    mutable Compare m_comp;
+};
+
+// pure ABC so we can just work polymorphically with any specific merger implementation
+class IMultiMerger
+{
+
+public:
+ IMultiMerger() {}
+ virtual ~IMultiMerger() {}
+
+public:
+ virtual void Add(MergeItem item) = 0;
+ virtual void Clear() = 0;
+ virtual const MergeItem& First() const = 0;
+ virtual bool IsEmpty() const = 0;
+ virtual void Remove(BamReader* reader) = 0;
+ virtual int Size() const = 0;
+ virtual MergeItem TakeFirst() = 0;
+};
+
+// general merger
+template <typename Compare>
+class MultiMerger : public IMultiMerger
+{
+
+public:
+ typedef Compare CompareType;
+ typedef MergeItemSorter<CompareType> MergeType;
+
+public:
+ explicit MultiMerger(const Compare& comp = Compare())
+ : IMultiMerger()
+ , m_data(MergeType(comp))
+ {}
+ ~MultiMerger() {}
+
+public:
+ void Add(MergeItem item);
+ void Clear();
+ const MergeItem& First() const;
+ bool IsEmpty() const;
+ void Remove(BamReader* reader);
+ int Size() const;
+ MergeItem TakeFirst();
+
+private:
+ typedef MergeItem ValueType;
+ typedef std::multiset<ValueType, MergeType> ContainerType;
+ typedef typename ContainerType::iterator DataIterator;
+ typedef typename ContainerType::const_iterator DataConstIterator;
+ ContainerType m_data;
+};
+
+template <typename Compare>
+inline void MultiMerger<Compare>::Add(MergeItem item)
+{
+
+ // N.B. - any future custom Compare types must define this method
+ // see algorithms/Sort.h
+
+ if (CompareType::UsesCharData()) item.Alignment->BuildCharData();
+ m_data.insert(item);
+}
+
+template <typename Compare>
+inline void MultiMerger<Compare>::Clear()
+{
+ m_data.clear();
+}
+
+template <typename Compare>
+inline const MergeItem& MultiMerger<Compare>::First() const
+{
+ const ValueType& entry = (*m_data.begin());
+ return entry;
+}
+
+template <typename Compare>
+inline bool MultiMerger<Compare>::IsEmpty() const
+{
+ return m_data.empty();
+}
+// removes the first cached entry whose reader has the same filename as @reader
+template <typename Compare>
+inline void MultiMerger<Compare>::Remove(BamReader* reader)
+{
+    // nothing to do without a reader
+    if (reader == 0) return;
+    const std::string& targetFilename = reader->GetFilename();
+
+    // walk the cache looking for an entry that belongs to this file
+    DataIterator current = m_data.begin();
+    DataIterator last = m_data.end();
+    while (current != last) {
+        const BamReader* candidate = current->Reader;
+
+        // entries without a reader can never match; erase the first match & stop
+        if (candidate != 0 && candidate->GetFilename() == targetFilename) {
+            m_data.erase(current);
+            return;
+        }
+        ++current;
+    }
+}
+template <typename Compare>
+inline int MultiMerger<Compare>::Size() const
+{
+ return m_data.size();
+}
+
+// removes & returns the first entry in sort order (container must not be empty)
+template <typename Compare>
+inline MergeItem MultiMerger<Compare>::TakeFirst()
+{
+    // copy the front entry out before erasing it from the container
+    DataIterator front = m_data.begin();
+    MergeItem taken(*front);
+    m_data.erase(front);
+    return taken;
+}
+
+// unsorted "merger"
+template <>
+class MultiMerger<Algorithms::Sort::Unsorted> : public IMultiMerger
+{
+
+public:
+ explicit MultiMerger(const Algorithms::Sort::Unsorted& comp = Algorithms::Sort::Unsorted())
+ : IMultiMerger()
+ {}
+ ~MultiMerger() {}
+
+public:
+ void Add(MergeItem item);
+ void Clear();
+ const MergeItem& First() const;
+ bool IsEmpty() const;
+ void Remove(BamReader* reader);
+ int Size() const;
+ MergeItem TakeFirst();
+
+private:
+ typedef MergeItem ValueType;
+ typedef std::deque<ValueType> ContainerType;
+ typedef ContainerType::iterator DataIterator;
+ typedef ContainerType::const_iterator DataConstIterator;
+ ContainerType m_data;
+};
+
+inline void MultiMerger<Algorithms::Sort::Unsorted>::Add(MergeItem item)
+{
+ m_data.push_back(item);
+}
+
+inline void MultiMerger<Algorithms::Sort::Unsorted>::Clear()
+{
+ m_data.clear();
+}
+
+inline const MergeItem& MultiMerger<Algorithms::Sort::Unsorted>::First() const
+{
+ return m_data.front();
+}
+
+inline bool MultiMerger<Algorithms::Sort::Unsorted>::IsEmpty() const
+{
+ return m_data.empty();
+}
+
+// removes the first queued entry whose reader has the same filename as @reader
+inline void MultiMerger<Algorithms::Sort::Unsorted>::Remove(BamReader* reader)
+{
+    // nothing to do without a reader
+    if (reader == 0) return;
+    const std::string targetFilename = reader->GetFilename();
+
+    // walk the queue looking for an entry that belongs to this file
+    DataIterator current = m_data.begin();
+    DataIterator last = m_data.end();
+    while (current != last) {
+        const BamReader* candidate = current->Reader;
+
+        // entries without a reader can never match; erase the first match & stop
+        if (candidate != 0 && candidate->GetFilename() == targetFilename) {
+            m_data.erase(current);
+            return;
+        }
+        ++current;
+    }
+}
+
+inline int MultiMerger<Algorithms::Sort::Unsorted>::Size() const
+{
+ return m_data.size();
+}
+
+inline MergeItem MultiMerger<Algorithms::Sort::Unsorted>::TakeFirst()
+{
+ MergeItem firstItem = m_data.front();
+ m_data.pop_front();
+ return firstItem;
+}
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMMULTIMERGER_P_H
diff --git a/src/api/internal/bam/BamMultiReader_p.cpp b/src/api/internal/bam/BamMultiReader_p.cpp
new file mode 100644
index 0000000..a99fac1
--- /dev/null
+++ b/src/api/internal/bam/BamMultiReader_p.cpp
@@ -0,0 +1,905 @@
+// ***************************************************************************
+// BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 24 July 2013 (DB)
+// ---------------------------------------------------------------------------
+// Functionality for simultaneously reading multiple BAM files
+// *************************************************************************
+
+#include "api/internal/bam/BamMultiReader_p.h"
+#include "api/BamAlignment.h"
+#include "api/BamMultiReader.h"
+#include "api/SamConstants.h"
+#include "api/algorithms/Sort.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <algorithm>
+#include <cstddef>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <sstream>
+
+// ctor
+BamMultiReaderPrivate::BamMultiReaderPrivate()
+ : m_alignmentCache(0)
+ , m_hasUserMergeOrder(false)
+ , m_mergeOrder(BamMultiReader::RoundRobinMerge)
+{}
+
+// dtor
+BamMultiReaderPrivate::~BamMultiReaderPrivate()
+{
+ Close();
+}
+
+// close all BAM files
+bool BamMultiReaderPrivate::Close()
+{
+    m_errorString.clear();
+
+    // every open file closed without trouble
+    if (CloseFiles(Filenames())) return true;
+
+    // otherwise wrap whatever the individual readers reported
+    const std::string closeErrors = m_errorString;
+    const std::string message =
+        std::string("error encountered while closing all files: \n\t") + closeErrors;
+    SetErrorString("BamMultiReader::Close", message);
+    return false;
+}
+
+// close requested BAM file
+bool BamMultiReaderPrivate::CloseFile(const std::string& filename)
+{
+    m_errorString.clear();
+
+    // reuse the multi-file path with a one-element list
+    const std::vector<std::string> singleFile(1, filename);
+    if (CloseFiles(singleFile)) return true;
+
+    // on failure, prefix the collected reader errors with the offending filename
+    const std::string closeErrors = m_errorString;
+    const std::string message =
+        std::string("error while closing file: ") + filename + '\n' + closeErrors;
+    SetErrorString("BamMultiReader::CloseFile", message);
+    return false;
+}
+
+// close requested BAM files
+// returns false if any matching reader reported an error while closing
+// (the readers' error strings are accumulated in m_errorString, one per line)
+bool BamMultiReaderPrivate::CloseFiles(const std::vector<std::string>& filenames)
+{
+
+    bool errorsEncountered = false;
+    m_errorString.clear();
+
+    // iterate over filenames
+    std::vector<std::string>::const_iterator filesIter = filenames.begin();
+    std::vector<std::string>::const_iterator filesEnd = filenames.end();
+    for (; filesIter != filesEnd; ++filesIter) {
+        const std::string& filename = (*filesIter);
+        if (filename.empty()) continue;
+
+        // iterate over readers
+        std::vector<MergeItem>::iterator readerIter = m_readers.begin();
+        std::vector<MergeItem>::iterator readerEnd = m_readers.end();
+        for (; readerIter != readerEnd; ++readerIter) {
+            MergeItem& item = (*readerIter);
+            BamReader* reader = item.Reader;
+            if (reader == 0) continue;
+
+            // if reader matches requested filename
+            if (reader->GetFilename() == filename) {
+
+                // remove reader's entry from alignment cache
+                // N.B. - the cache pointer starts out null (see ctor) and is reset to
+                //        null below, so never dereference it without checking first
+                if (m_alignmentCache) m_alignmentCache->Remove(reader);
+
+                // clean up reader & its alignment
+                if (!reader->Close()) {
+                    m_errorString.append(1, '\t');
+                    m_errorString.append(reader->GetErrorString());
+                    m_errorString.append(1, '\n');
+                    errorsEncountered = true;
+                }
+                delete reader;
+
+                // delete reader's alignment entry
+                delete item.Alignment;
+
+                // remove reader from reader list
+                m_readers.erase(readerIter);
+
+                // on match, just go on to next filename
+                // (no need to keep looking and item iterator is invalid now anyway)
+                break;
+            }
+        }
+    }
+
+    // make sure we clean up properly if all readers were closed
+    if (m_readers.empty()) {
+
+        // clean up merger
+        if (m_alignmentCache) {
+            m_alignmentCache->Clear();
+            delete m_alignmentCache;
+            m_alignmentCache = 0;
+        }
+
+        // reset merge flags
+        m_hasUserMergeOrder = false;
+        m_mergeOrder = BamMultiReader::RoundRobinMerge;
+    }
+
+    // return whether all readers closed OK
+    return !errorsEncountered;
+}
+
+// creates index files for BAM files that don't have them
+bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type)
+{
+    m_errorString.clear();
+    bool anyFailed = false;
+
+    // visit every reader, indexing only those that lack an index
+    std::vector<MergeItem>::iterator it = m_readers.begin();
+    const std::vector<MergeItem>::iterator last = m_readers.end();
+    for (; it != last; ++it) {
+        BamReader* reader = it->Reader;
+        if (reader == 0) continue;
+        if (reader->HasIndex()) continue;
+        if (reader->CreateIndex(type)) continue;
+
+        // index creation failed: record this reader's error (tab-indented, one per line)
+        m_errorString.append(1, '\t');
+        m_errorString.append(reader->GetErrorString());
+        m_errorString.append(1, '\n');
+        anyFailed = true;
+    }
+
+    // all missing indexes were created
+    if (!anyFailed) return true;
+
+    // otherwise report the collected reader errors under a common heading
+    const std::string indexErrors = m_errorString;
+    const std::string message = std::string("error while creating index files: \n") + indexErrors;
+    SetErrorString("BamMultiReader::CreateIndexes", message);
+    return false;
+}
+
+// creates the merger matching the current merge order
+// (caller receives a newly allocated object, or 0 if the merge order is not recognized)
+IMultiMerger* BamMultiReaderPrivate::CreateAlignmentCache()
+{
+    // without an explicit user choice, derive the merge order from the SAM header
+    if (!m_hasUserMergeOrder) {
+        SamHeader header = GetHeader();
+
+        if (header.SortOrder == Constants::SAM_HD_SORTORDER_COORDINATE) {
+            // BAM files are sorted by position
+            m_mergeOrder = BamMultiReader::MergeByCoordinate;
+        } else if (header.SortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME) {
+            // BAM files are sorted by read name
+            m_mergeOrder = BamMultiReader::MergeByName;
+        } else {
+            // sorting is either "unknown" or marked as "unsorted"
+            m_mergeOrder = BamMultiReader::RoundRobinMerge;
+        }
+    }
+
+    // merge BAM files by position
+    if (m_mergeOrder == BamMultiReader::MergeByCoordinate)
+        return new MultiMerger<Algorithms::Sort::ByPosition>();
+
+    // merge BAM files by read name
+    if (m_mergeOrder == BamMultiReader::MergeByName)
+        return new MultiMerger<Algorithms::Sort::ByName>();
+
+    // sorting is "unknown", "unsorted" or "ignored"... so use unsorted merger
+    if (m_mergeOrder == BamMultiReader::RoundRobinMerge)
+        return new MultiMerger<Algorithms::Sort::Unsorted>();
+
+    // unknown merge order, can't create merger
+    return 0;
+}
+
+// Returns the filenames reported by the readers held in m_readers, in order.
+// Null reader entries and readers reporting an empty filename are skipped.
+const std::vector<std::string> BamMultiReaderPrivate::Filenames() const
+{
+    const std::size_t readerCount = m_readers.size();
+
+    std::vector<std::string> result;
+    result.reserve(readerCount);
+
+    for (std::size_t i = 0; i < readerCount; ++i) {
+        const BamReader* currentReader = m_readers[i].Reader;
+        if (currentReader == 0) continue;
+
+        // only non-empty filenames are reported
+        const std::string& name = currentReader->GetFilename();
+        if (name.empty()) continue;
+        result.push_back(name);
+    }
+
+    return result;
+}
+
+std::string BamMultiReaderPrivate::GetErrorString() const
+{
+ return m_errorString;
+}
+
+// Returns the unified header as a SamHeader object, constructed from the text
+// produced by GetHeaderText().
+SamHeader BamMultiReaderPrivate::GetHeader() const
+{
+    return SamHeader(GetHeaderText());
+}
+
+// Builds a virtual, unified header text for all BAM files in the multireader.
+//
+// N.B. - right now this copies the complete header of the first BAM file and
+//        then appends only the read groups (RG) of the remaining files.
+// TODO: make this more intelligent wrt other header lines/fields
+//
+// Returns an empty string if no readers are held or the first entry is null.
+std::string BamMultiReaderPrivate::GetHeaderText() const
+{
+    // nothing to report without readers
+    if (m_readers.empty()) return std::string();
+
+    // the first reader's header is the starting point of the merged header
+    const BamReader* firstReader = m_readers.front().Reader;
+    if (firstReader == 0) return std::string();
+    SamHeader unifiedHeader = firstReader->GetHeader();
+
+    // fold in the read groups of every subsequent reader (skipping the first)
+    std::vector<MergeItem>::const_iterator itemIter = m_readers.begin();
+    const std::vector<MergeItem>::const_iterator itemEnd = m_readers.end();
+    for (++itemIter; itemIter != itemEnd; ++itemIter) {
+        const BamReader* otherReader = itemIter->Reader;
+        if (otherReader == 0) continue;
+
+        // N.B. - SamReadGroupDictionary handles duplicate-checking
+        const SamHeader otherHeader = otherReader->GetHeader();
+        unifiedHeader.ReadGroups.Add(otherHeader.ReadGroups);
+
+        // TODO: merge anything else??
+    }
+
+    // return stringified header
+    return unifiedHeader.ToString();
+}
+
+// Returns the merge order currently stored in m_mergeOrder.
+BamMultiReader::MergeOrder BamMultiReaderPrivate::GetMergeOrder() const
+{
+    const BamMultiReader::MergeOrder activeOrder = m_mergeOrder;
+    return activeOrder;
+}
+
+// Retrieves the next alignment among all files, with character data built.
+// Returns false when no alignment could be popped from the cache.
+bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al)
+{
+    const bool buildCharData = true;
+    return PopNextCachedAlignment(al, buildCharData);
+}
+
+// Retrieves the next alignment among all files without parsing the
+// alignment's character data. Returns false when no alignment could be popped
+// from the cache.
+bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al)
+{
+    const bool buildCharData = false;
+    return PopNextCachedAlignment(al, buildCharData);
+}
+
+// ---------------------------------------------------------------------------------------
+//
+// NB: The following GetReferenceX() functions assume that we have identical
+// references for all BAM files. We enforce this by invoking the
+// ValidateReaders() method to verify that our reference data is the same
+// across all files on Open - so we will not encounter a situation in which
+// there is a mismatch and we are still live.
+//
+// ---------------------------------------------------------------------------------------
+
+// Returns the number of reference sequences, as reported by the first reader.
+// Yields 0 for an empty multireader or a null first reader entry.
+int BamMultiReaderPrivate::GetReferenceCount() const
+{
+    if (m_readers.empty()) return 0;
+
+    const BamReader* firstReader = m_readers.front().Reader;
+    if (firstReader != 0) return firstReader->GetReferenceCount();
+
+    return 0;
+}
+
+// Returns the reference entries reported by the first reader.
+// Yields an empty RefVector for an empty multireader or a null first reader.
+const RefVector BamMultiReaderPrivate::GetReferenceData() const
+{
+    if (m_readers.empty()) return RefVector();
+
+    const BamReader* firstReader = m_readers.front().Reader;
+    if (firstReader != 0) return firstReader->GetReferenceData();
+
+    return RefVector();
+}
+
+// Looks up the reference ID for refName using the first reader.
+// Yields -1 for an empty multireader or a null first reader entry.
+int BamMultiReaderPrivate::GetReferenceID(const std::string& refName) const
+{
+    if (m_readers.empty()) return -1;
+
+    const BamReader* firstReader = m_readers.front().Reader;
+    if (firstReader != 0) return firstReader->GetReferenceID(refName);
+
+    return -1;
+}
+// ---------------------------------------------------------------------------------------
+
+// Returns true only if every (non-null) reader reports index data.
+// This is useful to indicate whether Jump() or SetRegion() are possible.
+// An empty multireader yields false.
+bool BamMultiReaderPrivate::HasIndexes() const
+{
+    // handle empty multireader
+    if (m_readers.empty()) return false;
+
+    bool everyReaderIndexed = true;
+
+    const std::size_t readerCount = m_readers.size();
+    for (std::size_t i = 0; i < readerCount; ++i) {
+        const BamReader* currentReader = m_readers[i].Reader;
+        if (currentReader == 0) continue;
+
+        // every reader is queried; there is deliberately no early exit
+        if (!currentReader->HasIndex()) everyReaderIndexed = false;
+    }
+
+    return everyReaderIndexed;
+}
+
+// Returns true as soon as any (non-null) reader reports that it is open;
+// false if none does, including when no readers are held.
+bool BamMultiReaderPrivate::HasOpenReaders()
+{
+    const std::size_t readerCount = m_readers.size();
+    for (std::size_t i = 0; i < readerCount; ++i) {
+        const BamReader* currentReader = m_readers[i].Reader;
+        if (currentReader != 0 && currentReader->IsOpen()) return true;
+    }
+
+    // no readers open
+    return false;
+}
+
+// Performs a random-access jump on every reader, using (refID, position) as a
+// left bound, then refills the alignment cache.
+//
+// NB: The result of each reader's Jump() is deliberately ignored. While it may
+//     make sense to track which readers jumped successfully, in practice a
+//     failed Jump means "no alignments here", so we simply accept the failure
+//     and refresh the cache. The return value is that of UpdateAlignmentCache().
+bool BamMultiReaderPrivate::Jump(int refID, int position)
+{
+    const std::size_t readerCount = m_readers.size();
+    for (std::size_t i = 0; i < readerCount; ++i) {
+        BamReader* currentReader = m_readers[i].Reader;
+        if (currentReader != 0) currentReader->Jump(refID, position);
+    }
+
+    return UpdateAlignmentCache();
+}
+
+// Locates (& loads) index files for BAM readers that don't already have one.
+//
+// preferredType is passed through to each reader's LocateIndex().
+// Readers that already have an index, and null reader entries, are skipped.
+// Returns true if no reader failed; otherwise the error string lists one line
+// per failing reader (tab-indented) and false is returned.
+bool BamMultiReaderPrivate::LocateIndexes(const BamIndex::IndexType& preferredType)
+{
+
+    bool errorsEncountered = false;
+    m_errorString.clear();
+
+    // iterate over readers
+    std::vector<MergeItem>::iterator readerIter = m_readers.begin();
+    std::vector<MergeItem>::iterator readerEnd = m_readers.end();
+    for (; readerIter != readerEnd; ++readerIter) {
+        MergeItem& item = (*readerIter);
+        BamReader* reader = item.Reader;
+        if (reader == 0) continue;
+
+        // if reader has no index, try to locate one
+        if (!reader->HasIndex()) {
+            if (!reader->LocateIndex(preferredType)) {
+                // accumulate per-reader errors; keep going so all failures are reported
+                m_errorString.append(1, '\t');
+                m_errorString.append(reader->GetErrorString());
+                m_errorString.append(1, '\n');
+                errorsEncountered = true;
+            }
+        }
+    }
+
+    // check for errors encountered before returning success/fail
+    if (errorsEncountered) {
+        // copy first: SetErrorString() overwrites m_errorString
+        const std::string currentError = m_errorString;
+        const std::string message =
+            std::string("error while locating index files: \n") + currentError;
+        // label now matches this method's actual name
+        SetErrorString("BamMultiReader::LocateIndexes", message);
+        return false;
+    } else
+        return true;
+}
+
+// Opens the given BAM files and adds a reader for each one to m_readers.
+//
+// Steps: rewind any readers already held, open each non-empty filename,
+// validate that all readers are mutually consistent, then refill the
+// alignment cache. Returns false (with the error string set) if rewinding,
+// opening any file, or validation fails; otherwise returns the result of
+// UpdateAlignmentCache().
+bool BamMultiReaderPrivate::Open(const std::vector<std::string>& filenames)
+{
+    m_errorString.clear();
+
+    // put all current readers back at beginning (refreshes alignment cache)
+    const bool rewound = Rewind();
+    if (!rewound) {
+        const std::string rewindError = m_errorString;
+        SetErrorString("BamMultiReader::Open",
+                       std::string("unable to rewind existing readers: \n\t") + rewindError);
+        return false;
+    }
+
+    // try to open a reader for every non-empty filename
+    bool anyOpenFailed = false;
+    const std::size_t filenameCount = filenames.size();
+    for (std::size_t i = 0; i < filenameCount; ++i) {
+        const std::string& currentFilename = filenames[i];
+        if (currentFilename.empty()) continue;
+
+        BamReader* newReader = new BamReader;
+        if (newReader->Open(currentFilename)) {
+            // opened OK: store reader along with its own alignment buffer
+            m_readers.push_back(MergeItem(newReader, new BamAlignment));
+            continue;
+        }
+
+        // open failed: record the error & clean up the invalid reader
+        m_errorString.append(1, '\t');
+        m_errorString += std::string("unable to open file: ") + currentFilename;
+        m_errorString.append(1, '\n');
+        anyOpenFailed = true;
+        delete newReader;
+    }
+
+    // report all files that could not be opened
+    if (anyOpenFailed) {
+        const std::string openErrors = m_errorString;
+        SetErrorString("BamMultiReader::Open",
+                       std::string("unable to open all files: \t\n") + openErrors);
+        return false;
+    }
+
+    // check for BAM file consistency
+    const bool consistent = ValidateReaders();
+    if (!consistent) {
+        const std::string validationError = m_errorString;
+        SetErrorString("BamMultiReader::Open",
+                       std::string("unable to open inconsistent files: \t\n") + validationError);
+        return false;
+    }
+
+    // update alignment cache
+    return UpdateAlignmentCache();
+}
+
+// Opens a single BAM file by delegating to Open() with a one-element list.
+// On failure, the error from Open() is wrapped with the filename and false is
+// returned.
+bool BamMultiReaderPrivate::OpenFile(const std::string& filename)
+{
+    const std::vector<std::string> singleFile(1, filename);
+    if (Open(singleFile)) return true;
+
+    // copy first: SetErrorString() overwrites m_errorString
+    const std::string openError = m_errorString;
+    SetErrorString("BamMultiReader::OpenFile",
+                   std::string("could not open file: ") + filename + "\n\t" + openError);
+    return false;
+}
+
+// Opens one index file per reader, pairing indexFilenames[i] with reader i.
+//
+// TODO: This needs to be cleaner - should not assume same order.
+//       And either way, shouldn't start at first reader. Should start at
+//       first reader without an index?
+//
+// Fails immediately if the number of index filenames differs from the number
+// of readers. Otherwise every pairing is attempted (null readers are skipped)
+// and false is returned if any reader could not open its index file.
+bool BamMultiReaderPrivate::OpenIndexes(const std::vector<std::string>& indexFilenames)
+{
+    // make sure same number of index filenames as readers
+    const std::size_t readerCount = m_readers.size();
+    if (readerCount != indexFilenames.size()) {
+        const std::string message("size of index file list does not match current BAM file count");
+        SetErrorString("BamMultiReader::OpenIndexes", message);
+        return false;
+    }
+
+    m_errorString.clear();
+    bool anyIndexFailed = false;
+
+    // walk readers and index filenames in lockstep (sizes are equal here)
+    for (std::size_t i = 0; i < readerCount; ++i) {
+        BamReader* currentReader = m_readers[i].Reader;
+        if (currentReader == 0) continue;
+
+        if (currentReader->OpenIndex(indexFilenames[i])) continue;
+
+        // collect the reader's error and keep going
+        m_errorString.append(1, '\t');
+        m_errorString += currentReader->GetErrorString();
+        m_errorString.append(1, '\n');
+        anyIndexFailed = true;
+    }
+
+    if (!anyIndexFailed) return true;
+
+    // copy first: SetErrorString() overwrites m_errorString
+    const std::string indexErrors = m_errorString;
+    SetErrorString("BamMultiReader::OpenIndexes",
+                   std::string("could not open all index files: \n\t") + indexErrors);
+    return false;
+}
+
+// Pops the first entry of the alignment cache into 'al' and refills the cache
+// from the reader that entry came from.
+//
+// When needCharData is true, the alignment's character data is built and its
+// Filename is set from the reader before the copy is made.
+// Returns false if there is no cache, the cache is empty, or the popped entry
+// holds a null reader or alignment.
+bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool needCharData)
+{
+    // skip if no alignments available
+    if (m_alignmentCache == 0) return false;
+    if (m_alignmentCache->IsEmpty()) return false;
+
+    // take the next merge item out of the cache
+    const MergeItem nextItem = m_alignmentCache->TakeFirst();
+    BamReader* sourceReader = nextItem.Reader;
+    BamAlignment* cachedAlignment = nextItem.Alignment;
+    if (sourceReader == 0) return false;
+    if (cachedAlignment == 0) return false;
+
+    // set char data if requested
+    if (needCharData) {
+        cachedAlignment->BuildCharData();
+        cachedAlignment->Filename = sourceReader->GetFilename();
+    }
+
+    // hand the alignment to the caller by copy; the buffer is reused below
+    al = *cachedAlignment;
+
+    // load next alignment from the same reader & store it in the cache
+    SaveNextAlignment(sourceReader, cachedAlignment);
+    return true;
+}
+
+// Rewinds all readers and resets the alignment cache.
+// With no readers held this is a no-op that returns true. If rewinding fails,
+// the error is wrapped and false is returned; otherwise the result of
+// UpdateAlignmentCache() is returned.
+bool BamMultiReaderPrivate::Rewind()
+{
+    // skip if no readers open
+    if (m_readers.empty()) return true;
+
+    // attempt to rewind files
+    const bool rewound = RewindReaders();
+    if (rewound) return UpdateAlignmentCache();
+
+    // copy first: SetErrorString() overwrites m_errorString
+    const std::string rewindError = m_errorString;
+    SetErrorString("BamMultiReader::Rewind",
+                   std::string("could not rewind readers: \n\t") + rewindError);
+    return false;
+}
+
+// Calls Rewind() on every (non-null) reader.
+// All readers are attempted even after a failure; each failing reader's error
+// is appended to the error string on its own tab-indented line.
+// Returns true only if every rewind succeeded.
+bool BamMultiReaderPrivate::RewindReaders()
+{
+    m_errorString.clear();
+    bool allRewound = true;
+
+    const std::size_t readerCount = m_readers.size();
+    for (std::size_t i = 0; i < readerCount; ++i) {
+        BamReader* currentReader = m_readers[i].Reader;
+        if (currentReader == 0) continue;
+
+        if (currentReader->Rewind()) continue;
+
+        // record this reader's failure
+        m_errorString.append(1, '\t');
+        m_errorString.append(currentReader->GetErrorString());
+        m_errorString.append(1, '\n');
+        allRewound = false;
+    }
+
+    return allRewound;
+}
+
+// Reads the next alignment from 'reader' into 'alignment' and, if one was
+// read, adds the (reader, alignment) pair to the alignment cache.
+//
+// N.B. - only core data is read here (GetNextAlignmentCore); the alignment's
+//        char data is built lazily - either by the alignment cache when it
+//        needs it to maintain its sorting, or on demand when a client calls
+//        GetNextAlignment().
+void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment)
+{
+    const bool alignmentRead = reader->GetNextAlignmentCore(*alignment);
+    if (!alignmentRead) return;
+
+    m_alignmentCache->Add(MergeItem(reader, alignment));
+}
+
+// Switches to a client-specified merge order and rebuilds the alignment cache.
+//
+// Items pending in the existing cache are drained, the old cache is deleted,
+// a new cache is created for 'order', and the drained items are re-added.
+// Returns false (with the error string set) if no cache can be created for
+// the requested order.
+bool BamMultiReaderPrivate::SetExplicitMergeOrder(BamMultiReader::MergeOrder order)
+{
+    // record the user's choice; CreateAlignmentCache() consults these flags
+    m_hasUserMergeOrder = true;
+    m_mergeOrder = order;
+
+    // drain & destroy any existing merger, keeping the data it was holding
+    std::vector<MergeItem> pendingItems;
+    if (m_alignmentCache != 0) {
+        while (!m_alignmentCache->IsEmpty()) {
+            pendingItems.push_back(m_alignmentCache->TakeFirst());
+        }
+        delete m_alignmentCache;
+        m_alignmentCache = 0;
+    }
+
+    // create new cache using the new merge flags
+    m_alignmentCache = CreateAlignmentCache();
+    if (m_alignmentCache == 0) {
+        SetErrorString("BamMultiReader::SetExplicitMergeOrder", "requested order is unrecognized");
+        return false;
+    }
+
+    // push the previously cached data onto the new cache
+    const std::size_t pendingCount = pendingItems.size();
+    for (std::size_t i = 0; i < pendingCount; ++i) {
+        const MergeItem& pending = pendingItems[i];
+        m_alignmentCache->Add(pending);
+    }
+
+    return true;
+}
+
+void BamMultiReaderPrivate::SetErrorString(const std::string& where, const std::string& what) const
+{
+ static const std::string SEPARATOR(": ");
+ m_errorString = where + SEPARATOR + what;
+}
+
+// Applies 'region' to every reader, then refills the alignment cache.
+//
+// NB: The result of each reader's SetRegion() is deliberately ignored. While
+//     it may make sense to track which readers succeeded, in practice a
+//     failure of SetRegion means "no alignments here", so we simply accept
+//     the failure and refresh the cache. The return value is that of
+//     UpdateAlignmentCache().
+bool BamMultiReaderPrivate::SetRegion(const BamRegion& region)
+{
+    const std::size_t readerCount = m_readers.size();
+    for (std::size_t i = 0; i < readerCount; ++i) {
+        BamReader* currentReader = m_readers[i].Reader;
+        if (currentReader != 0) currentReader->SetRegion(region);
+    }
+
+    return UpdateAlignmentCache();
+}
+
+// Rebuilds the alignment cache from the readers' current positions.
+//
+// The cache is created on first use, cleared, and then seeded with the next
+// alignment of every reader that has both a reader and an alignment buffer.
+// Returns false only if the cache could not be created.
+bool BamMultiReaderPrivate::UpdateAlignmentCache()
+{
+    // create alignment cache if not created yet
+    if (m_alignmentCache == 0) m_alignmentCache = CreateAlignmentCache();
+    if (m_alignmentCache == 0) {
+        SetErrorString("BamMultiReader::UpdateAlignmentCache",
+                       "unable to create new alignment cache");
+        return false;
+    }
+
+    // clear any prior cache data
+    m_alignmentCache->Clear();
+
+    // save next alignment from each reader in cache
+    const std::size_t readerCount = m_readers.size();
+    for (std::size_t i = 0; i < readerCount; ++i) {
+        MergeItem& entry = m_readers[i];
+        if (entry.Reader == 0 || entry.Alignment == 0) continue;
+        SaveNextAlignment(entry.Reader, entry.Alignment);
+    }
+
+    // if we get here, ok
+    return true;
+}
+
+// ValidateReaders checks that all the readers point to BAM files that share
+// the same sort order and represent alignments against the same set of
+// reference sequences, identically ordered (same names & lengths).
+//
+// Returns true if the readers are consistent (trivially so when fewer than two
+// readers are held). On the first mismatch found, a description is stored in
+// the error string and false is returned. Also returns false, without a
+// message, if the first reader entry is null.
+bool BamMultiReaderPrivate::ValidateReaders() const
+{
+
+    m_errorString.clear();
+
+    // skip if 0 or 1 readers opened
+    if (m_readers.empty() || (m_readers.size() == 1)) return true;
+
+    // retrieve first reader
+    const MergeItem& firstItem = m_readers.front();
+    const BamReader* firstReader = firstItem.Reader;
+    if (firstReader == 0) return false;
+
+    // retrieve first reader's header data
+    const SamHeader& firstReaderHeader = firstReader->GetHeader();
+    const std::string& firstReaderSortOrder = firstReaderHeader.SortOrder;
+
+    // retrieve first reader's reference data
+    const RefVector& firstReaderRefData = firstReader->GetReferenceData();
+    const int firstReaderRefCount = firstReader->GetReferenceCount();
+    const int firstReaderRefSize = firstReaderRefData.size();
+
+    // iterate over all readers
+    std::vector<MergeItem>::const_iterator readerIter = m_readers.begin();
+    std::vector<MergeItem>::const_iterator readerEnd = m_readers.end();
+    for (; readerIter != readerEnd; ++readerIter) {
+        const MergeItem& item = (*readerIter);
+        BamReader* reader = item.Reader;
+        if (reader == 0) continue;
+
+        // get current reader's header data
+        const SamHeader& currentReaderHeader = reader->GetHeader();
+        const std::string& currentReaderSortOrder = currentReaderHeader.SortOrder;
+
+        // check compatible sort order
+        if (currentReaderSortOrder != firstReaderSortOrder) {
+            const std::string message =
+                std::string("mismatched sort order in ") + reader->GetFilename() + ", expected " +
+                firstReaderSortOrder + ", but found " + currentReaderSortOrder;
+            SetErrorString("BamMultiReader::ValidateReaders", message);
+            return false;
+        }
+
+        // get current reader's reference data
+        const RefVector currentReaderRefData = reader->GetReferenceData();
+        const int currentReaderRefCount = reader->GetReferenceCount();
+        const int currentReaderRefSize = currentReaderRefData.size();
+
+        // init reference data iterators
+        RefVector::const_iterator firstRefIter = firstReaderRefData.begin();
+        RefVector::const_iterator firstRefEnd = firstReaderRefData.end();
+        RefVector::const_iterator currentRefIter = currentReaderRefData.begin();
+
+        // compare reference counts from BamReader ( & container size, in case of BR error)
+        if ((currentReaderRefCount != firstReaderRefCount) ||
+            (firstReaderRefSize != currentReaderRefSize)) {
+            std::stringstream s;
+            s << "mismatched reference count in " << reader->GetFilename() << ", expected "
+              << firstReaderRefCount << ", but found " << currentReaderRefCount;
+            SetErrorString("BamMultiReader::ValidateReaders", s.str());
+            return false;
+        }
+
+        // this will be ok; we just checked above that we have identically-sized sets of references
+        // here we simply check if they are all, in fact, equal in content
+        while (firstRefIter != firstRefEnd) {
+            const RefData& firstRef = (*firstRefIter);
+            const RefData& currentRef = (*currentRefIter);
+
+            // compare reference name & length
+            if ((firstRef.RefName != currentRef.RefName) ||
+                (firstRef.RefLength != currentRef.RefLength)) {
+                std::stringstream s;
+                s << "mismatched references found in " << reader->GetFilename()
+                  << ", expected: " << '\n';
+
+                // print first reader's reference data
+                // N.B. - must write to the message stream 's' itself; a separate
+                //        local stream here would drop the "expected" list
+                RefVector::const_iterator refIter = firstReaderRefData.begin();
+                RefVector::const_iterator refEnd = firstReaderRefData.end();
+                for (; refIter != refEnd; ++refIter) {
+                    const RefData& entry = (*refIter);
+                    s << entry.RefName << ' ' << entry.RefLength << '\n';
+                }
+
+                s << "but found: " << '\n';
+
+                // print current reader's reference data
+                refIter = currentReaderRefData.begin();
+                refEnd = currentReaderRefData.end();
+                for (; refIter != refEnd; ++refIter) {
+                    const RefData& entry = (*refIter);
+                    s << entry.RefName << ' ' << entry.RefLength << '\n';
+                }
+
+                SetErrorString("BamMultiReader::ValidateReaders", s.str());
+                return false;
+            }
+
+            // update iterators
+            ++firstRefIter;
+            ++currentRefIter;
+        }
+    }
+
+    // if we get here, everything checks out
+    return true;
+}
diff --git a/src/api/internal/bam/BamMultiReader_p.h b/src/api/internal/bam/BamMultiReader_p.h
new file mode 100644
index 0000000..aa661cd
--- /dev/null
+++ b/src/api/internal/bam/BamMultiReader_p.h
@@ -0,0 +1,104 @@
+// ***************************************************************************
+// BamMultiReader_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 14 January 2013 (DB)
+// ---------------------------------------------------------------------------
+// Functionality for simultaneously reading multiple BAM files
+// *************************************************************************
+
+#ifndef BAMMULTIREADER_P_H
+#define BAMMULTIREADER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <string>
+#include <vector>
+#include "api/BamMultiReader.h"
+#include "api/SamHeader.h"
+#include "api/internal/bam/BamMultiMerger_p.h"
+
+namespace BamTools {
+namespace Internal {
+
+// Private implementation behind BamMultiReader: holds one reader per BAM file
+// and hands out their alignments through a single merged stream.
+class BamMultiReaderPrivate
+{
+
+ // typedefs
+public:
+ typedef std::pair<BamReader*, BamAlignment*> ReaderAlignment;
+
+ // constructor / destructor
+public:
+ BamMultiReaderPrivate();
+ ~BamMultiReaderPrivate();
+
+ // public interface
+public:
+ // file operations
+ bool Close();
+ bool CloseFile(const std::string& filename);
+ // filenames of the held readers (empty names are skipped)
+ const std::vector<std::string> Filenames() const;
+ // jumps every reader, then refills the alignment cache
+ bool Jump(int refID, int position = 0);
+ // opens & validates the given files, adding a reader for each
+ bool Open(const std::vector<std::string>& filenames);
+ bool OpenFile(const std::string& filename);
+ // rewinds every reader, then refills the alignment cache
+ bool Rewind();
+ // applies region to every reader, then refills the alignment cache
+ bool SetRegion(const BamRegion& region);
+
+ // access alignment data
+ BamMultiReader::MergeOrder GetMergeOrder() const;
+ // next alignment across all files, with character data built
+ bool GetNextAlignment(BamAlignment& al);
+ // next alignment across all files, without building character data
+ bool GetNextAlignmentCore(BamAlignment& al);
+ bool HasOpenReaders();
+ // overrides the header-derived merge order and rebuilds the cache
+ bool SetExplicitMergeOrder(BamMultiReader::MergeOrder order);
+
+ // access auxiliary data
+ // unified header: first file's header plus read groups of the other files
+ SamHeader GetHeader() const;
+ std::string GetHeaderText() const;
+ // reference queries are answered by the first reader only
+ int GetReferenceCount() const;
+ const BamTools::RefVector GetReferenceData() const;
+ int GetReferenceID(const std::string& refName) const;
+
+ // BAM index operations
+ bool CreateIndexes(const BamIndex::IndexType& type = BamIndex::STANDARD);
+ // true only if every held reader has index data
+ bool HasIndexes() const;
+ bool LocateIndexes(const BamIndex::IndexType& preferredType = BamIndex::STANDARD);
+ // expects exactly one index filename per reader, in reader order
+ bool OpenIndexes(const std::vector<std::string>& indexFilenames);
+
+ // error handling
+ std::string GetErrorString() const;
+
+ // 'internal' methods
+public:
+ bool CloseFiles(const std::vector<std::string>& filenames);
+ // returns a new merger for the current merge order, or 0 if unrecognized
+ IMultiMerger* CreateAlignmentCache();
+ bool PopNextCachedAlignment(BamAlignment& al, const bool needCharData);
+ bool RewindReaders();
+ // reads the next alignment from reader into alignment & caches it on success
+ void SaveNextAlignment(BamReader* reader, BamAlignment* alignment);
+ void SetErrorString(const std::string& where, const std::string& what) const; // const: writes the mutable m_errorString
+ bool UpdateAlignmentCache();
+ // checks sort order & reference data agree across all readers
+ bool ValidateReaders() const;
+
+ // data members
+public:
+ // one entry per opened file: the reader plus its alignment buffer
+ std::vector<MergeItem> m_readers;
+ // merger holding each reader's next alignment; 0 until first created
+ IMultiMerger* m_alignmentCache;
+
+ // true once SetExplicitMergeOrder() has been called
+ bool m_hasUserMergeOrder;
+ BamMultiReader::MergeOrder m_mergeOrder;
+
+ // mutable so that const methods can record errors
+ mutable std::string m_errorString;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMMULTIREADER_P_H
diff --git a/src/api/internal/bam/BamRandomAccessController_p.cpp b/src/api/internal/bam/BamRandomAccessController_p.cpp
new file mode 100644
index 0000000..5b5bc58
--- /dev/null
+++ b/src/api/internal/bam/BamRandomAccessController_p.cpp
@@ -0,0 +1,302 @@
+// ***************************************************************************
+// BamRandomAccessController_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011(DB)
+// ---------------------------------------------------------------------------
+// Manages random access operations in a BAM file
+// **************************************************************************
+
+#include "api/internal/bam/BamRandomAccessController_p.h"
+#include "api/BamIndex.h"
+#include "api/internal/bam/BamReader_p.h"
+#include "api/internal/index/BamIndexFactory_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cassert>
+#include <sstream>
+
+// Starts with no index loaded; the "has alignments in region" flag starts true.
+BamRandomAccessController::BamRandomAccessController()
+    : m_index(0), m_hasAlignmentsInRegion(true)
+{
+}
+
+// Releases the index and resets the region via Close().
+BamRandomAccessController::~BamRandomAccessController()
+{
+    this->Close();
+}
+
+// Moves the region's left bound forward to the first reference that the index
+// reports as having alignments, and records in m_hasAlignmentsInRegion whether
+// any reference in the region has data at all.
+//
+// The scan runs from the region's left reference up to its right reference,
+// or up to (referenceCount - 1) when no right bound is specified.
+// Does nothing if no index is loaded.
+void BamRandomAccessController::AdjustRegion(const int& referenceCount)
+{
+    // skip if no index available
+    if (m_index == 0) return;
+
+    const int firstRefId = m_region.LeftRefID;
+    const int lastRefId =
+        (m_region.isRightBoundSpecified() ? m_region.RightRefID : referenceCount - 1);
+
+    // find the first reference in the region that has alignments
+    m_hasAlignmentsInRegion = false;
+    int refId = firstRefId;
+    for (; refId <= lastRefId; ++refId) {
+        if (m_index->HasAlignments(refId)) {
+            m_hasAlignmentsInRegion = true;
+            break;
+        }
+    }
+
+    // if no data found on any reference in region
+    if (!m_hasAlignmentsInRegion) return;
+
+    // requested left reference has data: leave requested region as-is
+    if (refId == firstRefId) return;
+
+    // otherwise start at the beginning of the first reference that had data
+    m_region.LeftRefID = refId;
+    m_region.LeftPosition = 0;
+}
+
+// Classifies an alignment relative to the current region:
+// { BeforeRegion | OverlapsRegion | AfterRegion }.
+//
+// With no left bound, everything overlaps. Unmapped reads (RefID == -1) are
+// reported as AfterRegion to halt processing.
+BamRandomAccessController::RegionState BamRandomAccessController::AlignmentState(
+    const BamAlignment& alignment) const
+{
+    // if region has no left bound at all
+    if (!m_region.isLeftBoundSpecified()) return OverlapsRegion;
+
+    // handle unmapped reads - return AFTER region to halt processing
+    if (alignment.RefID == -1) return AfterRegion;
+
+    // alignment is on a reference before the left bound reference
+    if (alignment.RefID < m_region.LeftRefID) return BeforeRegion;
+
+    // alignment is on the left bound reference
+    if (alignment.RefID == m_region.LeftRefID) {
+
+        // alignment starts before the left bound position:
+        // it still counts if it extends past that position
+        if (alignment.Position < m_region.LeftPosition) {
+            if (alignment.GetEndPosition() > m_region.LeftPosition) return OverlapsRegion;
+            return BeforeRegion;
+        }
+
+        // alignment starts at or after the left bound position; it is past the
+        // region only if both bounds share this reference and the alignment
+        // starts on or after the right bound position
+        const bool pastRightBound = m_region.isRightBoundSpecified() &&
+                                    m_region.LeftRefID == m_region.RightRefID &&
+                                    alignment.Position >= m_region.RightPosition;
+        if (pastRightBound) return AfterRegion;
+        return OverlapsRegion;
+    }
+
+    // from here on, alignment is on a reference after the left bound reference
+
+    // no right bound given, so everything after the left bound overlaps
+    if (!m_region.isRightBoundSpecified()) return OverlapsRegion;
+
+    // alignment is on any reference between boundaries
+    if (alignment.RefID < m_region.RightRefID) return OverlapsRegion;
+
+    // alignment is on any reference after right boundary
+    if (alignment.RefID > m_region.RightRefID) return AfterRegion;
+
+    // alignment is on right bound reference: overlaps only if it starts
+    // before the right bound position
+    if (alignment.Position < m_region.RightPosition) return OverlapsRegion;
+    return AfterRegion;
+}
+
+// Drops the current index and resets the region state.
+void BamRandomAccessController::Close()
+{
+    this->ClearIndex();
+    this->ClearRegion();
+}
+
+// Deletes the owned index (if any) and resets the pointer to 0.
+void BamRandomAccessController::ClearIndex()
+{
+    // deleting a null pointer is a no-op, so no explicit check is needed
+    delete m_index;
+    m_index = 0;
+}
+
+// Clears the stored region and restores the "has alignments" flag to true.
+void BamRandomAccessController::ClearRegion()
+{
+    m_hasAlignmentsInRegion = true;
+    m_region.clear();
+}
+
+// Builds a new index of the requested type from the reader's BAM file and
+// installs it as this controller's index (any previous index is released by
+// SetIndex()).
+//
+// Returns false, with the error string set, if the reader is not open, if no
+// index object of the requested type could be created, or if building the
+// index fails. 'reader' must not be null (asserted).
+bool BamRandomAccessController::CreateIndex(BamReaderPrivate* reader,
+                                            const BamIndex::IndexType& type)
+{
+    // skip if reader is invalid
+    assert(reader);
+    if (!reader->IsOpen()) {
+        SetErrorString("BamRandomAccessController::CreateIndex",
+                       "cannot create index for unopened reader");
+        return false;
+    }
+
+    // create new index of requested type
+    BamIndex* newIndex = BamIndexFactory::CreateIndexOfType(type, reader);
+    if (newIndex == 0) {
+        std::stringstream s;
+        s << "could not create index of type: " << type;
+        SetErrorString("BamRandomAccessController::CreateIndex", s.str());
+        return false;
+    }
+
+    // attempt to build index from current BamReader file
+    if (!newIndex->Create()) {
+        // fetch the error text before the index object goes away
+        const std::string indexError = newIndex->GetErrorString();
+        const std::string message = "could not create index: \n\t" + indexError;
+        SetErrorString("BamRandomAccessController::CreateIndex", message);
+
+        // the failed index was never handed to SetIndex(), so release it here
+        delete newIndex;
+        return false;
+    }
+
+    // save new index & return success
+    SetIndex(newIndex);
+    return true;
+}
+
+std::string BamRandomAccessController::GetErrorString() const
+{
+ return m_errorString;
+}
+
+// Returns true if an index object is currently held.
+bool BamRandomAccessController::HasIndex() const
+{
+    const bool indexHeld = (m_index != 0);
+    return indexHeld;
+}
+
+// Returns true if the stored region is not null.
+bool BamRandomAccessController::HasRegion() const
+{
+    const bool regionIsNull = m_region.isNull();
+    return !regionIsNull;
+}
+
+// Asks the index whether it has alignments for the given reference ID.
+// Returns false when no index is loaded, rather than dereferencing a null
+// index pointer.
+bool BamRandomAccessController::IndexHasAlignmentsForReference(const int& refId)
+{
+    // without an index there is nothing to consult
+    if (m_index == 0) return false;
+
+    return m_index->HasAlignments(refId);
+}
+
+// Finds an index file for the reader's BAM file (deferring to preferredType
+// if possible) and opens it via OpenIndex().
+//
+// Returns false, with the error string set, if no index filename was found or
+// if opening the index fails. 'reader' must not be null (asserted).
+bool BamRandomAccessController::LocateIndex(BamReaderPrivate* reader,
+                                            const BamIndex::IndexType& preferredType)
+{
+    // look up index filename, deferring to preferredType if possible
+    assert(reader);
+    const std::string& indexFilename =
+        BamIndexFactory::FindIndexFilename(reader->Filename(), preferredType);
+
+    // if no index file found (of any type)
+    if (indexFilename.empty()) {
+        // note the space after the colon, so the filename is not fused onto it
+        const std::string message =
+            std::string("could not find index file for: ") + reader->Filename();
+        SetErrorString("BamRandomAccessController::LocateIndex", message);
+        return false;
+    }
+
+    // otherwise open & use index file that was found
+    return OpenIndex(indexFilename, reader);
+}
+
+// Creates an index object whose type is chosen from indexFilename, loads the
+// index data from that file, and installs it as this controller's index (any
+// previous index is released by SetIndex()).
+//
+// Returns false, with the error string set, if no index object could be
+// created for the filename or if loading the data fails.
+bool BamRandomAccessController::OpenIndex(const std::string& indexFilename,
+                                          BamReaderPrivate* reader)
+{
+
+    // attempt create new index of type based on filename
+    BamIndex* index = BamIndexFactory::CreateIndexFromFilename(indexFilename, reader);
+    if (index == 0) {
+        const std::string message = std::string("could not open index file: ") + indexFilename;
+        SetErrorString("BamRandomAccessController::OpenIndex", message);
+        return false;
+    }
+
+    // attempt to load data from index file
+    if (!index->Load(indexFilename)) {
+        // fetch the error text before the index object goes away
+        const std::string indexError = index->GetErrorString();
+        const std::string message = std::string("could not load index data from file: ") +
+                                    indexFilename + "\n\t" + indexError;
+        SetErrorString("BamRandomAccessController::OpenIndex", message);
+
+        // the failed index was never handed to SetIndex(), so release it here
+        delete index;
+        return false;
+    }
+
+    // save new index & return success
+    SetIndex(index);
+    return true;
+}
+
+// Returns the current value of the "has alignments in region" flag.
+bool BamRandomAccessController::RegionHasAlignments() const
+{
+    const bool regionHasData = m_hasAlignmentsInRegion;
+    return regionHasData;
+}
+
+void BamRandomAccessController::SetErrorString(const std::string& where, const std::string& what)
+{
+ m_errorString = where + ": " + what;
+}
+
+// Replaces the held index with 'index'; the previously held index (if any) is
+// deleted first.
+void BamRandomAccessController::SetIndex(BamIndex* index)
+{
+    // ClearIndex() is a no-op when no index is held
+    ClearIndex();
+    m_index = index;
+}
+
+// Stores 'region' and positions the index at its start.
+//
+// referenceCount is used by AdjustRegion() as the scan limit when the region
+// has no right bound.
+// Returns false, with the error string set, if no index is loaded or if the
+// index fails to jump. Returns true both on a successful jump and when the
+// region contains no alignments at all (see notes below). Note that the
+// region is stored even when the call fails for lack of an index.
+bool BamRandomAccessController::SetRegion(const BamRegion& region, const int& referenceCount)
+{
+
+    // store region
+    m_region = region;
+
+    // cannot jump when no index is available
+    if (!HasIndex()) {
+        SetErrorString("BamRandomAccessController::SetRegion",
+                       "cannot jump if no index data available");
+        return false;
+    }
+
+    // adjust region as necessary to reflect where data actually begins
+    AdjustRegion(referenceCount);
+
+    // if no data present, return true
+    //   * Not an error, but future attempts to access alignments in this region will not return data
+    //     Returning true is useful in a BamMultiReader setting where some BAM files may
+    //     lack alignments in regions where other files still have data available.
+    if (!m_hasAlignmentsInRegion) return true;
+
+    // return success/failure of jump to specified region,
+    //
+    //  * Index::Jump() is allowed to modify the m_hasAlignmentsInRegion flag
+    //    This covers 'corner case' where a region is requested that lies beyond the last
+    //    alignment on a reference. If this occurs, any subsequent calls to GetNextAlignment[Core]
+    //    will not return data. BamMultiReader will still be able to successfully pull alignments
+    //    from a region from other files even if this one has no data.
+    if (!m_index->Jump(m_region, &m_hasAlignmentsInRegion)) {
+        const std::string indexError = m_index->GetErrorString();
+        const std::string message = std::string("could not set region\n\t") + indexError;
+        // label names this method (not OpenIndex) so the failure can be traced
+        SetErrorString("BamRandomAccessController::SetRegion", message);
+        return false;
+    } else
+        return true;
+}
diff --git a/src/api/internal/bam/BamRandomAccessController_p.h b/src/api/internal/bam/BamRandomAccessController_p.h
new file mode 100644
index 0000000..e569581
--- /dev/null
+++ b/src/api/internal/bam/BamRandomAccessController_p.h
@@ -0,0 +1,96 @@
+// ***************************************************************************
+// BamRandomAccessController_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Manages random access operations in a BAM file
+// ***************************************************************************
+
+#ifndef BAMRACONTROLLER_P_H
+#define BAMRACONTROLLER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/BamAux.h"
+#include "api/BamIndex.h"
+
+namespace BamTools {
+
+class BamAlignment;
+
+namespace Internal {
+
+class BamReaderPrivate;
+
+class BamRandomAccessController
+{
+ // Manages random access operations in a BAM file: holds the index and the currently set region.
+ // enums
+public:
+ enum RegionState // where an alignment lies relative to the current region (see AlignmentState())
+ {
+ BeforeRegion = 0, // BamReaderPrivate::GetNextAlignmentCore() keeps reading past these
+ OverlapsRegion, // the alignment GetNextAlignmentCore() hands back to its caller
+ AfterRegion // GetNextAlignmentCore() stops reading once it sees this state
+ };
+
+ // ctor & dtor
+public:
+ BamRandomAccessController();
+ ~BamRandomAccessController();
+
+ // BamRandomAccessController interface
+public:
+ // index methods
+ void ClearIndex();
+ bool CreateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& type);
+ bool HasIndex() const;
+ bool IndexHasAlignmentsForReference(const int& refId);
+ bool LocateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& preferredType);
+ bool OpenIndex(const std::string& indexFilename, BamReaderPrivate* reader);
+ void SetIndex(BamIndex* index); // replaces any current index; the controller keeps this pointer
+
+ // region methods
+ void ClearRegion();
+ bool HasRegion() const;
+ RegionState AlignmentState(const BamAlignment& alignment) const;
+ bool RegionHasAlignments() const;
+ bool SetRegion(const BamRegion& region, const int& referenceCount); // false: no index, or jump failed
+
+ // general methods
+ void Close();
+ std::string GetErrorString() const;
+
+ // internal methods
+private:
+ // adjusts requested region if necessary (depending on where data actually begins)
+ void AdjustRegion(const int& referenceCount);
+ // error-string handling: stores "<where>: <what>" in m_errorString
+ void SetErrorString(const std::string& where, const std::string& what);
+
+ // data members
+private:
+ // index data
+ BamIndex* m_index; // owns the index, not a copy - responsible for deleting
+ // NOTE(review): owning raw pointer but no copy ctor/assignment declared - confirm this class is never copied
+ // region data
+ BamRegion m_region;
+ bool m_hasAlignmentsInRegion; // consulted by SetRegion(); Index::Jump() is allowed to modify it
+
+ // general data
+ std::string m_errorString;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMRACONTROLLER_P_H
diff --git a/src/api/internal/bam/BamReader_p.cpp b/src/api/internal/bam/BamReader_p.cpp
new file mode 100644
index 0000000..76faa63
--- /dev/null
+++ b/src/api/internal/bam/BamReader_p.cpp
@@ -0,0 +1,591 @@
+// ***************************************************************************
+// BamReader_p.cpp (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 18 November 2012 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#include "api/internal/bam/BamReader_p.h"
+#include "api/BamConstants.h"
+#include "api/BamReader.h"
+#include "api/IBamIODevice.h"
+#include "api/internal/bam/BamHeader_p.h"
+#include "api/internal/bam/BamRandomAccessController_p.h"
+#include "api/internal/index/BamStandardIndex_p.h"
+#include "api/internal/index/BamToolsIndex_p.h"
+#include "api/internal/io/BamDeviceFactory_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <iostream>
+#include <iterator>
+#include <vector>
+// constructor - remembers the owning BamReader and caches the host byte order
+BamReaderPrivate::BamReaderPrivate(BamReader* parent)
+    : m_alignmentsBeginOffset(0)
+    , m_isBigEndian(BamTools::SystemIsBigEndian())
+    , m_parent(parent)
+{}
+
+
+// destructor - closes the reader; Close()'s return value is not inspected here
+BamReaderPrivate::~BamReaderPrivate()
+{
+ Close();
+}
+
+// closes the BAM file
+// returns false (and records an error string) if the underlying stream fails to close
+bool BamReaderPrivate::Close()
+{
+
+    // clear BAM metadata & filename
+    m_references.clear();
+    m_header.Clear();
+    m_filename.clear();
+
+    // close random access controller
+    m_randomAccessController.Close();
+
+    // nothing left to do if the stream is not open
+    if (!IsOpen()) return true;
+
+    // attempt close, turning a stream exception into an error string
+    try {
+        m_stream.Close();
+    } catch (BamException& e) {
+        const std::string closeError = e.what();
+        const std::string description =
+            std::string("encountered error closing BAM file: \n\t") + closeError;
+        SetErrorString("BamReader::Close", description);
+        return false;
+    }
+
+    // stream closed cleanly
+    return true;
+}
+
+// creates an index file of requested type on current BAM file
+// returns false (and records an error string) if the file is not open or index creation fails
+bool BamReaderPrivate::CreateIndex(const BamIndex::IndexType& type)
+{
+
+    // an index can only be built for an open BAM file
+    if (!IsOpen()) {
+        SetErrorString("BamReader::CreateIndex", "cannot create index on unopened BAM file");
+        return false;
+    }
+
+    // hand the work to the random access controller; done if it succeeds
+    if (m_randomAccessController.CreateIndex(this, type)) return true;
+
+    // creation failed - wrap the controller's error text
+    const std::string controllerError = m_randomAccessController.GetErrorString();
+    const std::string description = std::string("could not create index: \n\t") + controllerError;
+    SetErrorString("BamReader::CreateIndex", description);
+    return false;
+}
+
+// return path & filename of current BAM file (set by Open(), cleared again by Close())
+const std::string BamReaderPrivate::Filename() const
+{
+ return m_filename;
+}
+
+const SamHeader& BamReaderPrivate::GetConstSamHeader() const
+{
+ return m_header.ToConstSamHeader(); // const reference obtained from m_header; no SamHeader copy is made here
+}
+
+std::string BamReaderPrivate::GetErrorString() const
+{
+ return m_errorString; // text last stored by SetErrorString(), returned by value
+}
+
+// return header data as string (produced by m_header.ToString())
+std::string BamReaderPrivate::GetHeaderText() const
+{
+ return m_header.ToString();
+}
+
+// return header data as SamHeader object (by value; see GetConstSamHeader() for a reference)
+SamHeader BamReaderPrivate::GetSamHeader() const
+{
+ return m_header.ToSamHeader();
+}
+
+// get next alignment (with character data fully parsed)
+// returns false when no further alignment is available or its character data cannot be parsed
+// (this method records an error string only in the second case)
+bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment)
+{
+
+    // fetch the core data first (this is also where any region filtering happens)
+    // no valid alignment found
+    if (!GetNextAlignmentCore(alignment)) return false;
+
+    // store alignment's "source" filename
+    alignment.Filename = m_filename;
+
+    // parse the character data; done if that succeeds
+    if (alignment.BuildCharData()) return true;
+
+    // parsing failed - wrap the alignment's own error text
+    const std::string parseError = alignment.GetErrorString();
+    const std::string description =
+        std::string("could not populate alignment data: \n\t") + parseError;
+    SetErrorString("BamReader::GetNextAlignment", description);
+
+    // report failure
+    return false;
+}
+
+// retrieves next available alignment core data (returns success/fail)
+// ** DOES NOT populate any character data fields (read name, bases, qualities, tag data, filename)
+//    these can be accessed, if necessary, from the supportData
+// useful for operations requiring ONLY positional or other alignment-related information
+// returns false when the stream is not open, no (further) alignment overlaps the region, the
+// data ends, or reading throws a BamException (only that last case records an error string)
+bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& alignment)
+{
+
+    // skip if stream not opened
+    if (!m_stream.IsOpen()) return false;
+
+    try {
+
+        // skip if region is set but has no alignments
+        // (HasRegion() is tested first, so RegionHasAlignments() is only consulted with a region)
+        const bool regionIsEmpty = m_randomAccessController.HasRegion() &&
+                                   !m_randomAccessController.RegionHasAlignments();
+        if (regionIsEmpty) return false;
+
+        // read until overlap is found
+        // * the first alignment read and every later one get the same treatment, so a single
+        //   loop covers both; with no region set, AlignmentState() is expected to report
+        //   OverlapsRegion straight away
+        BamRandomAccessController::RegionState state;
+        do {
+
+            // if can't read next alignment
+            // (LoadNextAlignment() does no overlap checking itself)
+            if (!LoadNextAlignment(alignment)) return false;
+
+            // check alignment's region-overlap state
+            state = m_randomAccessController.AlignmentState(alignment);
+
+            // if alignment starts after region, no need to keep reading
+            if (state == BamRandomAccessController::AfterRegion) return false;
+
+        } while (state != BamRandomAccessController::OverlapsRegion);
+
+        // if we get here, we found the next 'valid' alignment
+        // (e.g. overlaps current region if one was set, simply the next alignment if not)
+        alignment.SupportData.HasCoreOnly = true;
+        return true;
+
+    } catch (BamException& e) {
+
+        // a BamException was thrown while reading - record it and report failure
+        const std::string readError = e.what();
+        const std::string description =
+            std::string("encountered error reading BAM alignment: \n\t") + readError;
+        SetErrorString("BamReader::GetNextAlignmentCore", description);
+        return false;
+    }
+}
+
+int BamReaderPrivate::GetReferenceCount() const
+{
+    return static_cast<int>(m_references.size());  // number of entries currently in m_references
+}
+
+const RefVector& BamReaderPrivate::GetReferenceData() const
+{
+ return m_references; // reference to the reader's own list - no copy is made
+}
+
+// returns RefID for given RefName (returns -1 if not found)
+int BamReaderPrivate::GetReferenceID(const std::string& refName) const
+{
+
+    // walk the reference list in order; a reference's ID is its position in m_references
+    // (comparing in place avoids copying every reference name into a temporary vector)
+    int index = 0;
+    RefVector::const_iterator refIter = m_references.begin();
+    RefVector::const_iterator refEnd = m_references.end();
+    for (; refIter != refEnd; ++refIter, ++index) {
+
+        // first exact name match wins
+        if (refIter->RefName == refName) return index;
+    }
+
+    // no reference carries this name
+    return -1;
+}
+
+bool BamReaderPrivate::HasIndex() const
+{
+ return m_randomAccessController.HasIndex(); // answered by the random access controller
+}
+
+bool BamReaderPrivate::IsOpen() const
+{
+ return m_stream.IsOpen(); // "open" here means the underlying BGZF stream is open
+}
+
+// load BAM header data (m_header reads it from m_stream)
+void BamReaderPrivate::LoadHeaderData()
+{
+ m_header.Load(&m_stream);
+}
+
+static inline int bam_aux_type2size(int x)
+{
+    if (x == 'A' || x == 'c' || x == 'C') return 1;
+    if (x == 's' || x == 'S') return 2;
+    return (x == 'i' || x == 'I' || x == 'f') ? 4 : 0;  // 0: not a fixed-size value type
+}
+
+// Scans the aux fields in [aux_start, aux_start + aux_data_len) for the two-character 'tag'. Returns a
+// pointer to the matching field's type byte, or NULL if the tag is absent or a field is truncated.
+static unsigned char* bam_aux_get(int aux_data_len, const unsigned char* aux_start, const char* tag)
+{
+    const unsigned char* p = aux_start;
+    const unsigned char* const end = aux_start + (aux_data_len > 0 ? aux_data_len : 0);
+    while (end - p >= 3) {  // every field starts with a 2-byte tag and a 1-byte type
+        if (p[0] == tag[0] && p[1] == tag[1]) return (unsigned char*)(p + 2);
+        const int type = p[2];
+        p += 3;  // skip tag and type
+        if (type == 'B') {
+            if (end - p < 5) return NULL;  // element type + 32-bit element count
+            const unsigned size = bam_aux_type2size(p[0]);
+            const unsigned len =
+                (unsigned)p[1] | (unsigned)p[2] << 8 | (unsigned)p[3] << 16 | (unsigned)p[4] << 24;
+            p += 5;
+            if (size != 0 && len > (unsigned)(end - p) / size) return NULL;  // array overruns data
+            p += len * size;  // skip array
+        } else if (type == 'Z' || type == 'H') {
+            do { if (p == end) return NULL; } while (*p++ != 0);  // bounded skip past the NUL
+        } else {
+            if (end - p < bam_aux_type2size(type)) return NULL;  // value overruns data
+            p += bam_aux_type2size(type);  // skip value
+        }
+    }
+    return NULL;
+}
+
+static inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls)
+{
+    int level = n_lvls, shift = min_shift, first = ((1 << (3 * n_lvls)) - 1) / 7;
+    for (--end; level > 0; --level, shift += 3, first -= 1 << (3 * level))  // smallest bins first
+        if ((beg >> shift) == (end >> shift)) return first + (beg >> shift);
+    return 0;  // no smaller bin holds the whole interval
+}
+
+// If the record's own CIGAR opens with a soft clip spanning the read and a "CG" tag (type B, element type I)
+// carries the real CIGAR: moves it into place in 'buf' and updates 'a'. Otherwise returns false, unchanged.
+bool BamReaderPrivate::Tag2Cigar(BamAlignment& a, RaiiBuffer& buf)
+{
+    if (a.RefID < 0 || a.Position < 0 || a.SupportData.NumCigarOperations == 0) return false;
+    const unsigned char* data = (const unsigned char*)buf.Buffer;
+    const unsigned data_len = a.SupportData.BlockLength - Constants::BAM_CORE_SIZE;
+    const unsigned name_len = a.SupportData.QueryNameLength;
+    const unsigned seq_len = a.SupportData.QuerySequenceLength;
+    const unsigned fake_bytes = a.SupportData.NumCigarOperations * 4;  // size of the CIGAR being replaced
+    const unsigned seq_offset = name_len + fake_bytes;
+    // name, CIGAR, packed bases and qualities must fit in the record (64-bit sum cannot wrap)
+    if ((uint64_t)seq_offset + ((uint64_t)seq_len + 1) / 2 + seq_len > data_len) return false;
+    const unsigned aux_offset = seq_offset + (seq_len + 1) / 2 + seq_len;
+    // the original CIGAR must open with op code 4 (soft clip) whose length equals the read length
+    const unsigned char* p = data + name_len;
+    const unsigned cigar1 =
+        (unsigned)p[0] | (unsigned)p[1] << 8 | (unsigned)p[2] << 16 | (unsigned)p[3] << 24;
+    if ((cigar1 & 0xf) != 4 || cigar1 >> 4 != seq_len) return false;
+    // find the CG tag; its 6-byte header (type, element type, count) and all ops must lie in the record
+    const unsigned char* CG = bam_aux_get(data_len - aux_offset, data + aux_offset, "CG");
+    if (CG == NULL) return false;
+    const unsigned tag_cigar_offset = CG - data + 6;
+    if (tag_cigar_offset > data_len || CG[0] != 'B' || CG[1] != 'I') return false;
+    const unsigned tag_cigar_len =
+        (unsigned)CG[2] | (unsigned)CG[3] << 8 | (unsigned)CG[4] << 16 | (unsigned)CG[5] << 24;
+    if (tag_cigar_len == 0 || tag_cigar_len > (data_len - tag_cigar_offset) / 4) return false;
+    // recalculate bin, as it may be incorrect if it was calculated by a tool unaware of the real CIGAR in tag
+    unsigned alignment_end = a.Position;
+    p = data + tag_cigar_offset;
+    for (unsigned i = 0; i < tag_cigar_len; ++i, p += 4) {
+        const unsigned op_word =
+            (unsigned)p[0] | (unsigned)p[1] << 8 | (unsigned)p[2] << 16 | (unsigned)p[3] << 24;
+        const unsigned op = op_word & 0xf;
+        if (op == 0 || op == 2 || op == 3 || op == 7 || op == 8) alignment_end += op_word >> 4;
+    }
+    a.Bin = hts_reg2bin(a.Position, alignment_end, 14, 5);
+    // populate new AllCharData: name, real CIGAR, seq/qual/tags before CG, then any tags after CG
+    std::string new_data;
+    new_data.reserve(data_len - 8 - fake_bytes);
+    new_data.append((const char*)data, name_len);
+    new_data.append((const char*)data + tag_cigar_offset, tag_cigar_len * 4);
+    new_data.append((const char*)data + seq_offset, tag_cigar_offset - 8 - seq_offset);
+    const unsigned tag_cigar_end_offset = tag_cigar_offset + tag_cigar_len * 4;
+    new_data.append((const char*)data + tag_cigar_end_offset, data_len - tag_cigar_end_offset);
+    // update member variables; new_data now holds exactly data_len - 8 - fake_bytes bytes
+    a.SupportData.NumCigarOperations = tag_cigar_len;
+    a.SupportData.BlockLength -= 8 + fake_bytes;
+    memcpy(buf.Buffer, new_data.data(), new_data.size());
+    return true;
+}
+
+// populates BamAlignment with alignment data under file pointer, returns success/fail
+bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment)
+{
+    // read in the 'block length' value, make sure it covers at least the fixed-size core data
+    // (a shorter value - including the 0 left by a failed read - cannot describe a record)
+    char buffer[sizeof(uint32_t)];
+    std::fill_n(buffer, sizeof(uint32_t), 0);
+    m_stream.Read(buffer, sizeof(uint32_t));
+    alignment.SupportData.BlockLength = BamTools::UnpackUnsignedInt(buffer);
+    if (m_isBigEndian) BamTools::SwapEndian_32(alignment.SupportData.BlockLength);
+    if (alignment.SupportData.BlockLength < Constants::BAM_CORE_SIZE) return false;
+
+    // read in core alignment data, make sure the right size of data was read
+    char x[Constants::BAM_CORE_SIZE];
+    if (m_stream.Read(x, Constants::BAM_CORE_SIZE) != Constants::BAM_CORE_SIZE) return false;
+
+    // swap core endian-ness if necessary
+    if (m_isBigEndian) {
+        for (unsigned int i = 0; i < Constants::BAM_CORE_SIZE; i += sizeof(uint32_t))
+            BamTools::SwapEndian_32p(&x[i]);
+    }
+
+    // set BamAlignment 'core' and 'support' data
+    alignment.RefID = BamTools::UnpackSignedInt(&x[0]);
+    alignment.Position = BamTools::UnpackSignedInt(&x[4]);
+
+    unsigned int tempValue = BamTools::UnpackUnsignedInt(&x[8]);
+    alignment.Bin = tempValue >> 16;
+    alignment.MapQuality = tempValue >> 8 & 0xff;
+    alignment.SupportData.QueryNameLength = tempValue & 0xff;
+
+    tempValue = BamTools::UnpackUnsignedInt(&x[12]);
+    alignment.AlignmentFlag = tempValue >> 16;
+    alignment.SupportData.NumCigarOperations = tempValue & 0xffff;
+
+    alignment.SupportData.QuerySequenceLength = BamTools::UnpackUnsignedInt(&x[16]);
+    alignment.MateRefID = BamTools::UnpackSignedInt(&x[20]);
+    alignment.MatePosition = BamTools::UnpackSignedInt(&x[24]);
+    alignment.InsertSize = BamTools::UnpackSignedInt(&x[28]);
+
+    // set BamAlignment length
+    alignment.Length = alignment.SupportData.QuerySequenceLength;
+
+    // read in character data - make sure proper data size was read
+    unsigned int dataLength = alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE;
+    RaiiBuffer allCharData(dataLength);
+    if (m_stream.Read(allCharData.Buffer, dataLength) != dataLength) return false;
+
+    // if the real CIGAR lives in a CG tag, Tag2Cigar() moves it into place and the data shrinks
+    const int OldNumCigarOperations = alignment.SupportData.NumCigarOperations;
+    if (Tag2Cigar(alignment, allCharData)) dataLength -= 8 + OldNumCigarOperations * 4;
+
+    // the CIGAR operations must lie inside the character data before they are decoded
+    const unsigned int cigarDataOffset = alignment.SupportData.QueryNameLength;
+    const unsigned int numCigarOps = alignment.SupportData.NumCigarOperations;
+    if (cigarDataOffset > dataLength || numCigarOps > (dataLength - cigarDataOffset) / 4)
+        return false;
+
+    // store 'allCharData' in supportData structure
+    alignment.SupportData.AllCharData.assign((const char*)allCharData.Buffer, dataLength);
+
+    // save CIGAR ops
+    // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly,
+    // even when GetNextAlignmentCore() is called
+    const char* cigarData = (const char*)allCharData.Buffer + cigarDataOffset;
+    CigarOp op;
+    alignment.CigarData.clear();
+    alignment.CigarData.reserve(numCigarOps);
+    for (unsigned int i = 0; i < numCigarOps; ++i) {
+        // copy the packed op out byte-wise (it need not be 4-byte aligned), swapping if necessary
+        uint32_t packedOp;
+        memcpy(&packedOp, cigarData + i * sizeof(uint32_t), sizeof(uint32_t));
+        if (m_isBigEndian) BamTools::SwapEndian_32(packedOp);
+
+        // build CigarOp structure
+        op.Length = (packedOp >> Constants::BAM_CIGAR_SHIFT);
+        op.Type = Constants::BAM_CIGAR_LOOKUP[(packedOp & Constants::BAM_CIGAR_MASK)];
+
+        // save CigarOp
+        alignment.CigarData.push_back(op);
+    }
+
+    return true;
+}
+
+// loads reference data from BAM file
+bool BamReaderPrivate::LoadReferenceData()
+{
+    // get number of reference sequences
+    char buffer[sizeof(uint32_t)];
+    m_stream.Read(buffer, sizeof(uint32_t));
+    uint32_t numberRefSeqs = BamTools::UnpackUnsignedInt(buffer);
+    if (m_isBigEndian) BamTools::SwapEndian_32(numberRefSeqs);
+    m_references.reserve((int)numberRefSeqs);
+
+    // iterate over all references in header
+    for (unsigned int i = 0; i != numberRefSeqs; ++i) {
+        // get length of reference name
+        m_stream.Read(buffer, sizeof(uint32_t));
+        uint32_t refNameLength = BamTools::UnpackUnsignedInt(buffer);
+        if (m_isBigEndian) BamTools::SwapEndian_32(refNameLength);
+        RaiiBuffer refName(refNameLength);
+
+        // get reference name and reference sequence length
+        m_stream.Read(refName.Buffer, refNameLength);
+        m_stream.Read(buffer, sizeof(int32_t));
+        int32_t refLength = BamTools::UnpackSignedInt(buffer);
+        if (m_isBigEndian) BamTools::SwapEndian_32(refLength);
+
+        // store data for reference; the name ends at its first NUL but is never read past the
+        // refNameLength bytes actually loaded (guards against a missing terminator)
+        RefData aReference;
+        const char* nameBegin = (const char*)refName.Buffer;
+        aReference.RefName.assign(nameBegin, std::find(nameBegin, nameBegin + refNameLength, '\0'));
+        aReference.RefLength = refLength;
+        m_references.push_back(aReference);
+    }
+
+    // return success
+    return true;
+}
+
+bool BamReaderPrivate::LocateIndex(const BamIndex::IndexType& preferredType)
+{
+
+    // delegate the search to the random access controller; done if it succeeds
+    if (m_randomAccessController.LocateIndex(this, preferredType)) return true;
+
+    // otherwise wrap the controller's error text and report failure
+    const std::string controllerError = m_randomAccessController.GetErrorString();
+    const std::string description = std::string("could not locate index: \n\t") + controllerError;
+    SetErrorString("BamReader::LocateIndex", description);
+    return false;
+}
+
+// opens BAM file and loads its header & reference data (an index is opened separately)
+// returns false (and records an error string) if any step throws a BamException
+bool BamReaderPrivate::Open(const std::string& filename)
+{
+
+    try {
+
+        // make sure we're starting with fresh state
+        Close();
+
+        // open BgzfStream
+        m_stream.Open(filename, IBamIODevice::ReadOnly);
+
+        // load BAM metadata
+        LoadHeaderData();
+        LoadReferenceData();
+
+        // store filename & offset of first alignment
+        m_filename = filename;
+        m_alignmentsBeginOffset = m_stream.Tell();
+
+    } catch (BamException& e) {
+        const std::string cause = e.what();
+        SetErrorString("BamReader::Open",
+                       std::string("could not open file: ") + filename + "\n\t" + cause);
+        return false;
+    }
+
+    // everything loaded without an exception
+    return true;
+}
+
+bool BamReaderPrivate::OpenIndex(const std::string& indexFilename)
+{
+
+    // let the random access controller open the index file; done if it succeeds
+    if (m_randomAccessController.OpenIndex(indexFilename, this)) return true;
+
+    // otherwise wrap the controller's error text and report failure
+    const std::string controllerError = m_randomAccessController.GetErrorString();
+    const std::string description = std::string("could not open index: \n\t") + controllerError;
+    SetErrorString("BamReader::OpenIndex", description);
+    return false;
+}
+
+// returns BAM file pointer to beginning of alignment data
+// (also clears any region that was set; returns false if the seek fails)
+bool BamReaderPrivate::Rewind()
+{
+
+    // reset region
+    m_randomAccessController.ClearRegion();
+
+    // seek back to the first alignment; done if that works
+    if (Seek(m_alignmentsBeginOffset)) return true;
+
+    // Seek() has already stored its reason in m_errorString - wrap it
+    const std::string seekError = m_errorString;
+    const std::string description = std::string("could not rewind: \n\t") + seekError;
+    SetErrorString("BamReader::Rewind", description);
+    return false;
+}
+
+bool BamReaderPrivate::Seek(const int64_t& position)
+{
+    // skip if BAM file not open
+    if (!IsOpen()) {
+        SetErrorString("BamReader::Seek", "cannot seek on unopened BAM file");
+        return false;
+    }
+
+    // move the stream; a BamException from the stream becomes an error string
+    try {
+        m_stream.Seek(position);
+    } catch (BamException& e) {
+        const std::string seekError = e.what();
+        const std::string description = std::string("could not seek in BAM file: \n\t") + seekError;
+        SetErrorString("BamReader::Seek", description);
+        return false;
+    }
+    return true;
+}
+
+void BamReaderPrivate::SetErrorString(const std::string& where, const std::string& what)
+{
+    // stored as "<where>: <what>"
+    m_errorString = where + ": " + what;
+}
+
+void BamReaderPrivate::SetIndex(BamIndex* index)
+{
+ m_randomAccessController.SetIndex(index); // forwarded as-is; the controller keeps the pointer
+}
+
+// sets current region & attempts to jump to it
+// returns success/failure (on failure the error string wraps the controller's message)
+bool BamReaderPrivate::SetRegion(const BamRegion& region)
+{
+
+    // the controller is also given the reference count; done if it succeeds
+    if (m_randomAccessController.SetRegion(region, m_references.size())) return true;
+
+    // otherwise wrap the controller's error text and report failure
+    const std::string controllerError = m_randomAccessController.GetErrorString();
+    const std::string description = std::string("could not set region: \n\t") + controllerError;
+    SetErrorString("BamReader::SetRegion", description);
+    return false;
+}
+
+int64_t BamReaderPrivate::Tell() const
+{
+ return m_stream.Tell(); // position as reported by the underlying BGZF stream
+}
diff --git a/src/api/internal/bam/BamReader_p.h b/src/api/internal/bam/BamReader_p.h
new file mode 100644
index 0000000..48dea89
--- /dev/null
+++ b/src/api/internal/bam/BamReader_p.h
@@ -0,0 +1,119 @@
+// ***************************************************************************
+// BamReader_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 18 November 2012 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#ifndef BAMREADER_P_H
+#define BAMREADER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <string>
+#include "api/BamAlignment.h"
+#include "api/BamIndex.h"
+#include "api/BamReader.h"
+#include "api/SamHeader.h"
+#include "api/internal/bam/BamHeader_p.h"
+#include "api/internal/bam/BamRandomAccessController_p.h"
+#include "api/internal/io/BgzfStream_p.h"
+
+namespace BamTools {
+namespace Internal {
+
+class BamReaderPrivate
+{
+ // Provides the basic functionality for reading BAM files; methods returning bool report failure via GetErrorString().
+ // ctor & dtor
+public:
+ BamReaderPrivate(BamReader* parent);
+ ~BamReaderPrivate();
+
+ // BamReader interface
+public:
+ // file operations
+ bool Close();
+ const std::string Filename() const;
+ bool IsOpen() const;
+ bool Open(const std::string& filename);
+ bool Rewind();
+ bool SetRegion(const BamRegion& region);
+
+ // access alignment data
+ bool GetNextAlignment(BamAlignment& alignment);
+ bool GetNextAlignmentCore(BamAlignment& alignment);
+ bool Tag2Cigar(BamAlignment& alignment, RaiiBuffer& buf); // swaps in a CIGAR held in a "CG" tag; called by LoadNextAlignment()
+
+ // access auxiliary data
+ std::string GetHeaderText() const;
+ const SamHeader& GetConstSamHeader() const;
+ SamHeader GetSamHeader() const;
+ int GetReferenceCount() const;
+ const RefVector& GetReferenceData() const;
+ int GetReferenceID(const std::string& refName) const; // -1 if refName is not found
+
+ // index operations
+ bool CreateIndex(const BamIndex::IndexType& type);
+ bool HasIndex() const;
+ bool LocateIndex(const BamIndex::IndexType& preferredType);
+ bool OpenIndex(const std::string& indexFilename);
+ void SetIndex(BamIndex* index);
+
+ // error handling
+ std::string GetErrorString() const;
+ void SetErrorString(const std::string& where, const std::string& what); // stores "<where>: <what>"
+
+ // internal methods, but available as a BamReaderPrivate 'interface'
+ //
+ // these methods should only be used by BamTools::Internal classes
+ // (currently only used by the BamIndex subclasses)
+public:
+ // retrieves header text from BAM file
+ void LoadHeaderData();
+ // retrieves BAM alignment under file pointer
+ // (does no overlap checking or character data parsing)
+ bool LoadNextAlignment(BamAlignment& alignment);
+ // builds reference data structure from BAM file
+ bool LoadReferenceData();
+ // seek reader to file position
+ bool Seek(const int64_t& position);
+ // return reader's file position
+ int64_t Tell() const;
+
+ // data members
+public:
+ // general BAM file data
+ int64_t m_alignmentsBeginOffset; // stream position of the first alignment, recorded by Open()
+ std::string m_filename; // set by Open(), cleared by Close()
+ RefVector m_references; // filled by LoadReferenceData(), cleared by Close()
+
+ // system data
+ bool m_isBigEndian; // result of BamTools::SystemIsBigEndian(), taken in the ctor
+
+ // parent BamReader
+ BamReader* m_parent; // stored by the ctor; not used elsewhere in BamReader_p.cpp
+
+ // BamReaderPrivate components
+ BamHeader m_header;
+ BamRandomAccessController m_randomAccessController;
+ BgzfStream m_stream;
+
+ // error handling
+ std::string m_errorString;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMREADER_P_H
diff --git a/src/api/internal/bam/BamWriter_p.cpp b/src/api/internal/bam/BamWriter_p.cpp
new file mode 100644
index 0000000..9509777
--- /dev/null
+++ b/src/api/internal/bam/BamWriter_p.cpp
@@ -0,0 +1,599 @@
+// ***************************************************************************
+// BamWriter_p.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 18 November 2012 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#include "api/internal/bam/BamWriter_p.h"
+#include "api/BamAlignment.h"
+#include "api/BamConstants.h"
+#include "api/IBamIODevice.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+
+// ctor - caches the host byte order reported by BamTools::SystemIsBigEndian()
+BamWriterPrivate::BamWriterPrivate()
+ : m_isBigEndian(BamTools::SystemIsBigEndian())
+{}
+
+// dtor - closes the output stream if it is still open (see Close())
+BamWriterPrivate::~BamWriterPrivate()
+{
+ Close();
+}
+
+// calculates minimum bin for a BAM alignment interval [begin, end)
+uint32_t BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const
+{
+    // one entry per bin level, smallest bins first: coordinate shift and first bin id
+    static const int SHIFTS[] = {14, 17, 20, 23, 26};
+    static const int FIRST_BINS[] = {4681, 585, 73, 9, 1};
+    --end;  // compare against the last position inside the interval
+    for (int lvl = 0; lvl < 5; ++lvl)
+        if ((begin >> SHIFTS[lvl]) == (end >> SHIFTS[lvl])) return FIRST_BINS[lvl] + (begin >> SHIFTS[lvl]);
+    return 0;  // no smaller bin holds the whole interval
+}
+
+// closes the alignment archive
+void BamWriterPrivate::Close()
+{
+
+    // only an open file needs closing
+    if (IsOpen()) {
+        // close output stream, keeping the exception's message if that fails
+        try {
+            m_stream.Close();
+        } catch (BamException& e) {
+            m_errorString = e.what();
+        }
+    }
+}
+
+// packs the supplied CIGAR operations into 'packedCigar', one 32-bit value per operation
+// (length in the high bits, op code in the low bits); throws BamException on an unknown op type
+void BamWriterPrivate::CreatePackedCigar(const std::vector<CigarOp>& cigarOperations,
+                                         std::string& packedCigar)
+{
+
+    // initialize
+    const std::size_t numCigarOperations = cigarOperations.size();
+    packedCigar.resize(numCigarOperations * Constants::BAM_SIZEOF_INT);
+
+    // pack the cigar data into the string (via operator[]: data() is not meant to be written through)
+    unsigned int* pPackedCigar = packedCigar.empty() ? NULL : (unsigned int*)&packedCigar[0];
+
+    // iterate over cigar operations
+    std::vector<CigarOp>::const_iterator coIter = cigarOperations.begin();
+    std::vector<CigarOp>::const_iterator coEnd = cigarOperations.end();
+    for (; coIter != coEnd; ++coIter) {
+
+        // translate the op character into its numeric BAM code
+        uint8_t cigarOp;
+        switch (coIter->Type) {
+            case (Constants::BAM_CIGAR_MATCH_CHAR):
+                cigarOp = Constants::BAM_CIGAR_MATCH;
+                break;
+            case (Constants::BAM_CIGAR_INS_CHAR):
+                cigarOp = Constants::BAM_CIGAR_INS;
+                break;
+            case (Constants::BAM_CIGAR_DEL_CHAR):
+                cigarOp = Constants::BAM_CIGAR_DEL;
+                break;
+            case (Constants::BAM_CIGAR_REFSKIP_CHAR):
+                cigarOp = Constants::BAM_CIGAR_REFSKIP;
+                break;
+            case (Constants::BAM_CIGAR_SOFTCLIP_CHAR):
+                cigarOp = Constants::BAM_CIGAR_SOFTCLIP;
+                break;
+            case (Constants::BAM_CIGAR_HARDCLIP_CHAR):
+                cigarOp = Constants::BAM_CIGAR_HARDCLIP;
+                break;
+            case (Constants::BAM_CIGAR_PAD_CHAR):
+                cigarOp = Constants::BAM_CIGAR_PAD;
+                break;
+            case (Constants::BAM_CIGAR_SEQMATCH_CHAR):
+                cigarOp = Constants::BAM_CIGAR_SEQMATCH;
+                break;
+            case (Constants::BAM_CIGAR_MISMATCH_CHAR):
+                cigarOp = Constants::BAM_CIGAR_MISMATCH;
+                break;
+            default:
+                const std::string message =
+                    std::string("invalid CIGAR operation type: ") + coIter->Type;
+                throw BamException("BamWriter::CreatePackedCigar", message);
+        }
+
+        *pPackedCigar++ = coIter->Length << Constants::BAM_CIGAR_SHIFT | cigarOp;
+    }
+}
+
+// encodes the supplied query sequence into 4-bit notation
+// (two bases per output byte; throws BamException on a character that is not a known base)
+// the result replaces whatever 'encodedQuery' held before
+void BamWriterPrivate::EncodeQuerySequence(const std::string& query, std::string& encodedQuery)
+{
+
+    // prepare the encoded query string: two bases share a byte, so round the length up
+    const std::size_t queryLength = query.size();
+    const std::size_t encodedQueryLength = static_cast<std::size_t>((queryLength + 1) / 2);
+    encodedQuery.resize(encodedQueryLength);
+    char* pEncodedQuery = (char*)encodedQuery.data();
+
+    // walk through original query sequence (up to its terminating NUL), encoding its bases
+    std::size_t baseIndex = 0;
+    for (const char* pQuery = (const char*)query.data(); *pQuery; ++pQuery, ++baseIndex) {
+
+        // translate the base character into its 4-bit code
+        unsigned char nucleotideCode;
+        switch (*pQuery) {
+            case (Constants::BAM_DNA_EQUAL):
+                nucleotideCode = Constants::BAM_BASECODE_EQUAL;
+                break;
+            case (Constants::BAM_DNA_A):
+                nucleotideCode = Constants::BAM_BASECODE_A;
+                break;
+            case (Constants::BAM_DNA_C):
+                nucleotideCode = Constants::BAM_BASECODE_C;
+                break;
+            case (Constants::BAM_DNA_M):
+                nucleotideCode = Constants::BAM_BASECODE_M;
+                break;
+            case (Constants::BAM_DNA_G):
+                nucleotideCode = Constants::BAM_BASECODE_G;
+                break;
+            case (Constants::BAM_DNA_R):
+                nucleotideCode = Constants::BAM_BASECODE_R;
+                break;
+            case (Constants::BAM_DNA_S):
+                nucleotideCode = Constants::BAM_BASECODE_S;
+                break;
+            case (Constants::BAM_DNA_V):
+                nucleotideCode = Constants::BAM_BASECODE_V;
+                break;
+            case (Constants::BAM_DNA_T):
+                nucleotideCode = Constants::BAM_BASECODE_T;
+                break;
+            case (Constants::BAM_DNA_W):
+                nucleotideCode = Constants::BAM_BASECODE_W;
+                break;
+            case (Constants::BAM_DNA_Y):
+                nucleotideCode = Constants::BAM_BASECODE_Y;
+                break;
+            case (Constants::BAM_DNA_H):
+                nucleotideCode = Constants::BAM_BASECODE_H;
+                break;
+            case (Constants::BAM_DNA_K):
+                nucleotideCode = Constants::BAM_BASECODE_K;
+                break;
+            case (Constants::BAM_DNA_D):
+                nucleotideCode = Constants::BAM_BASECODE_D;
+                break;
+            case (Constants::BAM_DNA_B):
+                nucleotideCode = Constants::BAM_BASECODE_B;
+                break;
+            case (Constants::BAM_DNA_N):
+                nucleotideCode = Constants::BAM_BASECODE_N;
+                break;
+            default:
+                // anything else is not a base this encoder knows
+                const std::string message = std::string("invalid base: ") + *pQuery;
+                throw BamException("BamWriter::EncodeQuerySequence", message);
+        }
+
+        // pack the nucleotide code
+        // (even positions start a byte in its high nibble, odd positions complete it in the low nibble)
+        char& packedByte = pEncodedQuery[baseIndex / 2];
+        if (baseIndex % 2 == 0) {
+            packedByte = nucleotideCode << 4;
+        } else {
+            packedByte |= nucleotideCode;
+        }
+    }
+}
+
+// returns a description of the last error that occurred (the text held in m_errorString, by value)
+std::string BamWriterPrivate::GetErrorString() const
+{
+ return m_errorString;
+}
+
+// returns whether BAM file is open for writing or not (as reported by the output stream)
+bool BamWriterPrivate::IsOpen() const
+{
+ return m_stream.IsOpen();
+}
+
+// opens the alignment archive
+bool BamWriterPrivate::Open(const std::string& filename, const std::string& samHeaderText,
+                            const RefVector& referenceSequences)
+{
+    try {
+        // open the BGZF file for writing
+        m_stream.Open(filename, IBamIODevice::WriteOnly);
+
+        // write BAM file 'metadata' components
+        WriteMagicNumber();
+        WriteSamHeaderText(samHeaderText);
+        WriteReferences(referenceSequences);
+
+    } catch (BamException& e) {
+        // keep the exception text for GetErrorString() and report failure
+        m_errorString = e.what();
+        return false;
+    }
+
+    // stream opened and metadata written without an exception
+    return true;
+}
+
+// saves the alignment to the alignment archive
+// returns true on success; if a BamException is raised, its message is stored
+// (retrievable via GetErrorString()) and false is returned
+bool BamWriterPrivate::SaveAlignment(const BamAlignment& al)
+{
+    try {
+
+        if (!al.SupportData.HasCoreOnly) {
+            // BamAlignment should contain character data in the standard fields:
+            // Name, QueryBases, etc. (resulting from BamReader::GetNextAlignment()
+            // *OR* being generated directly by client code)
+            WriteAlignment(al);
+        } else {
+            // BamAlignment contains only the core data and a raw char data buffer
+            // (as a result of BamReader::GetNextAlignmentCore())
+            WriteCoreAlignment(al);
+        }
+
+        // no exception raised, alignment was written
+        return true;
+
+    } catch (BamException& e) {
+        m_errorString = e.what();
+    }
+
+    return false;
+}
+
+// forwards the compression setting to the stream; silently ignored while a file is open
+void BamWriterPrivate::SetWriteCompressed(bool ok)
+{
+    // modifying compression is not allowed if BAM file is open
+    if (IsOpen()) return;
+
+    m_stream.SetWriteCompressed(ok);
+}
+
+// byte-swaps every 32-bit packed CIGAR operation held in @cigarData, in place
+// (advances one whole 4-byte operation per step and never touches bytes past the end)
+static void SwapPackedCigarEndian(std::string& cigarData)
+{
+    const std::size_t cigarLength = cigarData.size();
+    for (std::size_t i = 0; i + sizeof(uint32_t) <= cigarLength; i += sizeof(uint32_t))
+        BamTools::SwapEndian_32p(&cigarData[i]);
+}
+
+// writes a fully-populated alignment (Name, QueryBases, Qualities, CigarData, TagData)
+// to the stream as one BAM record: block size, core fields, query name, CIGAR,
+// encoded query bases, base qualities and tag data.
+//
+// alignments with 65536 or more CIGAR operations are written with a 2-operation
+// placeholder CIGAR in the usual slot, and the real CIGAR is appended after the tag
+// data behind a "CGBI" marker and an operation count.
+//
+// on big-endian hosts all multi-byte values are byte-swapped before being written.
+//
+// throws BamException on invalid tag data (and whatever the called helpers / stream throw).
+// all temporary buffers are owned by std::string objects, so nothing leaks if a write
+// or a lookup throws part-way through.
+void BamWriterPrivate::WriteAlignment(const BamAlignment& al)
+{
+
+    // calculate char lengths
+    const unsigned int nameLength = al.Name.size() + 1;
+    const unsigned int numCigarOperations = al.CigarData.size();
+    const unsigned int queryLength = ((al.QueryBases == "*") ? 0 : al.QueryBases.size());
+    const unsigned int tagDataLength = al.TagData.size();
+
+    // CIGARs this long do not fit the 16-bit operation count stored in the core
+    const bool hasLongCigar = (numCigarOperations >= 65536);
+
+    // no way to tell if alignment's bin is already defined (there is no default, invalid value)
+    // so we'll go ahead and calculate its bin ID before storing
+    const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition());
+
+    // create our packed cigar string
+    std::string packedCigar;
+    CreatePackedCigar(al.CigarData, packedCigar);
+    const unsigned int packedCigarLength = packedCigar.size();
+
+    // the packed cigar is only ever written out from here on, so on big-endian hosts
+    // swap it once, in place, one 32-bit operation at a time
+    if (m_isBigEndian) SwapPackedCigarEndian(packedCigar);
+
+    // encode the query
+    unsigned int encodedQueryLength = 0;
+    std::string encodedQuery;
+    if (queryLength > 0) {
+        EncodeQuerySequence(al.QueryBases, encodedQuery);
+        encodedQueryLength = encodedQuery.size();
+    }
+
+    // write the block size
+    const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength +
+                                       queryLength +  // here referring to quality length
+                                       tagDataLength;
+    unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize;
+    // long cigar: 8-byte placeholder cigar + 4-byte "CGBI" marker + 4-byte operation count
+    if (hasLongCigar) blockSize += 16;
+    if (m_isBigEndian) BamTools::SwapEndian_32(blockSize);
+    m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
+
+    // assign the BAM core data
+    uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
+    buffer[0] = al.RefID;
+    buffer[1] = al.Position;
+    buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
+    buffer[3] = (al.AlignmentFlag << 16) | (hasLongCigar ? 2 : numCigarOperations);
+    buffer[4] = queryLength;
+    buffer[5] = al.MateRefID;
+    buffer[6] = al.MatePosition;
+    buffer[7] = al.InsertSize;
+
+    // swap BAM core endian-ness, if necessary
+    if (m_isBigEndian) {
+        for (int i = 0; i < 8; ++i)
+            BamTools::SwapEndian_32(buffer[i]);
+    }
+
+    // write the BAM core
+    m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
+
+    // write the query name
+    m_stream.Write(al.Name.c_str(), nameLength);
+
+    // write the packed cigar (already swapped above if needed),
+    // or the 2-operation placeholder when the real cigar is too long for the core
+    if (!hasLongCigar) {
+        m_stream.Write(packedCigar.data(), packedCigarLength);
+    } else {
+        unsigned int cigar[2];
+        cigar[0] = queryLength << 4 | 4;
+        cigar[1] = (al.GetEndPosition() - al.Position) << 4 | 3;
+        if (m_isBigEndian) {
+            BamTools::SwapEndian_32(cigar[0]);
+            BamTools::SwapEndian_32(cigar[1]);
+        }
+        m_stream.Write((char*)cigar, 8);
+    }
+
+    if (queryLength > 0) {
+
+        // write the encoded query sequence
+        m_stream.Write(encodedQuery.data(), encodedQueryLength);
+
+        // write the base qualities
+        // start from "all invalid" (0xFF); overwrite only if real qualities are present
+        const bool qualitiesMissing =
+            al.Qualities.empty() || (al.Qualities.size() == 1 && al.Qualities[0] == '*') ||
+            al.Qualities[0] == (char)0xFF;
+        std::string baseQualities(queryLength, (char)0xFF);
+        if (!qualitiesMissing) {
+            for (std::size_t i = 0; i < queryLength; ++i)
+                baseQualities[i] =
+                    al.Qualities.at(i) - 33;  // FASTQ ASCII -> phred score conversion
+        }
+        m_stream.Write(baseQualities.data(), queryLength);
+    }
+
+    // write the tag data
+    if (m_isBigEndian) {
+
+        // swap multi-byte tag values on a private copy of the tag data
+        std::string tagData(al.TagData.data(), tagDataLength);
+
+        std::size_t i = 0;
+        while (i < tagDataLength) {
+
+            i += Constants::BAM_TAG_TAGSIZE;  // skip tag chars (e.g. "RG", "NM", etc.)
+            const char type = tagData[i];     // get tag type at position i
+            ++i;
+
+            switch (type) {
+
+                case (Constants::BAM_TAG_TYPE_ASCII):
+                case (Constants::BAM_TAG_TYPE_INT8):
+                case (Constants::BAM_TAG_TYPE_UINT8):
+                    ++i;
+                    break;
+
+                case (Constants::BAM_TAG_TYPE_INT16):
+                case (Constants::BAM_TAG_TYPE_UINT16):
+                    BamTools::SwapEndian_16p(&tagData[i]);
+                    i += sizeof(uint16_t);
+                    break;
+
+                case (Constants::BAM_TAG_TYPE_FLOAT):
+                case (Constants::BAM_TAG_TYPE_INT32):
+                case (Constants::BAM_TAG_TYPE_UINT32):
+                    BamTools::SwapEndian_32p(&tagData[i]);
+                    i += sizeof(uint32_t);
+                    break;
+
+                case (Constants::BAM_TAG_TYPE_HEX):
+                case (Constants::BAM_TAG_TYPE_STRING):
+                    // no endian swapping necessary for hex-string/string data
+                    while (tagData[i])
+                        ++i;
+                    // increment one more for null terminator
+                    ++i;
+                    break;
+
+                case (Constants::BAM_TAG_TYPE_ARRAY):
+
+                {
+                    // read array type
+                    const char arrayType = tagData[i];
+                    ++i;
+
+                    // swap endian-ness of number of elements in place, then retrieve for loop
+                    BamTools::SwapEndian_32p(&tagData[i]);
+                    int32_t numElements;
+                    memcpy(&numElements, &tagData[i], sizeof(uint32_t));
+                    i += sizeof(uint32_t);
+
+                    // swap endian-ness of array elements
+                    for (int j = 0; j < numElements; ++j) {
+                        switch (arrayType) {
+                            case (Constants::BAM_TAG_TYPE_INT8):
+                            case (Constants::BAM_TAG_TYPE_UINT8):
+                                // no endian-swapping necessary
+                                ++i;
+                                break;
+                            case (Constants::BAM_TAG_TYPE_INT16):
+                            case (Constants::BAM_TAG_TYPE_UINT16):
+                                BamTools::SwapEndian_16p(&tagData[i]);
+                                i += sizeof(uint16_t);
+                                break;
+                            case (Constants::BAM_TAG_TYPE_FLOAT):
+                            case (Constants::BAM_TAG_TYPE_INT32):
+                            case (Constants::BAM_TAG_TYPE_UINT32):
+                                BamTools::SwapEndian_32p(&tagData[i]);
+                                i += sizeof(uint32_t);
+                                break;
+                            default:
+                                const std::string message =
+                                    std::string("invalid binary array type: ") + arrayType;
+                                throw BamException("BamWriter::SaveAlignment", message);
+                        }
+                    }
+
+                    break;
+                }
+
+                default:
+                    const std::string message = std::string("invalid tag type: ") + type;
+                    throw BamException("BamWriter::SaveAlignment", message);
+            }
+        }
+
+        m_stream.Write(tagData.data(), tagDataLength);
+    } else
+        m_stream.Write(al.TagData.data(), tagDataLength);
+
+    // long cigar: append the real cigar after the tag data,
+    // as a "CGBI" marker, the operation count, then the packed operations
+    if (hasLongCigar) {
+        m_stream.Write("CGBI", 4);
+
+        unsigned int cigarCount = numCigarOperations;
+        if (m_isBigEndian) BamTools::SwapEndian_32(cigarCount);
+        m_stream.Write((char*)&cigarCount, 4);
+
+        m_stream.Write(packedCigar.data(), packedCigarLength);
+    }
+}
+
+// writes a 'core-only' alignment: the core fields are rebuilt from the BamAlignment
+// members, while everything after the core is taken from the raw character buffer
+// held in al.SupportData.AllCharData.
+//
+// if the alignment has 65536 or more CIGAR operations, a 2-operation placeholder
+// CIGAR is written in the usual slot and the real CIGAR bytes are appended at the
+// end of the record behind a "CGBI" marker and the operation count.
+void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al)
+{
+
+ // write the block size
+ // (long-CIGAR layout adds 16 bytes: 8-byte placeholder + 4-byte marker + 4-byte count)
+ unsigned int blockSize = al.SupportData.BlockLength;
+ if (al.SupportData.NumCigarOperations >= 65536) blockSize += 16;
+ if (m_isBigEndian) BamTools::SwapEndian_32(blockSize);
+ m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
+
+ // re-calculate bin (in case BamAlignment's position has been previously modified)
+ const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition());
+
+ // assign the BAM core data
+ // (the CIGAR operation count is stored as 2 when the placeholder CIGAR is used)
+ uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
+ buffer[0] = al.RefID;
+ buffer[1] = al.Position;
+ buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
+ buffer[3] = (al.AlignmentFlag << 16) |
+ (al.SupportData.NumCigarOperations < 65536 ? al.SupportData.NumCigarOperations : 2);
+ buffer[4] = al.SupportData.QuerySequenceLength;
+ buffer[5] = al.MateRefID;
+ buffer[6] = al.MatePosition;
+ buffer[7] = al.InsertSize;
+
+ // swap BAM core endian-ness, if necessary
+ if (m_isBigEndian) {
+ for (int i = 0; i < 8; ++i)
+ BamTools::SwapEndian_32(buffer[i]);
+ }
+
+ // write the BAM core
+ m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
+
+ // write the raw char data
+ if (al.SupportData.NumCigarOperations < 65536) {
+ // normal case: the raw buffer is written unchanged
+ // NOTE(review): no endian swapping is applied to the raw buffer here - confirm that
+ // AllCharData is already in file byte order on big-endian hosts
+ m_stream.Write((char*)al.SupportData.AllCharData.data(),
+ al.SupportData.BlockLength - Constants::BAM_CORE_SIZE);
+ } else {
+ // long-CIGAR case: raw buffer is laid out as [name][cigar][everything else]
+ const char* data = al.SupportData.AllCharData.c_str();
+ const unsigned data_len = al.SupportData.BlockLength - Constants::BAM_CORE_SIZE;
+ const unsigned cigar_offset = al.SupportData.QueryNameLength;
+ const unsigned seq_offset = cigar_offset + al.SupportData.NumCigarOperations * 4;
+ // placeholder CIGAR: two packed operations (length << 4 | op code), op codes 4 and 3
+ // (presumably soft-clip over the query and ref-skip over the aligned span - see SAM spec)
+ unsigned fake_cigar[2];
+ fake_cigar[0] = al.SupportData.QuerySequenceLength << 4 | 4;
+ fake_cigar[1] = (al.GetEndPosition() - al.Position) << 4 | 3;
+ // query name
+ m_stream.Write(data, al.SupportData.QueryNameLength);
+ if (m_isBigEndian) {
+ BamTools::SwapEndian_32(fake_cigar[0]);
+ BamTools::SwapEndian_32(fake_cigar[1]);
+ }
+ // placeholder CIGAR, then everything that followed the real CIGAR in the raw buffer
+ m_stream.Write((char*)&fake_cigar, 8);
+ m_stream.Write(data + seq_offset, data_len - seq_offset);
+ // real CIGAR: marker, operation count, then the packed operations from the raw buffer
+ m_stream.Write("CGBI", 4);
+ if (m_isBigEndian) {
+ unsigned cigar_len_buf = al.SupportData.NumCigarOperations;
+ BamTools::SwapEndian_32(cigar_len_buf);
+ m_stream.Write((char*)&cigar_len_buf, 4);
+ } else {
+ m_stream.Write((char*)&al.SupportData.NumCigarOperations, 4);
+ }
+ m_stream.Write(data + cigar_offset, al.SupportData.NumCigarOperations * 4);
+ }
+}
+
+// writes the BAM 'magic number' bytes (Constants::BAM_HEADER_MAGIC) to the stream
+// called first by Open(), before the SAM header text and the reference data
+void BamWriterPrivate::WriteMagicNumber()
+{
+ // write BAM file 'magic number'
+ m_stream.Write(Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH);
+}
+
+// writes the reference section: the number of references, followed by one record
+// per reference (name length, null-terminated name, sequence length)
+// integer fields are byte-swapped first on big-endian hosts
+void BamWriterPrivate::WriteReferences(const BamTools::RefVector& referenceSequences)
+{
+
+    // reference count comes first
+    uint32_t referenceCount = referenceSequences.size();
+    if (m_isBigEndian) BamTools::SwapEndian_32(referenceCount);
+    m_stream.Write((char*)&referenceCount, Constants::BAM_SIZEOF_INT);
+
+    // then one record per reference sequence
+    for (RefVector::const_iterator ref = referenceSequences.begin();
+         ref != referenceSequences.end(); ++ref) {
+
+        // name length (+1 for terminator); keep the unswapped value for the name write below
+        const uint32_t nameLength = ref->RefName.size() + 1;
+        uint32_t nameLengthOnDisk = nameLength;
+        if (m_isBigEndian) BamTools::SwapEndian_32(nameLengthOnDisk);
+        m_stream.Write((char*)&nameLengthOnDisk, Constants::BAM_SIZEOF_INT);
+
+        // name, including its terminator
+        m_stream.Write(ref->RefName.c_str(), nameLength);
+
+        // sequence length
+        int32_t sequenceLength = ref->RefLength;
+        if (m_isBigEndian) BamTools::SwapEndian_32(sequenceLength);
+        m_stream.Write((char*)&sequenceLength, Constants::BAM_SIZEOF_INT);
+    }
+}
+
+// writes the SAM header text section: a 32-bit length followed by the text itself
+// (the text is skipped entirely when it is empty; the length is always written)
+void BamWriterPrivate::WriteSamHeaderText(const std::string& samHeaderText)
+{
+
+    // length field, byte-swapped on big-endian hosts
+    const uint32_t headerLength = samHeaderText.size();
+    uint32_t headerLengthOnDisk = headerLength;
+    if (m_isBigEndian) BamTools::SwapEndian_32(headerLengthOnDisk);
+    m_stream.Write((char*)&headerLengthOnDisk, Constants::BAM_SIZEOF_INT);
+
+    // nothing more to write for an empty header
+    if (headerLength == 0) return;
+
+    // header text (no terminator)
+    m_stream.Write(samHeaderText.data(), headerLength);
+}
diff --git a/src/api/internal/bam/BamWriter_p.h b/src/api/internal/bam/BamWriter_p.h
new file mode 100644
index 0000000..550d7fb
--- /dev/null
+++ b/src/api/internal/bam/BamWriter_p.h
@@ -0,0 +1,74 @@
+// ***************************************************************************
+// BamWriter_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#ifndef BAMWRITER_P_H
+#define BAMWRITER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+
+#include <string>
+#include <vector>
+#include "api/BamAux.h"
+#include "api/internal/io/BgzfStream_p.h"
+
+namespace BamTools {
+
+class BamAlignment;
+
+namespace Internal {
+
+// private implementation behind the BAM writing functionality:
+// owns the output stream and writes the BAM sections & alignment records to it
+class BamWriterPrivate
+{
+
+ // ctor & dtor
+public:
+ BamWriterPrivate();
+ ~BamWriterPrivate();
+
+ // interface methods
+public:
+ void Close();
+ // returns a description of the last error that occurred
+ std::string GetErrorString() const;
+ // returns whether BAM file is open for writing or not
+ bool IsOpen() const;
+ // opens @filename for writing, then writes magic number, SAM header text & references
+ // returns false (and stores an error string) if a BamException is raised
+ bool Open(const std::string& filename, const std::string& samHeaderText,
+ const BamTools::RefVector& referenceSequences);
+ // writes one alignment; returns false (and stores an error string) if a BamException is raised
+ bool SaveAlignment(const BamAlignment& al);
+ // sets the stream's compression mode; ignored while a file is open
+ void SetWriteCompressed(bool ok);
+
+ // 'internal' methods
+public:
+ uint32_t CalculateMinimumBin(const int begin, int end) const;
+ void CreatePackedCigar(const std::vector<BamTools::CigarOp>& cigarOperations,
+ std::string& packedCigar);
+ // packs @query into @encodedQuery, two base codes per byte; throws on an invalid base
+ void EncodeQuerySequence(const std::string& query, std::string& encodedQuery);
+ // writes an alignment whose standard fields (Name, QueryBases, etc) are populated
+ void WriteAlignment(const BamAlignment& al);
+ // writes an alignment that holds only core data plus its raw character buffer
+ void WriteCoreAlignment(const BamAlignment& al);
+ void WriteMagicNumber();
+ void WriteReferences(const BamTools::RefVector& referenceSequences);
+ void WriteSamHeaderText(const std::string& samHeaderText);
+
+ // data members
+private:
+ BgzfStream m_stream; // output stream all BAM data is written to
+ bool m_isBigEndian; // if true, multi-byte values are byte-swapped before writing
+ std::string m_errorString; // message of the last caught BamException
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMWRITER_P_H
diff --git a/src/api/internal/bam/CMakeLists.txt b/src/api/internal/bam/CMakeLists.txt
new file mode 100644
index 0000000..1bd2569
--- /dev/null
+++ b/src/api/internal/bam/CMakeLists.txt
@@ -0,0 +1,19 @@
+# ==========================
+# BamTools CMakeLists.txt
+# (c) 2011 Derek Barnett
+#
+# src/api/internal/bam
+# ==========================
+
+# directory containing the internal BAM sources, relative to ${InternalDir}
+set( InternalBamDir "${InternalDir}/bam" )
+
+# list of internal BAM source files; PARENT_SCOPE sets the variable in the
+# including (parent) scope rather than in this directory's scope
+set( InternalBamSources
+ ${InternalBamDir}/BamHeader_p.cpp
+ ${InternalBamDir}/BamMultiReader_p.cpp
+ ${InternalBamDir}/BamRandomAccessController_p.cpp
+ ${InternalBamDir}/BamReader_p.cpp
+ ${InternalBamDir}/BamWriter_p.cpp
+
+ PARENT_SCOPE # <-- leave this last
+ )
+
diff --git a/src/api/internal/index/BamIndexFactory_p.cpp b/src/api/internal/index/BamIndexFactory_p.cpp
new file mode 100644
index 0000000..a719243
--- /dev/null
+++ b/src/api/internal/index/BamIndexFactory_p.cpp
@@ -0,0 +1,111 @@
+// ***************************************************************************
+// BamIndexFactory_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides interface for generating BamIndex implementations
+// ***************************************************************************
+
+#include "api/internal/index/BamIndexFactory_p.h"
+#include "api/internal/index/BamStandardIndex_p.h"
+#include "api/internal/index/BamToolsIndex_p.h"
+
+#include <cstddef>
+
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+// builds the index filename for @bamFilename by appending the extension of the
+// requested index @type; an unrecognized type yields an empty string
+const std::string BamIndexFactory::CreateIndexFilename(const std::string& bamFilename,
+                                                       const BamIndex::IndexType& type)
+{
+    if (type == BamIndex::STANDARD) return (bamFilename + BamStandardIndex::Extension());
+    if (type == BamIndex::BAMTOOLS) return (bamFilename + BamToolsIndex::Extension());
+
+    // unknown index type
+    return std::string();
+}
+
+// creates a new BamIndex object whose concrete type is chosen by the extension of
+// @indexFilename; returns a null pointer if the extension is missing or not recognized
+BamIndex* BamIndexFactory::CreateIndexFromFilename(const std::string& indexFilename,
+                                                   BamReaderPrivate* reader)
+{
+    BamIndex* index = 0;
+
+    // file extension from index filename, including dot (".EXT"); empty if unavailable
+    const std::string extension = FileExtension(indexFilename);
+
+    if (!extension.empty()) {
+        if (extension == BamStandardIndex::Extension())
+            index = new BamStandardIndex(reader);
+        else if (extension == BamToolsIndex::Extension())
+            index = new BamToolsIndex(reader);
+    }
+
+    return index;
+}
+
+// creates a new BamIndex object of the requested @type
+// returns a null pointer for an unrecognized type
+BamIndex* BamIndexFactory::CreateIndexOfType(const BamIndex::IndexType& type,
+                                             BamReaderPrivate* reader)
+{
+    if (type == BamIndex::STANDARD) return new BamStandardIndex(reader);
+    if (type == BamIndex::BAMTOOLS) return new BamToolsIndex(reader);
+
+    // unknown index type
+    return 0;
+}
+
+// retrieves the file extension of @filename, including the leading '.'
+// returns an empty string if the name is 4 characters or shorter, or contains no '.'
+const std::string BamIndexFactory::FileExtension(const std::string& filename)
+{
+    // too short to hold a path plus an extension (this also covers the empty name)
+    if (filename.length() <= 4) return std::string();
+
+    // position of the last dot, if any
+    const std::size_t dotPosition = filename.rfind('.');
+    const bool hasDot = (dotPosition != std::string::npos);
+
+    // everything from the last dot onwards, or nothing
+    return hasDot ? filename.substr(dotPosition) : std::string();
+}
+
+// returns an index filename that corresponds to @bamFilename
+// defers to @preferredType if possible, otherwise tries the other supported types
+// returns empty string if @bamFilename is empty or no filename could be generated
+// NOTE(review): nothing here checks that the file exists; a name counts as 'found' as soon
+// as CreateIndexFilename() returns non-empty, so the fallbacks only run for an unknown type
+const std::string BamIndexFactory::FindIndexFilename(const std::string& bamFilename,
+                                                     const BamIndex::IndexType& preferredType)
+{
+    // skip if BAM filename provided is empty
+    if (bamFilename.empty()) return std::string();
+
+    // preferred type first
+    std::string candidate = CreateIndexFilename(bamFilename, preferredType);
+    if (!candidate.empty()) return candidate;
+
+    // then the remaining supported types, in fixed order, skipping the one already tried
+    const BamIndex::IndexType fallbackTypes[] = {BamIndex::STANDARD, BamIndex::BAMTOOLS};
+    for (int i = 0; i < 2; ++i) {
+        if (fallbackTypes[i] == preferredType) continue;
+        candidate = CreateIndexFilename(bamFilename, fallbackTypes[i]);
+        if (!candidate.empty()) return candidate;
+    }
+
+    // no index filename could be produced
+    return std::string();
+}
diff --git a/src/api/internal/index/BamIndexFactory_p.h b/src/api/internal/index/BamIndexFactory_p.h
new file mode 100644
index 0000000..fc51793
--- /dev/null
+++ b/src/api/internal/index/BamIndexFactory_p.h
@@ -0,0 +1,49 @@
+// ***************************************************************************
+// BamIndexFactory_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides interface for generating BamIndex implementations
+// ***************************************************************************
+
+#ifndef BAMINDEX_FACTORY_P_H
+#define BAMINDEX_FACTORY_P_H
+
+#include <string>
+#include "api/BamIndex.h"
+
+namespace BamTools {
+namespace Internal {
+
+// collection of static helpers that map index types / index filenames
+// to concrete BamIndex implementations (and back to filenames)
+class BamIndexFactory
+{
+
+ // static interface methods
+public:
+ // creates a new BamIndex object, depending on extension of @indexFilename
+ // returns null if the extension is missing or not recognized
+ static BamIndex* CreateIndexFromFilename(const std::string& indexFilename,
+ BamReaderPrivate* reader);
+ // creates a new BamIndex object, of requested @type
+ // returns null if @type is not recognized
+ static BamIndex* CreateIndexOfType(const BamIndex::IndexType& type, BamReaderPrivate* reader);
+ // returns name of existing index file that corresponds to @bamFilename
+ // will defer to @preferredType if possible
+ // if @preferredType not found, will attempt to load any supported index type
+ // returns empty string if no index file (of any type) is found
+ static const std::string FindIndexFilename(const std::string& bamFilename,
+ const BamIndex::IndexType& preferredType);
+
+ // internal methods
+public:
+ // generates index filename from BAM filename (depending on requested type)
+ // if type is unknown, returns empty string
+ static const std::string CreateIndexFilename(const std::string& bamFilename,
+ const BamIndex::IndexType& type);
+ // retrieves file extension (including '.')
+ // returns empty string if @filename is too short or has no '.'
+ static const std::string FileExtension(const std::string& filename);
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMINDEX_FACTORY_P_H
diff --git a/src/api/internal/index/BamStandardIndex_p.cpp b/src/api/internal/index/BamStandardIndex_p.cpp
new file mode 100644
index 0000000..cc81711
--- /dev/null
+++ b/src/api/internal/index/BamStandardIndex_p.cpp
@@ -0,0 +1,1023 @@
+// ***************************************************************************
+// BamStandardIndex.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 May 2012 (DB)
+// ---------------------------------------------------------------------------
+// Provides index operations for the standardized BAM index format (".bai")
+// ***************************************************************************
+
+#include "api/internal/index/BamStandardIndex_p.h"
+#include "api/BamAlignment.h"
+#include "api/internal/bam/BamReader_p.h"
+#include "api/internal/io/BamDeviceFactory_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <sstream>
+
+// -----------------------------------
+// static BamStandardIndex constants
+// -----------------------------------
+
+const int BamStandardIndex::MAX_BIN = 37450; // =(8^6-1)/7+1
+// right-shift applied to a position to obtain its linear offset index
+const int BamStandardIndex::BAM_LIDX_SHIFT = 14;
+// filename extension of the standard index format
+const std::string BamStandardIndex::BAI_EXTENSION = ".bai";
+// 4-byte 'magic number' expected at the start of an index file
+const char* const BamStandardIndex::BAI_MAGIC = "BAI\1";
+// byte sizes of on-disk index elements:
+// an alignment chunk is two 64-bit values (chunk start & stop offsets),
+// a bin 'core' is a 32-bit bin ID plus a 32-bit alignment chunk count,
+// a linear offset is a single 64-bit value
+const int BamStandardIndex::SIZEOF_ALIGNMENTCHUNK = sizeof(uint64_t) * 2;
+const int BamStandardIndex::SIZEOF_BINCORE = sizeof(uint32_t) + sizeof(int32_t);
+const int BamStandardIndex::SIZEOF_LINEAROFFSET = sizeof(uint64_t);
+
+// ----------------------------
+// RaiiWrapper implementation
+// ----------------------------
+
+// starts out owning nothing: no device, no buffer
+BamStandardIndex::RaiiWrapper::RaiiWrapper()
+{
+    Device = 0;
+    Buffer = 0;
+}
+
+// releases whatever is still held: closes & deletes the device, frees the buffer
+BamStandardIndex::RaiiWrapper::~RaiiWrapper()
+{
+    if (Device != 0) {
+        Device->Close();
+        delete Device;
+        Device = 0;
+    }
+
+    // delete[] on a null pointer is a no-op, so no guard is needed
+    delete[] Buffer;
+    Buffer = 0;
+}
+
+// ---------------------------------
+// BamStandardIndex implementation
+// ---------------------------------
+
+// ctor
+// passes @reader on to the BamIndex base, starts with an empty I/O buffer
+// and records whether the host system is big-endian
+BamStandardIndex::BamStandardIndex(Internal::BamReaderPrivate* reader)
+ : BamIndex(reader)
+ , m_bufferLength(0)
+{
+ m_isBigEndian = BamTools::SystemIsBigEndian();
+}
+
+// dtor
+// closes the index file and releases the summary data & I/O buffer (see CloseFile())
+BamStandardIndex::~BamStandardIndex()
+{
+ CloseFile();
+}
+
+// converts @region into a [@begin, @end] coordinate pair on the region's left reference
+// throws BamException if the region's left position is at or beyond the reference length
+void BamStandardIndex::AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end)
+{
+
+    // retrieve references from reader
+    const RefVector& references = m_reader->GetReferenceData();
+
+    // LeftPosition cannot be greater than or equal to reference length
+    if (region.LeftPosition >= references.at(region.LeftRefID).RefLength)
+        throw BamException("BamStandardIndex::AdjustRegion", "invalid region requested");
+
+    // region 'begin' is simply the left position
+    begin = (unsigned int)region.LeftPosition;
+
+    // the right bound is usable only if specified AND on the same reference as the left bound;
+    // otherwise region 'end' falls back to the last reference base
+    const bool useRightBound =
+        region.isRightBoundSpecified() && (region.LeftRefID == region.RightRefID);
+    end = useRightBound ? (unsigned int)region.RightPosition
+                        : (unsigned int)references.at(region.LeftRefID).RefLength;
+}
+
+// [begin, end)
+// collects into @candidateBins the IDs of all bins that may hold alignments
+// overlapping the requested coordinates
+void BamStandardIndex::CalculateCandidateBins(const uint32_t& begin, const uint32_t& end,
+                                              std::set<uint16_t>& candidateBins)
+{
+    // bin '0' is always a valid bin
+    candidateBins.insert(0);
+
+    // each remaining bin level: ID of the level's first bin & the shift giving a bin's index
+    const unsigned int levelFirstBin[] = {1, 9, 73, 585, 4681};
+    const unsigned int levelShift[] = {26, 23, 20, 17, 14};
+
+    for (int level = 0; level < 5; ++level) {
+        const unsigned int firstBin = levelFirstBin[level] + (begin >> levelShift[level]);
+        const unsigned int lastBin = levelFirstBin[level] + (end >> levelShift[level]);
+        for (unsigned int bin = firstBin; bin <= lastBin; ++bin)
+            candidateBins.insert(bin);
+    }
+}
+
+// scans the bins of one reference and appends to @offsets the start offset of every
+// alignment chunk that belongs to a candidate bin and whose stop offset is >= @minOffset
+// NOTE: @candidateBins is consumed - each matched bin ID is erased from it
+void BamStandardIndex::CalculateCandidateOffsets(const BaiReferenceSummary& refSummary,
+                                                 const uint64_t& minOffset,
+                                                 std::set<uint16_t>& candidateBins,
+                                                 std::vector<int64_t>& offsets)
+{
+    // seek to first bin
+    Seek(refSummary.FirstBinFilePosition, SEEK_SET);
+
+    // iterate over reference bins
+    uint32_t binId;
+    int32_t chunkCount;
+    for (int binIndex = 0; binIndex < refSummary.NumBins; ++binIndex) {
+
+        // read bin contents (if successful, alignment chunks are now in m_buffer)
+        ReadBinIntoBuffer(binId, chunkCount);
+
+        // bins that are not candidates are of no interest
+        const std::set<uint16_t>::iterator match = candidateBins.find(binId);
+        if (match == candidateBins.end()) continue;
+
+        // candidate bin: check each of its alignment chunks for overlap
+        std::size_t bufferPosition = 0;
+        for (int chunkIndex = 0; chunkIndex < chunkCount; ++chunkIndex) {
+
+            // read chunk start & stop from buffer
+            uint64_t chunkStart;
+            uint64_t chunkStop;
+            memcpy((char*)&chunkStart, m_resources.Buffer + bufferPosition, sizeof(uint64_t));
+            bufferPosition += sizeof(uint64_t);
+            memcpy((char*)&chunkStop, m_resources.Buffer + bufferPosition, sizeof(uint64_t));
+            bufferPosition += sizeof(uint64_t);
+
+            // swap endian-ness if necessary
+            if (m_isBigEndian) {
+                SwapEndian_64(chunkStart);
+                SwapEndian_64(chunkStop);
+            }
+
+            // keep the chunk's start offset if the chunk ends at or after 'minOffset'
+            if (chunkStop >= minOffset) offsets.push_back(chunkStart);
+        }
+
+        // 'pop' bin ID from candidate bins set, and stop once none are left
+        candidateBins.erase(match);
+        if (candidateBins.empty()) break;
+    }
+}
+
+// returns the linear offset to use as the minimum file offset for position @begin
+// (0 if the reference has no linear offsets at all)
+uint64_t BamStandardIndex::CalculateMinOffset(const BaiReferenceSummary& refSummary,
+                                              const uint32_t& begin)
+{
+    // if no linear offsets exist, return 0
+    if (refSummary.NumLinearOffsets == 0) return 0;
+
+    // index of the linear offset for the requested start position,
+    // clamped to the last available linear offset if 'begin' lies beyond it
+    const int shiftedBegin = begin >> BamStandardIndex::BAM_LIDX_SHIFT;
+    const int offsetIndex = (shiftedBegin >= refSummary.NumLinearOffsets)
+                                ? refSummary.NumLinearOffsets - 1
+                                : shiftedBegin;
+
+    return LookupLinearOffset(refSummary, offsetIndex);
+}
+
+// makes sure @buffer can hold at least @requestedBytes bytes
+// if it is too small, it is replaced by a new buffer of @requestedBytes + 10 bytes
+// (previous contents are NOT preserved) and @bufferLength is updated to match
+// throws BamException if the allocation fails; in that case @buffer and
+// @bufferLength are left exactly as they were, so the old buffer stays valid
+void BamStandardIndex::CheckBufferSize(char*& buffer, unsigned int& bufferLength,
+                                       const unsigned int& requestedBytes)
+{
+    // current buffer is already large enough
+    if (requestedBytes <= bufferLength) return;
+
+    // allocate the replacement BEFORE touching the old buffer, so a failed allocation
+    // cannot leave the caller with a dangling pointer or a stale length
+    const unsigned int newLength = requestedBytes + 10;
+    char* newBuffer = 0;
+    try {
+        newBuffer = new char[newLength];
+    } catch (std::bad_alloc&) {
+        std::stringstream s;
+        s << "out of memory when allocating " << requestedBytes << " bytes";
+        throw BamException("BamStandardIndex::CheckBufferSize", s.str());
+    }
+
+    // swap in the new buffer
+    delete[] buffer;
+    buffer = newBuffer;
+    bufferLength = newLength;
+}
+
+// makes sure @buffer can hold at least @requestedBytes bytes (unsigned char overload)
+// if it is too small, it is replaced by a new buffer of @requestedBytes + 10 bytes
+// (previous contents are NOT preserved) and @bufferLength is updated to match
+// throws BamException if the allocation fails; in that case @buffer and
+// @bufferLength are left exactly as they were, so the old buffer stays valid
+void BamStandardIndex::CheckBufferSize(unsigned char*& buffer, unsigned int& bufferLength,
+                                       const unsigned int& requestedBytes)
+{
+    // current buffer is already large enough
+    if (requestedBytes <= bufferLength) return;
+
+    // allocate the replacement BEFORE touching the old buffer, so a failed allocation
+    // cannot leave the caller with a dangling pointer or a stale length
+    const unsigned int newLength = requestedBytes + 10;
+    unsigned char* newBuffer = 0;
+    try {
+        newBuffer = new unsigned char[newLength];
+    } catch (std::bad_alloc&) {
+        std::stringstream s;
+        s << "out of memory when allocating " << requestedBytes << " bytes";
+        throw BamException("BamStandardIndex::CheckBufferSize", s.str());
+    }
+
+    // swap in the new buffer
+    delete[] buffer;
+    buffer = newBuffer;
+    bufferLength = newLength;
+}
+
+// reads the first 4 bytes from the index device and verifies they are the BAI 'magic number'
+// throws BamException if the bytes cannot be read or do not match
+void BamStandardIndex::CheckMagicNumber()
+{
+
+    // read the 'magic number' bytes
+    char magic[4];
+    if (m_resources.Device->Read(magic, sizeof(magic)) != 4)
+        throw BamException("BamStandardIndex::CheckMagicNumber", "could not read BAI magic number");
+
+    // compare to expected value
+    const bool magicMatches = (strncmp(magic, BamStandardIndex::BAI_MAGIC, 4) == 0);
+    if (!magicMatches)
+        throw BamException("BamStandardIndex::CheckMagicNumber", "invalid BAI magic number");
+}
+
+// resets @refEntry to an empty state: no bins, no linear offsets, ID of -1
+void BamStandardIndex::ClearReferenceEntry(BaiReferenceEntry& refEntry)
+{
+    refEntry.LinearOffsets.clear();
+    refEntry.Bins.clear();
+    refEntry.ID = -1;
+}
+
+// closes the index device (if open) and drops all per-file state:
+// the index file summary and the I/O buffer
+void BamStandardIndex::CloseFile()
+{
+
+    // close & destroy the device, but only if it is currently open
+    const bool deviceIsOpen = IsDeviceOpen();
+    if (deviceIsOpen) {
+        m_resources.Device->Close();
+        delete m_resources.Device;
+        m_resources.Device = 0;
+    }
+
+    // clear index file summary data
+    m_indexFileSummary.clear();
+
+    // release the I/O buffer and reset its recorded length
+    delete[] m_resources.Buffer;
+    m_resources.Buffer = 0;
+    m_bufferLength = 0;
+}
+
+// builds index from associated BAM file & writes it out to an index file named <BAM filename> + Extension(); returns true on success, false (with an error string set) on any failure
+bool BamStandardIndex::Create()
+{
+
+ // skip if BamReader is invalid or not open
+ if (m_reader == 0 || !m_reader->IsOpen()) {
+ SetErrorString("BamStandardIndex::Create", "could not create index: reader is not open");
+ return false;
+ }
+
+ // rewind BamReader so that indexing starts from the reader's rewound position
+ if (!m_reader->Rewind()) {
+ const std::string readerError = m_reader->GetErrorString();
+ const std::string message = "could not create index: \n\t" + readerError;
+ SetErrorString("BamStandardIndex::Create", message);
+ return false;
+ }
+
+ try {  // helpers called below report failure by throwing BamException (caught at the end)
+
+ // open new index file (read & write)
+ std::string indexFilename = m_reader->Filename() + Extension();
+ OpenFile(indexFilename, IBamIODevice::ReadWrite);
+
+ // initialize BaiFileSummary with number of references
+ const int& numReferences = m_reader->GetReferenceCount();
+ ReserveForSummary(numReferences);
+
+ // initialize output file
+ WriteHeader();
+
+ // set up bin, ID, offset, & coordinate markers
+ const uint32_t defaultValue = 0xffffffffu;  // sentinel meaning "not set yet" for the markers below
+ uint32_t currentBin = defaultValue;
+ uint32_t lastBin = defaultValue;
+ int32_t currentRefID = defaultValue;
+ int32_t lastRefID = defaultValue;  // compared against (int32_t)defaultValue to detect the first pass
+ uint64_t currentOffset = (uint64_t)m_reader->Tell();  // start offset of the chunk currently being accumulated
+ uint64_t lastOffset = currentOffset;  // reader offset recorded after the previous alignment was loaded
+ int32_t lastPosition = defaultValue;
+
+ // iterate through alignments in BAM file
+ BamAlignment al;
+ BaiReferenceEntry refEntry;  // index data collected for the reference currently being processed
+ while (m_reader->LoadNextAlignment(al)) {
+
+ // changed to new reference
+ if (lastRefID != al.RefID) {
+
+ // if not first reference, save previous reference data
+ if (lastRefID != (int32_t)defaultValue) {
+
+ SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset);
+ WriteReferenceEntry(refEntry);
+ ClearReferenceEntry(refEntry);
+
+ // write any empty references between (but *NOT* including) lastRefID & al.RefID
+ for (int i = lastRefID + 1; i < al.RefID; ++i) {
+ BaiReferenceEntry emptyEntry(i);
+ WriteReferenceEntry(emptyEntry);
+ }
+
+ // update bin markers
+ currentOffset = lastOffset;
+ currentBin = al.Bin;
+ lastBin = al.Bin;
+ currentRefID = al.RefID;
+ }
+
+ // otherwise, this is first pass
+ // be sure to write any empty references up to (but *NOT* including) current RefID
+ else {
+ for (int i = 0; i < al.RefID; ++i) {
+ BaiReferenceEntry emptyEntry(i);
+ WriteReferenceEntry(emptyEntry);
+ }
+ }
+
+ // update reference markers
+ refEntry.ID = al.RefID;
+ lastRefID = al.RefID;
+ lastBin = defaultValue;  // resetting lastBin makes the "changed to new BAI bin" branch below run for this alignment
+ }
+
+ // if lastPosition greater than current alignment position - file not sorted properly
+ else if (lastPosition > al.Position) {
+ std::stringstream s;
+ s << "BAM file is not properly sorted by coordinate" << std::endl
+ << "Current alignment position: " << al.Position
+ << " < previous alignment position: " << lastPosition
+ << " on reference ID: " << al.RefID << std::endl;
+ SetErrorString("BamStandardIndex::Create", s.str());
+ return false;  // NOTE(review): returns without closing the index file opened above
+ }
+
+ // if alignment's ref ID is valid & its bin is not a 'leaf' (bin number below 4681)
+ if ((al.RefID >= 0) && (al.Bin < 4681))
+ SaveLinearOffsetEntry(refEntry.LinearOffsets, al.Position, al.GetEndPosition(),
+ lastOffset);
+
+ // changed to new BAI bin
+ if (al.Bin != lastBin) {
+
+ // if not first bin on reference, save previous bin data
+ if (currentBin != defaultValue)
+ SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset);
+
+ // update markers
+ currentOffset = lastOffset;
+ currentBin = al.Bin;
+ lastBin = al.Bin;
+ currentRefID = al.RefID;
+
+ // if invalid RefID, break out
+ if (currentRefID < 0) break;  // indexing stops at the first alignment with a negative RefID
+ }
+
+ // make sure that current file pointer is beyond lastOffset
+ if (m_reader->Tell() <= (int64_t)lastOffset) {
+ SetErrorString("BamStandardIndex::Create", "calculating offsets failed");
+ return false;  // NOTE(review): returns without closing the index file opened above
+ }
+
+ // update lastOffset & lastPosition
+ lastOffset = m_reader->Tell();
+ lastPosition = al.Position;
+ }
+
+ // after finishing alignments, if any data was read, check:
+ if (lastOffset != currentOffset) {
+
+ // store last alignment chunk to its bin, then write last reference entry with data
+ SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset);
+ WriteReferenceEntry(refEntry);
+ }
+
+ // then write any empty references remaining at end of file
+ for (int i = currentRefID + 1; i < numReferences; ++i) {
+ BaiReferenceEntry emptyEntry(i);
+ WriteReferenceEntry(emptyEntry);
+ }
+
+ } catch (BamException& e) {
+ m_errorString = e.what();  // the exception text becomes this index's error string
+ return false;
+ }
+
+ // rewind BamReader so the caller gets it back at its rewound position
+ if (!m_reader->Rewind()) {
+ const std::string readerError = m_reader->GetErrorString();
+ const std::string message = "could not create index: \n\t" + readerError;
+ SetErrorString("BamStandardIndex::Create", message);
+ return false;
+ }
+
+ // return success
+ return true;
+}
+
+// reports the filename suffix used for this index format
+const std::string BamStandardIndex::Extension()
+{
+    const std::string& suffix = BamStandardIndex::BAI_EXTENSION;
+    return suffix;
+}
+
+// Determines the BAM file offset at which reading should begin for @region.
+//
+// @param region                 requested region; only its left bound (LeftRefID / LeftPosition)
+//                               is consulted directly here
+// @param offset                 receives the chosen file offset; left untouched when the index
+//                               yields no candidate offsets
+// @param hasAlignmentsInRegion  receives the result of the last alignment load attempted while
+//                               probing candidate offsets; not modified when there are no candidates
+//
+// Throws BamException when the reference ID is out of range or when seeking in the BAM file fails.
+// Note that probing candidate offsets moves the reader's file position.
+void BamStandardIndex::GetOffset(const BamRegion& region, int64_t& offset,
+                                 bool* hasAlignmentsInRegion)
+{
+
+    // cannot calculate offsets if unknown/invalid reference ID requested
+    if (region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size())
+        throw BamException("BamStandardIndex::GetOffset", "invalid reference ID requested");
+
+    // retrieve index summary for left bound reference
+    const BaiReferenceSummary& refSummary = m_indexFileSummary.at(region.LeftRefID);
+
+    // set up region boundaries based on actual BamReader data
+    uint32_t begin;
+    uint32_t end;
+    AdjustRegion(region, begin, end);
+
+    // retrieve all candidate bin IDs for region
+    std::set<uint16_t> candidateBins;
+    CalculateCandidateBins(begin, end, candidateBins);
+
+    // use reference's linear offsets to calculate the minimum offset
+    // that must be considered to find overlap
+    const uint64_t& minOffset = CalculateMinOffset(refSummary, begin);
+
+    // attempt to use reference summary, minOffset, & candidateBins to calculate offsets
+    // no data should not be error, just bail
+    std::vector<int64_t> offsets;
+    CalculateCandidateOffsets(refSummary, minOffset, candidateBins, offsets);
+    if (offsets.empty()) return;
+
+    // ensure that offsets are sorted before processing
+    sort(offsets.begin(), offsets.end());
+
+    // binary search for an overlapping block (may not be first one though)
+    BamAlignment al;
+    typedef std::vector<int64_t>::const_iterator OffsetConstIterator;
+    OffsetConstIterator offsetFirst = offsets.begin();
+    OffsetConstIterator offsetIter = offsetFirst;
+    OffsetConstIterator offsetLast = offsets.end();
+    std::iterator_traits<OffsetConstIterator>::difference_type count =
+        distance(offsetFirst, offsetLast);
+    std::iterator_traits<OffsetConstIterator>::difference_type step;
+    while (count > 0) {
+        offsetIter = offsetFirst;
+        step = count / 2;
+        advance(offsetIter, step);
+
+        // attempt seek to candidate offset
+        const int64_t& candidateOffset = (*offsetIter);
+        if (!m_reader->Seek(candidateOffset)) {
+            const std::string readerError = m_reader->GetErrorString();
+            const std::string message = "could not seek in BAM file: \n\t" + readerError;
+            // report the failure under this class's name (the message previously
+            // pointed at BamToolsIndex, which misdirects anyone reading the error)
+            throw BamException("BamStandardIndex::GetOffset", message);
+        }
+
+        // load first available alignment, setting flag to true if data exists
+        *hasAlignmentsInRegion = m_reader->LoadNextAlignment(al);
+
+        // check alignment against region:
+        // an alignment ending at or before the left bound means the answer lies in the upper half
+        if (al.GetEndPosition() <= region.LeftPosition) {
+            offsetFirst = ++offsetIter;
+            count -= step + 1;
+        } else
+            count = step;
+    }
+
+    // step back to the offset before the 'current offset' (to make sure we cover overlaps)
+    if (offsetIter != offsets.begin()) --offsetIter;
+    offset = (*offsetIter);
+}
+
+// tells whether the loaded index summary records at least one bin for @referenceID;
+// IDs outside the summary's range simply yield false
+bool BamStandardIndex::HasAlignments(const int& referenceID) const
+{
+    const int numReferences = (int)m_indexFileSummary.size();
+    const bool isKnownReference = (referenceID >= 0) && (referenceID < numReferences);
+    if (!isKnownReference) return false;
+
+    return m_indexFileSummary.at(referenceID).NumBins > 0;
+}
+
+// true only when an index I/O device exists and that device reports itself open
+bool BamStandardIndex::IsDeviceOpen() const
+{
+    return (m_resources.Device != 0) && m_resources.Device->IsOpen();
+}
+
+// Positions the reader for @region using the index data. The return value reports
+// only whether an error occurred; whether any alignments are available after the
+// jump position is reported separately through @hasAlignmentsInRegion.
+bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion)
+{
+    // start from "no data" until the offset lookup says otherwise
+    *hasAlignmentsInRegion = false;
+
+    // a missing or closed reader cannot be repositioned
+    const bool readerUsable = (m_reader != 0) && m_reader->IsOpen();
+    if (!readerUsable) {
+        SetErrorString("BamStandardIndex::Jump", "could not jump: reader is not open");
+        return false;
+    }
+
+    // ask the index for the nearest usable offset; a lookup failure becomes a 'false' return
+    int64_t target;
+    try {
+        GetOffset(region, target, hasAlignmentsInRegion);
+    } catch (BamException& e) {
+        m_errorString = e.what();
+        return false;
+    }
+
+    // with no alignments flagged there is nothing to seek to, which still counts as success
+    // (the flag stays false for the caller); otherwise the seek's outcome is the jump's outcome
+    return !(*hasAlignmentsInRegion) || m_reader->Seek(target);
+}
+
+// opens the index file @filename read-only, validates its magic number, and builds the
+// in-memory summary of its contents; returns false (storing the exception text as the
+// error string) if any of those steps throws BamException
+bool BamStandardIndex::Load(const std::string& filename)
+{
+    try {
+        OpenFile(filename, IBamIODevice::ReadOnly);  // read-only access is enough for loading
+        CheckMagicNumber();                          // reject files that are not in this format
+        SummarizeIndexFile();                        // record per-reference summary data
+    } catch (BamException& e) {
+        m_errorString = e.what();
+        return false;
+    }
+
+    // every step completed without throwing
+    return true;
+}
+
+// fetches a single linear offset, selected by @index, for the reference described by
+// @refSummary, reading it directly from the index file (which moves the file position);
+// failures in the seek/read helpers surface as BamException
+uint64_t BamStandardIndex::LookupLinearOffset(const BaiReferenceSummary& refSummary,
+                                              const int& index)
+{
+    // entry location = recorded position of the first linear offset + index * entry size
+    const int64_t tableStart = (int64_t)refSummary.FirstLinearOffsetFilePosition;
+    const int64_t entryPosition = tableStart + index * BamStandardIndex::SIZEOF_LINEAROFFSET;
+    Seek(entryPosition, SEEK_SET);
+
+    // pull the value stored at that location
+    uint64_t value;
+    ReadLinearOffset(value);
+    return value;
+}
+
+// Simplifies @chunks in place: walking the list in order, a chunk is fused into the
+// most recently kept chunk whenever that chunk's (Stop >> 16) equals the next chunk's
+// (Start >> 16); otherwise the next chunk is kept as a separate entry.
+void BamStandardIndex::MergeAlignmentChunks(BaiAlignmentChunkVector& chunks)
+{
+    // an empty list has nothing to merge
+    if (chunks.empty()) return;
+
+    // result list, seeded with the first chunk
+    BaiAlignmentChunkVector merged;
+    merged.push_back(chunks.front());
+
+    for (BaiAlignmentChunkVector::size_type k = 1; k < chunks.size(); ++k) {
+        const BaiAlignmentChunk& incoming = chunks[k];
+
+        // the most recently kept chunk is the only merge candidate
+        BaiAlignmentChunk& tail = merged.back();
+
+        const bool touching = (tail.Stop >> 16) == (incoming.Start >> 16);
+        if (touching)
+            tail.Stop = incoming.Stop;  // extend the kept chunk to cover the incoming one
+        else
+            merged.push_back(incoming);  // becomes the new merge candidate
+    }
+
+    // hand the simplified list back through the parameter
+    chunks = merged;
+}
+
+// closes any index file currently held, then creates a device for @filename and opens it
+// in @mode; throws BamException if no device could be created or the device is not open
+// after the attempt
+void BamStandardIndex::OpenFile(const std::string& filename, IBamIODevice::OpenMode mode)
+{
+    // release whatever index file was open before
+    CloseFile();
+
+    m_resources.Device = BamDeviceFactory::CreateDevice(filename);
+
+    // only try to open when the factory actually produced a device
+    bool opened = false;
+    if (m_resources.Device != 0) {
+        m_resources.Device->Open(mode);
+        opened = IsDeviceOpen();
+    }
+
+    // both failure modes are reported identically
+    if (!opened) {
+        const std::string message = std::string("could not open file: ") + filename;
+        throw BamException("BamStandardIndex::OpenFile", message);
+    }
+}
+
+// reads one bin ID from the index file into @binId; throws BamException on a short read
+// (byte-swapping is applied before the length check, so @binId may be modified even
+// when this throws)
+void BamStandardIndex::ReadBinID(uint32_t& binId)
+{
+    char* const destination = (char*)&binId;
+    const int64_t bytesObtained = m_resources.Device->Read(destination, sizeof(binId));
+
+    if (m_isBigEndian) SwapEndian_32(binId);
+
+    const bool complete = (bytesObtained == sizeof(binId));
+    if (!complete) throw BamException("BamStandardIndex::ReadBinID", "could not read BAI bin ID");
+}
+
+// Reads one bin from the index file: its ID into @binId, its alignment-chunk count into
+// @numAlignmentChunks, and the raw chunk data into the internal I/O buffer.
+//
+// Throws BamException if any read fails, or if the chunk count read from the file is
+// negative or so large that the resulting byte count does not fit the buffer's size type
+// (either indicates a corrupt index file; without the check such a value would be
+// silently converted into a meaningless, typically enormous, read request).
+void BamStandardIndex::ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks)
+{
+
+    // read bin header
+    ReadBinID(binId);
+    ReadNumAlignmentChunks(numAlignmentChunks);
+
+    // a chunk count comes straight from the file, so validate it before sizing a read
+    if (numAlignmentChunks < 0)
+        throw BamException("BamStandardIndex::ReadBinIntoBuffer", "invalid BAI chunk count");
+
+    // compute the byte count in 64 bits so the multiplication cannot overflow
+    const uint64_t bytesNeeded = static_cast<uint64_t>(numAlignmentChunks) *
+                                 static_cast<uint64_t>(BamStandardIndex::SIZEOF_ALIGNMENTCHUNK);
+    const unsigned int bytesRequested = static_cast<unsigned int>(bytesNeeded);
+    if (bytesRequested != bytesNeeded)
+        throw BamException("BamStandardIndex::ReadBinIntoBuffer", "invalid BAI chunk count");
+
+    // read bin contents
+    ReadIntoBuffer(bytesRequested);
+}
+
+// fills the internal I/O buffer with exactly @bytesRequested bytes from the index file,
+// after making sure the buffer can hold them; throws BamException when the device
+// delivers a different number of bytes
+void BamStandardIndex::ReadIntoBuffer(const unsigned int& bytesRequested)
+{
+    // buffer capacity is checked against the request before reading
+    BamStandardIndex::CheckBufferSize(m_resources.Buffer, m_bufferLength, bytesRequested);
+
+    const int64_t bytesRead = m_resources.Device->Read(m_resources.Buffer, bytesRequested);
+    if (bytesRead == static_cast<int64_t>(bytesRequested)) return;
+
+    // mismatch: report both the expected and the actual byte counts
+    std::stringstream report;
+    report << "expected to read: " << bytesRequested << " bytes, "
+           << "but instead read: " << bytesRead;
+    throw BamException("BamStandardIndex::ReadIntoBuffer", report.str());
+}
+
+// reads one 64-bit linear offset from the index file into @linearOffset;
+// throws BamException on a short read (the value is byte-swapped before that check)
+void BamStandardIndex::ReadLinearOffset(uint64_t& linearOffset)
+{
+    char* const destination = (char*)&linearOffset;
+    const int64_t bytesObtained = m_resources.Device->Read(destination, sizeof(linearOffset));
+
+    if (m_isBigEndian) SwapEndian_64(linearOffset);
+
+    const bool complete = (bytesObtained == sizeof(linearOffset));
+    if (!complete)
+        throw BamException("BamStandardIndex::ReadLinearOffset",
+                           "could not read BAI linear offset");
+}
+
+// reads a bin's alignment-chunk count from the index file into @numAlignmentChunks;
+// throws BamException on a short read (the value is byte-swapped before that check)
+void BamStandardIndex::ReadNumAlignmentChunks(int& numAlignmentChunks)
+{
+    char* const destination = (char*)&numAlignmentChunks;
+    const int64_t bytesObtained =
+        m_resources.Device->Read(destination, sizeof(numAlignmentChunks));
+
+    if (m_isBigEndian) SwapEndian_32(numAlignmentChunks);
+
+    const bool complete = (bytesObtained == sizeof(numAlignmentChunks));
+    if (!complete)
+        throw BamException("BamStandardIndex::ReadNumAlignmentChunks",
+                           "could not read BAI chunk count");
+}
+
+// reads a reference's bin count from the index file into @numBins;
+// throws BamException on a short read (the value is byte-swapped before that check)
+void BamStandardIndex::ReadNumBins(int& numBins)
+{
+    char* const destination = (char*)&numBins;
+    const int64_t bytesObtained = m_resources.Device->Read(destination, sizeof(numBins));
+
+    if (m_isBigEndian) SwapEndian_32(numBins);
+
+    const bool complete = (bytesObtained == sizeof(numBins));
+    if (!complete)
+        throw BamException("BamStandardIndex::ReadNumBins", "could not read BAI bin count");
+}
+
+// Reads a reference's linear-offset count from the index file into @numLinearOffsets.
+//
+// The value is byte-swapped on big-endian hosts before the length check, so the output
+// parameter may be modified even when this throws.
+// Throws BamException if fewer bytes than sizeof(numLinearOffsets) could be read.
+void BamStandardIndex::ReadNumLinearOffsets(int& numLinearOffsets)
+{
+    const int64_t numBytesRead =
+        m_resources.Device->Read((char*)&numLinearOffsets, sizeof(numLinearOffsets));
+    if (m_isBigEndian) SwapEndian_32(numLinearOffsets);
+    if (numBytesRead != sizeof(numLinearOffsets))
+        // attribute the failure to this method (it previously named ReadNumAlignmentChunks,
+        // sending anyone debugging a bad index file to the wrong reader)
+        throw BamException("BamStandardIndex::ReadNumLinearOffsets",
+                           "could not read BAI linear offset count");
+}
+
+// reads the reference-sequence count from the index file into @numReferences;
+// throws BamException on a short read (the value is byte-swapped before that check)
+void BamStandardIndex::ReadNumReferences(int& numReferences)
+{
+    char* const destination = (char*)&numReferences;
+    const int64_t bytesObtained = m_resources.Device->Read(destination, sizeof(numReferences));
+
+    if (m_isBigEndian) SwapEndian_32(numReferences);
+
+    const bool complete = (bytesObtained == sizeof(numReferences));
+    if (!complete)
+        throw BamException("BamStandardIndex::ReadNumReferences", "could not read reference count");
+}
+
+// resets the in-memory file summary to @numReferences default-constructed entries
+void BamStandardIndex::ReserveForSummary(const int& numReferences)
+{
+    const BaiReferenceSummary blankSummary;
+    m_indexFileSummary.clear();
+    m_indexFileSummary.assign(numReferences, blankSummary);
+}
+
+// appends the alignment chunk [currentOffset, lastOffset] to the chunk list stored in
+// @binMap under @currentBin, creating that bin's list first if it does not exist yet
+void BamStandardIndex::SaveAlignmentChunkToBin(BaiBinMap& binMap, const uint32_t& currentBin,
+                                               const uint64_t& currentOffset,
+                                               const uint64_t& lastOffset)
+{
+    // operator[] yields the bin's existing chunk list, or a freshly inserted empty one
+    BaiAlignmentChunkVector& binChunks = binMap[currentBin];
+    binChunks.push_back(BaiAlignmentChunk(currentOffset, lastOffset));
+}
+
+// records, for reference @refId, the bin count and the index file's current position
+// (taken as the location of that reference's first bin)
+void BamStandardIndex::SaveBinsSummary(const int& refId, const int& numBins)
+{
+    // at() performs a range check on refId
+    BaiReferenceSummary& summary = m_indexFileSummary.at(refId);
+    summary.NumBins = numBins;
+
+    const int64_t positionNow = Tell();
+    summary.FirstBinFilePosition = positionNow;
+}
+
+// updates the linear-offset table @offsets for an alignment spanning
+// [alignmentStartPosition, alignmentStopPosition): positions are converted to window
+// numbers by shifting right by BAM_LIDX_SHIFT, the table is grown (zero-filled) to reach
+// the last window, and every still-zero window after the first one, up to and including
+// the last, is set to @lastOffset
+void BamStandardIndex::SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets,
+                                             const int& alignmentStartPosition,
+                                             const int& alignmentStopPosition,
+                                             const uint64_t& lastOffset)
+{
+    // translate alignment coordinates into window numbers
+    const int firstWindow = alignmentStartPosition >> BamStandardIndex::BAM_LIDX_SHIFT;
+    const int lastWindow = (alignmentStopPosition - 1) >> BamStandardIndex::BAM_LIDX_SHIFT;
+
+    // make sure lastWindow is a valid index
+    const int requiredSize = lastWindow + 1;
+    const int presentSize = offsets.size();
+    if (presentSize < requiredSize) offsets.resize(requiredSize, 0);
+
+    // zero marks a window that has not been assigned an offset yet
+    int window = firstWindow + 1;
+    while (window <= lastWindow) {
+        if (offsets[window] == 0) offsets[window] = lastOffset;
+        ++window;
+    }
+}
+
+// records, for reference @refId, the linear-offset count and the index file's current
+// position (taken as the location of that reference's first linear offset)
+void BamStandardIndex::SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets)
+{
+    // at() performs a range check on refId
+    BaiReferenceSummary& summary = m_indexFileSummary.at(refId);
+    summary.NumLinearOffsets = numLinearOffsets;
+
+    const int64_t positionNow = Tell();
+    summary.FirstLinearOffsetFilePosition = positionNow;
+}
+
+// moves the index file's position to @position (interpreted relative to @origin);
+// throws BamException when the device reports that the seek failed
+void BamStandardIndex::Seek(const int64_t& position, const int origin)
+{
+    const bool moved = m_resources.Device->Seek(position, origin);
+    if (moved) return;
+
+    throw BamException("BamStandardIndex::Seek", "could not seek in BAI file");
+}
+
+// advances past @numBins bins in the index file by reading each one and discarding it
+void BamStandardIndex::SkipBins(const int& numBins)
+{
+    // placeholders for values that are read but never used
+    uint32_t ignoredBinId;
+    int32_t ignoredChunkCount;
+
+    for (int remaining = numBins; remaining > 0; --remaining)
+        ReadBinIntoBuffer(ignoredBinId, ignoredChunkCount);
+}
+
+// Advances past @numLinearOffsets linear offsets in the index file by reading them into
+// the internal I/O buffer (the data itself is ignored).
+//
+// Throws BamException if the read fails, or if @numLinearOffsets is negative or so large
+// that the byte count does not fit the buffer's size type (either indicates a corrupt
+// index file; without the check such a value would be silently converted into a
+// meaningless, typically enormous, read request).
+void BamStandardIndex::SkipLinearOffsets(const int& numLinearOffsets)
+{
+    // the count originates from the file, so validate it before sizing a read
+    if (numLinearOffsets < 0)
+        throw BamException("BamStandardIndex::SkipLinearOffsets",
+                           "invalid BAI linear offset count");
+
+    // compute the byte count in 64 bits so the multiplication cannot overflow
+    const uint64_t bytesNeeded = static_cast<uint64_t>(numLinearOffsets) *
+                                 static_cast<uint64_t>(BamStandardIndex::SIZEOF_LINEAROFFSET);
+    const unsigned int bytesRequested = static_cast<unsigned int>(bytesNeeded);
+    if (bytesRequested != bytesNeeded)
+        throw BamException("BamStandardIndex::SkipLinearOffsets",
+                           "invalid BAI linear offset count");
+
+    ReadIntoBuffer(bytesRequested);
+}
+
+// puts @linearOffsets into ascending order, in place
+void BamStandardIndex::SortLinearOffsets(BaiLinearOffsetVector& linearOffsets)
+{
+    BaiLinearOffsetVector::iterator first = linearOffsets.begin();
+    BaiLinearOffsetVector::iterator last = linearOffsets.end();
+    std::sort(first, last);
+}
+
+// reads a reference's bin count from the index file, stores it in @refSummary together
+// with the file position of the first bin, then moves past all of that reference's bins
+void BamStandardIndex::SummarizeBins(BaiReferenceSummary& refSummary)
+{
+    int binCount;
+    ReadNumBins(binCount);
+
+    // the file position right after the count is where the first bin starts
+    refSummary.NumBins = binCount;
+    refSummary.FirstBinFilePosition = Tell();
+
+    SkipBins(binCount);
+}
+
+// reads the reference count from the index file, sizes the in-memory summary to match,
+// then summarizes each reference in file order
+void BamStandardIndex::SummarizeIndexFile()
+{
+    int referenceCount;
+    ReadNumReferences(referenceCount);
+
+    // one default summary entry per reference
+    ReserveForSummary(referenceCount);
+
+    for (BaiFileSummary::size_type r = 0; r < m_indexFileSummary.size(); ++r)
+        SummarizeReference(m_indexFileSummary[r]);
+}
+
+// reads a reference's linear-offset count from the index file, stores it in @refSummary
+// together with the file position of the first linear offset, then moves past all of
+// that reference's linear offsets
+void BamStandardIndex::SummarizeLinearOffsets(BaiReferenceSummary& refSummary)
+{
+    int offsetCount;
+    ReadNumLinearOffsets(offsetCount);
+
+    // the file position right after the count is where the first linear offset starts
+    refSummary.NumLinearOffsets = offsetCount;
+    refSummary.FirstLinearOffsetFilePosition = Tell();
+
+    SkipLinearOffsets(offsetCount);
+}
+
+// summarizes one reference's section of the index file into @refSummary:
+// the bins come first, followed by the linear offsets
+void BamStandardIndex::SummarizeReference(BaiReferenceSummary& refSummary)
+{
+    SummarizeBins(refSummary);           // bin count & first-bin position
+    SummarizeLinearOffsets(refSummary);  // linear-offset count & first-offset position
+}
+
+// reports the current position within the index file, as given by the I/O device
+int64_t BamStandardIndex::Tell() const
+{
+    IBamIODevice* const device = m_resources.Device;
+    return device->Tell();
+}
+
+// writes @chunk to the index file as two consecutive 64-bit values (Start, then Stop),
+// byte-swapped on big-endian hosts; throws BamException if the total number of bytes
+// written differs from the expected size
+void BamStandardIndex::WriteAlignmentChunk(const BaiAlignmentChunk& chunk)
+{
+    // work on copies so the caller's chunk is never byte-swapped
+    uint64_t startValue = chunk.Start;
+    uint64_t stopValue = chunk.Stop;
+    if (m_isBigEndian) {
+        SwapEndian_64(startValue);
+        SwapEndian_64(stopValue);
+    }
+
+    // both writes are always attempted; only the combined byte count is checked
+    const int64_t startBytes =
+        m_resources.Device->Write((const char*)&startValue, sizeof(startValue));
+    const int64_t stopBytes =
+        m_resources.Device->Write((const char*)&stopValue, sizeof(stopValue));
+
+    const int64_t expectedBytes = static_cast<int64_t>(sizeof(startValue) + sizeof(stopValue));
+    if (startBytes + stopBytes != expectedBytes)
+        throw BamException("BamStandardIndex::WriteAlignmentChunk",
+                           "could not write BAI alignment chunk");
+}
+
+// merges @chunks (modifying the caller's list), then writes the resulting chunk count
+// followed by every chunk to the index file; throws BamException when the count cannot
+// be written in full
+void BamStandardIndex::WriteAlignmentChunks(BaiAlignmentChunkVector& chunks)
+{
+    // simplify the list first so the count written below matches what follows
+    MergeAlignmentChunks(chunks);
+
+    // chunk count, byte-swapped on big-endian hosts
+    int32_t chunkCount = chunks.size();
+    if (m_isBigEndian) SwapEndian_32(chunkCount);
+    const int64_t countBytes =
+        m_resources.Device->Write((const char*)&chunkCount, sizeof(chunkCount));
+    if (countBytes != sizeof(chunkCount))
+        throw BamException("BamStandardIndex::WriteAlignmentChunks",
+                           "could not write BAI chunk count");
+
+    // chunk data, in list order
+    for (BaiAlignmentChunkVector::size_type k = 0; k < chunks.size(); ++k)
+        WriteAlignmentChunk(chunks[k]);
+}
+
+// writes one bin to the index file: its ID (byte-swapped on big-endian hosts) followed
+// by its alignment chunks; throws BamException when the ID cannot be written in full
+void BamStandardIndex::WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks)
+{
+    // swap a copy, never the caller's value
+    uint32_t encodedId = binId;
+    if (m_isBigEndian) SwapEndian_32(encodedId);
+
+    const int64_t idBytes = m_resources.Device->Write((const char*)&encodedId, sizeof(encodedId));
+    if (idBytes != sizeof(encodedId))
+        throw BamException("BamStandardIndex::WriteBin", "could not write bin ID");
+
+    // the bin's chunk list follows its ID
+    WriteAlignmentChunks(chunks);
+}
+
+// writes all bins of reference @refId to the index file: the bin count first, then each
+// bin in map order; the summary for the reference is saved right after the count is
+// written, i.e. at the file position where the first bin begins
+void BamStandardIndex::WriteBins(const int& refId, BaiBinMap& bins)
+{
+    // bin count, byte-swapped on big-endian hosts
+    int32_t encodedCount = bins.size();
+    if (m_isBigEndian) SwapEndian_32(encodedCount);
+    const int64_t countBytes =
+        m_resources.Device->Write((const char*)&encodedCount, sizeof(encodedCount));
+    if (countBytes != sizeof(encodedCount))
+        throw BamException("BamStandardIndex::WriteBins", "could not write bin count");
+
+    // must happen before any bin is written, since it captures the current file position
+    SaveBinsSummary(refId, bins.size());
+
+    BaiBinMap::iterator bin = bins.begin();
+    const BaiBinMap::iterator binsEnd = bins.end();
+    while (bin != binsEnd) {
+        WriteBin(bin->first, bin->second);
+        ++bin;
+    }
+}
+
+// writes the index file header: the 4-byte magic number followed by the number of
+// reference sequences (taken from the size of the in-memory summary); throws
+// BamException if the combined byte count written is not what was expected
+void BamStandardIndex::WriteHeader()
+{
+    // magic number
+    const int64_t magicBytes = m_resources.Device->Write(BamStandardIndex::BAI_MAGIC, 4);
+
+    // reference count, byte-swapped on big-endian hosts
+    int32_t referenceCount = m_indexFileSummary.size();
+    if (m_isBigEndian) SwapEndian_32(referenceCount);
+    const int64_t countBytes =
+        m_resources.Device->Write((const char*)&referenceCount, sizeof(referenceCount));
+
+    // both writes are always attempted; only their total is checked
+    const int64_t expectedBytes = static_cast<int64_t>(sizeof(referenceCount) + 4);
+    if (magicBytes + countBytes != expectedBytes)
+        throw BamException("BamStandardIndex::WriteHeader", "could not write BAI header");
+}
+
+// sorts @linearOffsets (modifying the caller's list), then writes their count followed
+// by every offset to the index file; the summary for reference @refId is saved right
+// after the count is written, i.e. at the file position where the first offset begins;
+// throws BamException if the total number of bytes written is not what was expected
+void BamStandardIndex::WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets)
+{
+    SortLinearOffsets(linearOffsets);
+
+    // offset count, byte-swapped on big-endian hosts
+    int32_t encodedCount = linearOffsets.size();
+    if (m_isBigEndian) SwapEndian_32(encodedCount);
+    int64_t bytesWritten =
+        m_resources.Device->Write((const char*)&encodedCount, sizeof(encodedCount));
+
+    // must happen before any offset is written, since it captures the current file position
+    SaveLinearOffsetsSummary(refId, linearOffsets.size());
+
+    // offsets, in sorted order; each is swapped as a copy so the list itself stays intact
+    for (BaiLinearOffsetVector::size_type k = 0; k < linearOffsets.size(); ++k) {
+        uint64_t encodedOffset = linearOffsets[k];
+        if (m_isBigEndian) SwapEndian_64(encodedOffset);
+        bytesWritten +=
+            m_resources.Device->Write((const char*)&encodedOffset, sizeof(encodedOffset));
+    }
+
+    // every write is attempted; only the grand total is checked
+    const int64_t expectedBytes =
+        static_cast<int64_t>(sizeof(encodedCount) + linearOffsets.size() * sizeof(uint64_t));
+    if (bytesWritten != expectedBytes)
+        throw BamException("BamStandardIndex::WriteLinearOffsets",
+                           "could not write BAI linear offsets");
+}
+
+// writes one reference's index data to the index file: its bins, then its linear offsets
+void BamStandardIndex::WriteReferenceEntry(BaiReferenceEntry& refEntry)
+{
+    const int32_t& referenceId = refEntry.ID;
+    WriteBins(referenceId, refEntry.Bins);
+    WriteLinearOffsets(referenceId, refEntry.LinearOffsets);
+}
diff --git a/src/api/internal/index/BamStandardIndex_p.h b/src/api/internal/index/BamStandardIndex_p.h
new file mode 100644
index 0000000..514b638
--- /dev/null
+++ b/src/api/internal/index/BamStandardIndex_p.h
@@ -0,0 +1,236 @@
+// ***************************************************************************
+// BamStandardIndex_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides index operations for the standardized BAM index format (".bai")
+// ***************************************************************************
+
+#ifndef BAM_STANDARD_INDEX_FORMAT_H
+#define BAM_STANDARD_INDEX_FORMAT_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+#include "api/BamAux.h"
+#include "api/BamIndex.h"
+#include "api/IBamIODevice.h"
+
+namespace BamTools {
+namespace Internal {
+
+// -----------------------------------------------------------------------------
+// BamStandardIndex data structures
+
+// marks a contiguous run of alignments by the offsets where it starts and stops
+struct BaiAlignmentChunk
+{
+    uint64_t Start;  // offset at which the run begins
+    uint64_t Stop;   // offset at which the run ends
+
+    // either offset defaults to zero when omitted
+    BaiAlignmentChunk(const uint64_t& start = 0, const uint64_t& stop = 0)
+        : Start(start), Stop(stop)
+    {}
+};
+
+// orders alignment chunks by their Start offset alone (Stop is not considered)
+inline bool operator<(const BaiAlignmentChunk& lhs, const BaiAlignmentChunk& rhs)
+{
+    return rhs.Start > lhs.Start;
+}
+
+// convenience typedef for the list of all alignment 'chunks' belonging to one BAI bin
+typedef std::vector<BaiAlignmentChunk> BaiAlignmentChunkVector;
+
+// convenience typedef for a map of all BAI bins in a reference (bin ID => that bin's chunks), ordered by bin ID
+typedef std::map<uint32_t, BaiAlignmentChunkVector> BaiBinMap;
+
+// convenience typedef for the list of all 'linear offsets' in a reference (one 64-bit offset per entry)
+typedef std::vector<uint64_t> BaiLinearOffsetVector;
+
+// bundles the complete BAI index data of a single reference
+// (used while building, loading, & writing an index)
+struct BaiReferenceEntry
+{
+    int32_t ID;                           // reference ID this entry belongs to
+    BaiBinMap Bins;                       // bin ID => alignment chunks
+    BaiLinearOffsetVector LinearOffsets;  // linear offsets for the reference
+
+    // starts out with no bins and no linear offsets; ID is -1 unless one is supplied
+    BaiReferenceEntry(const int32_t& id = -1)
+        : ID(id), Bins(), LinearOffsets()
+    {}
+};
+
+// compact, persistent description of one reference's index data: how many bins and
+// linear offsets it has, and where in the index file each group begins
+struct BaiReferenceSummary
+{
+    int NumBins;                             // number of bins
+    int NumLinearOffsets;                    // number of linear offsets
+    uint64_t FirstBinFilePosition;           // index file position of the first bin
+    uint64_t FirstLinearOffsetFilePosition;  // index file position of the first linear offset
+
+    // everything starts at zero
+    BaiReferenceSummary()
+        : NumBins(0), NumLinearOffsets(0), FirstBinFilePosition(0), FirstLinearOffsetFilePosition(0)
+    {}
+};
+
+// convenience typedef for describing a full BAI index file summary (one BaiReferenceSummary per entry)
+typedef std::vector<BaiReferenceSummary> BaiFileSummary;
+
+// end BamStandardIndex data structures
+// -----------------------------------------------------------------------------
+
+class BamStandardIndex : public BamIndex  // BamIndex implementation for the standardized BAM index format (".bai")
+{
+
+ // ctor & dtor
+public:
+ BamStandardIndex(Internal::BamReaderPrivate* reader);
+ ~BamStandardIndex();
+
+ // BamIndex implementation
+public:
+ // builds index from associated BAM file & writes out to index file; returns success/fail
+ bool Create();
+ // returns whether reference has alignments or not (false for an out-of-range reference ID)
+ bool HasAlignments(const int& referenceID) const;
+ // attempts to use index data to jump to @region, returns success/fail
+ // a "successful" jump indicates no error, but not whether this region has data
+ // * thus, the method sets a flag to indicate whether there are alignments
+ // available after the jump position
+ bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
+ // opens & validates the index file, then loads a summary of its data into memory; returns success/fail
+ bool Load(const std::string& filename);
+ BamIndex::IndexType Type() const  // identifies this index as the STANDARD type
+ {
+ return BamIndex::STANDARD;
+ }
+
+public:
+ // returns format's file extension
+ static const std::string Extension();
+
+ // internal methods (these report failure by throwing BamException where the implementation shows a throw)
+private:
+ // index file ops
+ void CheckMagicNumber();
+ void CloseFile();
+ bool IsDeviceOpen() const;  // true only if a device exists and reports itself open
+ void OpenFile(const std::string& filename, IBamIODevice::OpenMode mode);  // closes any previously opened index file first
+ void Seek(const int64_t& position, const int origin);
+ int64_t Tell() const;
+
+ // BAI index building methods
+ void ClearReferenceEntry(BaiReferenceEntry& refEntry);
+ void SaveAlignmentChunkToBin(BaiBinMap& binMap, const uint32_t& currentBin,
+ const uint64_t& currentOffset, const uint64_t& lastOffset);
+ void SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets, const int& alignmentStartPosition,
+ const int& alignmentStopPosition, const uint64_t& lastOffset);
+
+ // random-access methods
+ void AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end);
+ void CalculateCandidateBins(const uint32_t& begin, const uint32_t& end,
+ std::set<uint16_t>& candidateBins);
+ void CalculateCandidateOffsets(const BaiReferenceSummary& refSummary, const uint64_t& minOffset,
+ std::set<uint16_t>& candidateBins,
+ std::vector<int64_t>& offsets);
+ uint64_t CalculateMinOffset(const BaiReferenceSummary& refSummary, const uint32_t& begin);
+ void GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion);  // used by Jump() to find where to seek
+ uint64_t LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index);  // reads the value from the index file
+
+ // BAI summary (create/load) methods
+ void ReserveForSummary(const int& numReferences);
+ void SaveBinsSummary(const int& refId, const int& numBins);
+ void SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets);
+ void SkipBins(const int& numBins);
+ void SkipLinearOffsets(const int& numLinearOffsets);
+ void SummarizeBins(BaiReferenceSummary& refSummary);
+ void SummarizeIndexFile();
+ void SummarizeLinearOffsets(BaiReferenceSummary& refSummary);
+ void SummarizeReference(BaiReferenceSummary& refSummary);
+
+ // BAI full index input methods
+ void ReadBinID(uint32_t& binId);
+ void ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks);
+ void ReadIntoBuffer(const unsigned int& bytesRequested);
+ void ReadLinearOffset(uint64_t& linearOffset);
+ void ReadNumAlignmentChunks(int& numAlignmentChunks);
+ void ReadNumBins(int& numBins);
+ void ReadNumLinearOffsets(int& numLinearOffsets);
+ void ReadNumReferences(int& numReferences);
+
+ // BAI full index output methods
+ void MergeAlignmentChunks(BaiAlignmentChunkVector& chunks);  // modifies @chunks in place
+ void SortLinearOffsets(BaiLinearOffsetVector& linearOffsets);  // modifies @linearOffsets in place
+ void WriteAlignmentChunk(const BaiAlignmentChunk& chunk);
+ void WriteAlignmentChunks(BaiAlignmentChunkVector& chunks);
+ void WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks);
+ void WriteBins(const int& refId, BaiBinMap& bins);
+ void WriteHeader();
+ void WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets);
+ void WriteReferenceEntry(BaiReferenceEntry& refEntry);
+
+ // data members
+private:
+ bool m_isBigEndian;  // when true, multi-byte values are byte-swapped on read & write
+ BaiFileSummary m_indexFileSummary;  // per-reference summary, indexed by reference ID
+
+ // our input buffer
+ unsigned int m_bufferLength;  // length tracked for m_resources.Buffer
+ struct RaiiWrapper  // holds the index file device & I/O buffer; NOTE(review): its destructor presumably releases both - confirm in the .cpp
+ {
+ IBamIODevice* Device;
+ char* Buffer;
+ RaiiWrapper();
+ ~RaiiWrapper();
+ };
+ RaiiWrapper m_resources;
+
+ // static methods
+private:
+ // checks if the buffer is large enough to accommodate the requested size
+ static void CheckBufferSize(char*& buffer, unsigned int& bufferLength,
+ const unsigned int& requestedBytes);
+ // checks if the buffer is large enough to accommodate the requested size (overload for unsigned char buffers)
+ static void CheckBufferSize(unsigned char*& buffer, unsigned int& bufferLength,
+ const unsigned int& requestedBytes);
+ // static constants
+private:
+ static const int MAX_BIN;
+ static const int BAM_LIDX_SHIFT;  // right-shift applied to alignment positions when building linear offsets
+ static const std::string BAI_EXTENSION;  // value returned by Extension()
+ static const char* const BAI_MAGIC;  // first 4 bytes written by WriteHeader()
+ static const int SIZEOF_ALIGNMENTCHUNK;  // per-chunk byte size used when reading bins into the buffer
+ static const int SIZEOF_BINCORE;
+ static const int SIZEOF_LINEAROFFSET;  // per-entry byte size used when locating/skipping linear offsets
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAM_STANDARD_INDEX_FORMAT_H
diff --git a/src/api/internal/index/BamToolsIndex_p.cpp b/src/api/internal/index/BamToolsIndex_p.cpp
new file mode 100644
index 0000000..01a2a82
--- /dev/null
+++ b/src/api/internal/index/BamToolsIndex_p.cpp
@@ -0,0 +1,677 @@
+// ***************************************************************************
+// BamToolsIndex.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides index operations for the BamTools index format (".bti")
+// ***************************************************************************
+
+#include "api/internal/index/BamToolsIndex_p.h"
+#include "api/BamAlignment.h"
+#include "api/internal/bam/BamReader_p.h"
+#include "api/internal/io/BamDeviceFactory_p.h"
+#include "api/internal/io/BgzfStream_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <iterator>
+#include <map>
+
+// --------------------------------
+// static BamToolsIndex constants
+// --------------------------------
+
+// block length that a newly constructed BamToolsIndex starts with (see ctor)
+const uint32_t BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000;
+// file extension returned by Extension()
+const std::string BamToolsIndex::BTI_EXTENSION = ".bti";
+// 4-byte magic number written by WriteHeader() and verified by CheckMagicNumber()
+const char* const BamToolsIndex::BTI_MAGIC = "BTI\1";
+// serialized size of one block: two int32_t fields plus one int64_t field
+// (used by SkipBlocks() to step over a reference's blocks)
+const int BamToolsIndex::SIZEOF_BLOCK = sizeof(int32_t) * 2 + sizeof(int64_t);
+
+// ----------------------------
+// RaiiWrapper implementation
+// ----------------------------
+
+// starts out holding no device
+BamToolsIndex::RaiiWrapper::RaiiWrapper()
+    : Device(0)
+{}
+
+// closes and frees the held device, if there is one
+BamToolsIndex::RaiiWrapper::~RaiiWrapper()
+{
+    // nothing to release
+    if (Device == 0) return;
+
+    Device->Close();
+    delete Device;
+    Device = 0;
+}
+
+// ------------------------------
+// BamToolsIndex implementation
+// ------------------------------
+
+// ctor: records host endianness, starts with the default block length and
+// no input version read yet
+BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader)
+    : BamIndex(reader)
+    , m_isBigEndian(BamTools::SystemIsBigEndian())
+    , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH)
+    , m_inputVersion(0)
+    , m_outputVersion(BTI_2_0)  // latest version - used for writing new index files
+{}
+
+// dtor: makes sure the index file is closed
+BamToolsIndex::~BamToolsIndex()
+{
+    CloseFile();
+}
+
+// reads the next 4 bytes from the index device and verifies that they match
+// BTI_MAGIC; throws BamException on a short read or on a mismatch
+void BamToolsIndex::CheckMagicNumber()
+{
+    const int64_t magicLength = 4;
+    char fileMagic[4];
+
+    // pull the magic number off the device
+    if (m_resources.Device->Read(fileMagic, 4) != magicLength)
+        throw BamException("BamToolsIndex::CheckMagicNumber", "could not read BTI magic number");
+
+    // compare against the expected value
+    const bool magicMatches = (strncmp(fileMagic, BamToolsIndex::BTI_MAGIC, 4) == 0);
+    if (!magicMatches)
+        throw BamException("BamToolsIndex::CheckMagicNumber", "invalid BTI magic number");
+}
+
+// reads the index format version from the device into m_inputVersion and
+// validates it; throws BamException if the version cannot be read, is not
+// positive, is newer than this build writes, or predates BTI_2_0
+void BamToolsIndex::CheckVersion()
+{
+    // read version from file
+    char* const versionBytes = (char*)&m_inputVersion;
+    const int64_t numBytesRead = m_resources.Device->Read(versionBytes, sizeof(m_inputVersion));
+    if (numBytesRead != sizeof(m_inputVersion))
+        throw BamException("BamToolsIndex::CheckVersion", "could not read format version");
+    if (m_isBigEndian) SwapEndian_32(m_inputVersion);
+
+    // zero and negative versions are never valid
+    if (m_inputVersion <= 0)
+        throw BamException("BamToolsIndex::CheckVersion", "invalid format version");
+
+    // version is newer than can be supported by this version of bamtools
+    if (m_inputVersion > m_outputVersion) {
+        const std::string tooNew =
+            "unsupported format: this index was created by a newer version of BamTools. "
+            "Update your local version of BamTools to use the index file.";
+        throw BamException("BamToolsIndex::CheckVersion", tooNew);
+    }
+
+    // ------------------------------------------------------------------
+    // deprecated, unsupported versions
+    // (the format had to be modified to accommodate a particular bug fix)
+    //
+    // Version 2.0 introduced half-open intervals instead of the old closed
+    // intervals; older BTI files are rejected rather than handled.
+    if (static_cast<Version>(m_inputVersion) < BamToolsIndex::BTI_2_0) {
+        const std::string tooOld =
+            "unsupported format: this version of the index may not properly handle "
+            "coordinate intervals. Please run 'bamtools index -bti -in yourData.bam' "
+            "to generate an up-to-date, fixed BTI file.";
+        throw BamException("BamToolsIndex::CheckVersion", tooOld);
+    }
+}
+
+// resets a reference entry to its "no reference" state: no blocks, ID of -1
+void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry)
+{
+    refEntry.Blocks.clear();
+    refEntry.ID = -1;
+}
+
+// closes the index file and discards the in-memory file summary
+//
+// The device is released whenever one exists, not only when it is open:
+// OpenFile() can leave behind a device whose Open() failed, and skipping it
+// here would leak it as soon as OpenFile() overwrites the pointer.
+void BamToolsIndex::CloseFile()
+{
+    if (m_resources.Device != 0) {
+        if (m_resources.Device->IsOpen()) m_resources.Device->Close();
+        delete m_resources.Device;
+        m_resources.Device = 0;
+    }
+    m_indexFileSummary.clear();
+}
+
+// builds index from associated BAM file & writes out to index file
+//
+// Rewinds the reader, opens "<BAM filename><Extension()>" for read/write, writes
+// the header, then walks every alignment, grouping them into blocks of
+// m_blockSize alignments. Reference entries are written in ID order, with empty
+// entries for references that have no alignments. The reader is rewound again
+// before returning.
+//
+// Returns true on success; on failure returns false with the error string set.
+//
+// A dedicated 'firstAlignment' flag marks the first pass. The block counter
+// cannot be used for that: it is also zero right after a full block has been
+// stored, so a reference whose alignment count is an exact multiple of
+// m_blockSize would be mistaken for the start of the file and its entry would
+// never be written. For the same reason a pending block is only stored when it
+// actually holds alignments.
+bool BamToolsIndex::Create()
+{
+    // skip if BamReader is invalid or not open
+    if (m_reader == 0 || !m_reader->IsOpen()) {
+        SetErrorString("BamToolsIndex::Create", "could not create index: reader is not open");
+        return false;
+    }
+
+    // rewind BamReader
+    if (!m_reader->Rewind()) {
+        const std::string readerError = m_reader->GetErrorString();
+        const std::string message = "could not create index: \n\t" + readerError;
+        SetErrorString("BamToolsIndex::Create", message);
+        return false;
+    }
+
+    try {
+        // open new index file (read & write)
+        const std::string indexFilename = m_reader->Filename() + Extension();
+        OpenFile(indexFilename, IBamIODevice::ReadWrite);
+
+        // initialize BtiFileSummary with number of references
+        const int& numReferences = m_reader->GetReferenceCount();
+        InitializeFileSummary(numReferences);
+
+        // initialize output file header
+        WriteHeader();
+
+        // index building markers
+        uint32_t currentBlockCount = 0;
+        int64_t currentAlignmentOffset = m_reader->Tell();
+        int32_t blockRefId = -1;
+        int32_t blockMaxEndPosition = -1;
+        int64_t blockStartOffset = currentAlignmentOffset;
+        int32_t blockStartPosition = -1;
+
+        // true until the first alignment has been processed
+        bool firstAlignment = true;
+
+        // plow through alignments, storing index entries
+        BamAlignment al;
+        BtiReferenceEntry refEntry;
+        while (m_reader->LoadNextAlignment(al)) {
+
+            // if moved to new reference
+            if (al.RefID != blockRefId) {
+
+                // if first pass, check:
+                if (firstAlignment) {
+
+                    // write any empty references up to (but not including) al.RefID
+                    for (int i = 0; i < al.RefID; ++i)
+                        WriteReferenceEntry(BtiReferenceEntry(i));
+                }
+
+                // not first pass:
+                else {
+
+                    // store the pending BTI block in the reference entry; when the
+                    // previous block was just completed there is nothing pending
+                    if (currentBlockCount > 0) {
+                        const BtiBlock block(blockMaxEndPosition, blockStartOffset,
+                                             blockStartPosition);
+                        refEntry.Blocks.push_back(block);
+                    }
+
+                    // write reference entry, then clear
+                    WriteReferenceEntry(refEntry);
+                    ClearReferenceEntry(refEntry);
+
+                    // write any empty references between (but not including)
+                    // the last blockRefID and current al.RefID
+                    for (int i = blockRefId + 1; i < al.RefID; ++i)
+                        WriteReferenceEntry(BtiReferenceEntry(i));
+
+                    // reset block count
+                    currentBlockCount = 0;
+                }
+
+                // set ID for new reference entry
+                refEntry.ID = al.RefID;
+            }
+            firstAlignment = false;
+
+            // if beginning of block, update counters
+            if (currentBlockCount == 0) {
+                blockRefId = al.RefID;
+                blockStartOffset = currentAlignmentOffset;
+                blockStartPosition = al.Position;
+                blockMaxEndPosition = al.GetEndPosition();
+            }
+
+            // increment block counter
+            ++currentBlockCount;
+
+            // check end position
+            const int32_t alignmentEndPosition = al.GetEndPosition();
+            if (alignmentEndPosition > blockMaxEndPosition)
+                blockMaxEndPosition = alignmentEndPosition;
+
+            // if block is full, get offset for next block, reset currentBlockCount
+            if (currentBlockCount == m_blockSize) {
+
+                // store previous block data in reference entry
+                const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+                refEntry.Blocks.push_back(block);
+
+                // update markers
+                blockStartOffset = m_reader->Tell();
+                currentBlockCount = 0;
+            }
+
+            // not the best name, but for the next iteration, this value will be the offset of the
+            // *current* alignment. this is necessary because we won't know if this next alignment
+            // is on a new reference until we actually read it
+            currentAlignmentOffset = m_reader->Tell();
+        }
+
+        // after finishing alignments, if any data was read, check:
+        // NOTE(review): if the last alignments read have RefID -1, blockRefId is -1
+        // here and this step is skipped, so entries for any remaining references
+        // are not written - confirm how such alignments are meant to be handled
+        if (blockRefId >= 0) {
+
+            // store last BTI block data in reference entry (only if one is pending)
+            if (currentBlockCount > 0) {
+                const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+                refEntry.Blocks.push_back(block);
+            }
+
+            // write last reference entry, then clear
+            WriteReferenceEntry(refEntry);
+            ClearReferenceEntry(refEntry);
+
+            // then write any empty references remaining at end of file
+            for (int i = blockRefId + 1; i < numReferences; ++i)
+                WriteReferenceEntry(BtiReferenceEntry(i));
+        }
+
+    } catch (BamException& e) {
+        m_errorString = e.what();
+        return false;
+    }
+
+    // rewind BamReader
+    if (!m_reader->Rewind()) {
+        const std::string readerError = m_reader->GetErrorString();
+        const std::string message = "could not create index: \n\t" + readerError;
+        SetErrorString("BamToolsIndex::Create", message);
+        return false;
+    }
+
+    // return success
+    return true;
+}
+
+// file extension used for BamTools index files (a copy of BTI_EXTENSION)
+const std::string BamToolsIndex::Extension()
+{
+    return BTI_EXTENSION;
+}
+
+// looks up the file offset of a block to start reading from for @region
+//
+// region                - requested region; only its left reference ID and the
+//                         left/right positions are consulted here
+// offset                - receives the StartOffset of the chosen block
+// hasAlignmentsInRegion - set to true if a starting block was chosen, else false
+//
+// throws BamException if the left reference ID is outside the file summary, or
+// if reading the reference's blocks fails
+//
+// NOTE(review): 'offset' is only assigned on the paths that pick a block; when
+// the flag comes back false it is left untouched - callers should check the flag
+void BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion)
+{
+
+ // throw if ref ID is not a valid index in file summary data
+ if (region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size())
+ throw BamException("BamToolsIndex::GetOffset", "invalid region requested");
+
+ // retrieve reference index data for left bound reference
+ BtiReferenceEntry refEntry(region.LeftRefID);
+ ReadReferenceEntry(refEntry);
+
+ // binary search for an overlapping block (may not be first one though)
+ bool found = false;
+ typedef BtiBlockVector::const_iterator BtiBlockConstIterator;
+ BtiBlockConstIterator blockFirst = refEntry.Blocks.begin();
+ BtiBlockConstIterator blockIter = blockFirst;
+ BtiBlockConstIterator blockLast = refEntry.Blocks.end();
+ std::iterator_traits<BtiBlockConstIterator>::difference_type count =
+ std::distance(blockFirst, blockLast);
+ std::iterator_traits<BtiBlockConstIterator>::difference_type step;
+ while (count > 0) {
+ // probe the middle of the remaining [blockFirst, blockFirst + count) range
+ blockIter = blockFirst;
+ step = count / 2;
+ advance(blockIter, step);
+
+ const BtiBlock& block = (*blockIter);
+ if (block.StartPosition <= region.RightPosition) {
+ // block starts at or before the region's right edge and ends after
+ // its left edge: overlap found
+ if (block.MaxEndPosition > region.LeftPosition) {
+ offset = block.StartOffset;
+ break;
+ }
+ // block ends at or before the region's left edge: continue in the upper half
+ blockFirst = ++blockIter;
+ count -= step + 1;
+ } else
+ // block starts past the region's right edge: continue in the lower half
+ count = step;
+ }
+
+ // if we didn't search "off the end" of the blocks
+ if (blockIter != blockLast) {
+
+ // "walk back" until we've gone too far
+ // (blockFirst here is the lower bound left behind by the search above,
+ // which may have been advanced past the container's first block)
+ while (blockIter != blockFirst) {
+ const BtiBlock& currentBlock = (*blockIter);
+
+ --blockIter;
+ const BtiBlock& previousBlock = (*blockIter);
+ if (previousBlock.MaxEndPosition <= region.LeftPosition) {
+ offset = currentBlock.StartOffset;
+ found = true;
+ break;
+ }
+ }
+
+ // if we walked all the way to first block, just return that and let the reader's
+ // region overlap parsing do the rest
+ if (blockIter == blockFirst) {
+ const BtiBlock& block = (*blockIter);
+ offset = block.StartOffset;
+ found = true;
+ }
+ }
+
+ // sets to false if blocks container is empty, or if no matching block could be found
+ *hasAlignmentsInRegion = found;
+}
+
+// true when the file summary records at least one block for @referenceID;
+// false for IDs outside the summary
+bool BamToolsIndex::HasAlignments(const int& referenceID) const
+{
+    const int numSummaries = (int)m_indexFileSummary.size();
+    const bool idInRange = (referenceID >= 0 && referenceID < numSummaries);
+    if (!idInRange) return false;
+
+    return m_indexFileSummary.at(referenceID).NumBlocks > 0;
+}
+
+// replaces the file summary with @numReferences default-constructed entries
+// (a count of zero or less leaves the summary empty)
+void BamToolsIndex::InitializeFileSummary(const int& numReferences)
+{
+    m_indexFileSummary.clear();
+    if (numReferences > 0) m_indexFileSummary.resize(numReferences);
+}
+
+// true only when an index device exists and reports itself open
+bool BamToolsIndex::IsDeviceOpen() const
+{
+    return (m_resources.Device != 0) && m_resources.Device->IsOpen();
+}
+
+// attempts to use index data to jump to @region, returns success/fail
+// a "successful" jump indicates no error, but not whether this region has data
+// * thus, the method sets a flag to indicate whether there are alignments
+// available after the jump position
+//
+// region                - requested region; its left bound selects the target
+// hasAlignmentsInRegion - cleared on entry, set to true if the index yields a
+//                         block to start reading from
+//
+// Returns false (with the error string set) if the reader is not open, the
+// region's left bound is not valid for the reader's references, the index
+// lookup fails, or the reader cannot seek. When the index has no block for the
+// region, no seek is attempted and true is returned with the flag left false.
+bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion)
+{
+    // clear flag
+    *hasAlignmentsInRegion = false;
+
+    // skip if invalid reader or not open
+    if (m_reader == 0 || !m_reader->IsOpen()) {
+        SetErrorString("BamToolsIndex::Jump", "could not jump: reader is not open");
+        return false;
+    }
+
+    // make sure left-bound reference is valid before indexing into the reference
+    // data - at() would otherwise throw std::out_of_range, which nothing here catches
+    const RefVector& references = m_reader->GetReferenceData();
+    if (region.LeftRefID < 0 || region.LeftRefID >= (int)references.size()) {
+        SetErrorString("BamToolsIndex::Jump", "could not jump: invalid region requested");
+        return false;
+    }
+
+    // make sure left-bound position is valid
+    if (region.LeftPosition > references.at(region.LeftRefID).RefLength) {
+        SetErrorString("BamToolsIndex::Jump", "could not jump: invalid region requested");
+        return false;
+    }
+
+    // calculate nearest offset to jump to
+    int64_t offset = 0;
+    try {
+        GetOffset(region, offset, hasAlignmentsInRegion);
+    } catch (BamException& e) {
+        m_errorString = e.what();
+        return false;
+    }
+
+    // GetOffset() only assigns 'offset' when it found a block; with nothing to
+    // jump to there is no error, but also no meaningful position to seek to
+    if (!*hasAlignmentsInRegion) return true;
+
+    // return success/failure of seek
+    return m_reader->Seek(offset);
+}
+
+// opens @filename read-only, validates its header and builds the in-memory
+// file summary; returns false (storing the exception text as the error string)
+// if any step throws a BamException
+bool BamToolsIndex::Load(const std::string& filename)
+{
+    bool loaded = false;
+    try {
+        OpenFile(filename, IBamIODevice::ReadOnly);
+
+        // metadata first, then the per-reference summary
+        LoadHeader();
+        LoadFileSummary();
+        loaded = true;
+    } catch (BamException& e) {
+        m_errorString = e.what();
+    }
+    return loaded;
+}
+
+// reads the reference count from the index file, sizes the file summary to
+// match, then fills in each reference's summary in file order
+void BamToolsIndex::LoadFileSummary()
+{
+    int numReferences;
+    LoadNumReferences(numReferences);
+    InitializeFileSummary(numReferences);
+
+    const std::size_t summaryCount = m_indexFileSummary.size();
+    for (std::size_t i = 0; i < summaryCount; ++i)
+        LoadReferenceSummary(m_indexFileSummary[i]);
+}
+
+// validates the magic number and format version, then reads the file's block
+// size into m_blockSize; throws BamException if any of these fail
+void BamToolsIndex::LoadHeader()
+{
+    // check BTI file metadata
+    CheckMagicNumber();
+    CheckVersion();
+
+    // the block size stored in the file replaces the member's current value
+    char* const destination = (char*)&m_blockSize;
+    const int64_t numBytesRead = m_resources.Device->Read(destination, sizeof(m_blockSize));
+    if (m_isBigEndian) SwapEndian_32(m_blockSize);
+
+    const bool readOk = (numBytesRead == sizeof(m_blockSize));
+    if (!readOk) throw BamException("BamToolsIndex::LoadHeader", "could not read BTI block size");
+}
+
+// reads a reference's block count from the index device into @numBlocks
+//
+// throws BamException if the count cannot be read, or if it is negative: the
+// value comes straight from the file and is later used both as a container
+// size and as a seek distance, so a negative count from a damaged file must
+// not be passed on
+void BamToolsIndex::LoadNumBlocks(int& numBlocks)
+{
+    const int64_t numBytesRead = m_resources.Device->Read((char*)&numBlocks, sizeof(numBlocks));
+    if (m_isBigEndian) SwapEndian_32(numBlocks);
+    if (numBytesRead != sizeof(numBlocks))
+        throw BamException("BamToolsIndex::LoadNumBlocks", "could not read number of BTI blocks");
+    if (numBlocks < 0)
+        throw BamException("BamToolsIndex::LoadNumBlocks", "invalid number of BTI blocks");
+}
+
+// reads the number of references from the index device into @numReferences;
+// throws BamException on a short read
+void BamToolsIndex::LoadNumReferences(int& numReferences)
+{
+    char* const destination = (char*)&numReferences;
+    const int64_t numBytesRead = m_resources.Device->Read(destination, sizeof(numReferences));
+    if (m_isBigEndian) SwapEndian_32(numReferences);
+
+    const bool readOk = (numBytesRead == sizeof(numReferences));
+    if (!readOk)
+        throw BamException("BamToolsIndex::LoadNumReferences",
+                           "could not read number of references");
+}
+
+// fills @refSummary for the reference at the current file position: reads the
+// block count, records where the first block starts, then steps over the blocks
+void BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary)
+{
+    int blockCount;
+    LoadNumBlocks(blockCount);
+
+    // the device now sits on this reference's first block
+    refSummary.FirstBlockFilePosition = Tell();
+    refSummary.NumBlocks = blockCount;
+
+    // leave the device positioned at the next reference
+    SkipBlocks(blockCount);
+}
+
+// opens @filename as the index device in the requested @mode, closing any
+// previously opened index file first
+//
+// throws BamException if no device can be created for @filename or if the
+// device fails to open; in the latter case the device is released again so a
+// later OpenFile() call does not overwrite (and leak) it
+void BamToolsIndex::OpenFile(const std::string& filename, IBamIODevice::OpenMode mode)
+{
+    // make sure any previous index file is closed
+    CloseFile();
+
+    m_resources.Device = BamDeviceFactory::CreateDevice(filename);
+    if (m_resources.Device == 0) {
+        const std::string message = std::string("could not open file: ") + filename;
+        throw BamException("BamToolsIndex::OpenFile", message);
+    }
+
+    // attempt to open file
+    m_resources.Device->Open(mode);
+    if (!IsDeviceOpen()) {
+        delete m_resources.Device;
+        m_resources.Device = 0;
+        const std::string message = std::string("could not open file: ") + filename;
+        throw BamException("BamToolsIndex::OpenFile", message);
+    }
+}
+
+// reads one serialized block from the index device into @block, in the order
+// MaxEndPosition, StartOffset, StartPosition; throws BamException unless every
+// byte of all three fields was read
+void BamToolsIndex::ReadBlock(BtiBlock& block)
+{
+    // read the fields in their on-disk order
+    const int64_t endBytes =
+        m_resources.Device->Read((char*)&block.MaxEndPosition, sizeof(block.MaxEndPosition));
+    const int64_t offsetBytes =
+        m_resources.Device->Read((char*)&block.StartOffset, sizeof(block.StartOffset));
+    const int64_t positionBytes =
+        m_resources.Device->Read((char*)&block.StartPosition, sizeof(block.StartPosition));
+
+    // swap endian-ness if necessary
+    if (m_isBigEndian) {
+        SwapEndian_32(block.MaxEndPosition);
+        SwapEndian_64(block.StartOffset);
+        SwapEndian_32(block.StartPosition);
+    }
+
+    // check block read ok
+    const int expectedBytes =
+        sizeof(block.MaxEndPosition) + sizeof(block.StartOffset) + sizeof(block.StartPosition);
+    const int64_t totalBytes = endBytes + offsetBytes + positionBytes;
+    if (totalBytes != expectedBytes)
+        throw BamException("BamToolsIndex::ReadBlock", "could not read block");
+}
+
+// replaces the contents of @blocks with the blocks of the reference described
+// by @refSummary, seeking to its first block and reading NumBlocks entries
+void BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks)
+{
+    // start from an empty container with room for every block
+    blocks.clear();
+    blocks.reserve(refSummary.NumBlocks);
+
+    // position the device on the first block entry
+    Seek(refSummary.FirstBlockFilePosition, SEEK_SET);
+
+    // read & store block entries
+    BtiBlock current;
+    int remaining = refSummary.NumBlocks;
+    while (remaining > 0) {
+        ReadBlock(current);
+        blocks.push_back(current);
+        --remaining;
+    }
+}
+
+// loads the blocks for the reference named by refEntry.ID into refEntry.Blocks;
+// throws BamException if that ID is outside the file summary
+void BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry)
+{
+    const int numSummaries = (int)m_indexFileSummary.size();
+    const bool idInRange = (refEntry.ID >= 0 && refEntry.ID < numSummaries);
+    if (!idInRange)
+        throw BamException("BamToolsIndex::ReadReferenceEntry", "invalid reference requested");
+
+    // the summary says where this reference's blocks live and how many there are
+    ReadBlocks(m_indexFileSummary.at(refEntry.ID), refEntry.Blocks);
+}
+
+// moves the index device to @position, interpreted relative to @origin (the
+// callers in this file pass SEEK_SET / SEEK_CUR); throws BamException if the
+// device reports failure
+void BamToolsIndex::Seek(const int64_t& position, const int origin)
+{
+    if (!m_resources.Device->Seek(position, origin))
+        throw BamException("BamToolsIndex::Seek", "could not seek in BTI file");
+}
+
+// advances the index device past @numBlocks serialized blocks
+//
+// the byte distance is computed in 64 bits: the count comes from the file, and
+// multiplying it by SIZEOF_BLOCK in plain int could overflow
+void BamToolsIndex::SkipBlocks(const int& numBlocks)
+{
+    const int64_t numBytes = static_cast<int64_t>(numBlocks) * BamToolsIndex::SIZEOF_BLOCK;
+    Seek(numBytes, SEEK_CUR);
+}
+
+// current position reported by the index device
+int64_t BamToolsIndex::Tell() const
+{
+    const int64_t position = m_resources.Device->Tell();
+    return position;
+}
+
+// writes one block to the index device in the order MaxEndPosition,
+// StartOffset, StartPosition; throws BamException unless every byte of all
+// three fields was written
+void BamToolsIndex::WriteBlock(const BtiBlock& block)
+{
+    // work on local copies so the caller's block is never byte-swapped
+    int32_t endPos = block.MaxEndPosition;
+    int64_t fileOffset = block.StartOffset;
+    int32_t startPos = block.StartPosition;
+
+    // swap endian-ness if necessary
+    if (m_isBigEndian) {
+        SwapEndian_32(endPos);
+        SwapEndian_64(fileOffset);
+        SwapEndian_32(startPos);
+    }
+
+    // write the fields in their on-disk order
+    const int64_t endBytes = m_resources.Device->Write((const char*)&endPos, sizeof(endPos));
+    const int64_t offsetBytes =
+        m_resources.Device->Write((const char*)&fileOffset, sizeof(fileOffset));
+    const int64_t positionBytes =
+        m_resources.Device->Write((const char*)&startPos, sizeof(startPos));
+
+    // check block written ok
+    const int expectedBytes = sizeof(endPos) + sizeof(fileOffset) + sizeof(startPos);
+    const int64_t totalBytes = endBytes + offsetBytes + positionBytes;
+    if (totalBytes != expectedBytes)
+        throw BamException("BamToolsIndex::WriteBlock", "could not write BTI block");
+}
+
+// writes every block in @blocks to the index device, in container order
+void BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks)
+{
+    const std::size_t blockCount = blocks.size();
+    for (std::size_t i = 0; i < blockCount; ++i)
+        WriteBlock(blocks[i]);
+}
+
+// writes the index file header: magic number, output format version, block
+// size and number of references (taken from the current file summary);
+// throws BamException unless every header byte was written
+void BamToolsIndex::WriteHeader()
+{
+    // gather the header fields
+    int32_t versionField = (int32_t)m_outputVersion;
+    uint32_t blockSizeField = m_blockSize;
+    int32_t referenceCountField = m_indexFileSummary.size();
+
+    // swap endian-ness if necessary
+    if (m_isBigEndian) {
+        SwapEndian_32(versionField);
+        SwapEndian_32(blockSizeField);
+        SwapEndian_32(referenceCountField);
+    }
+
+    // emit the fields in their on-disk order, tallying bytes as we go
+    int64_t totalBytes = m_resources.Device->Write(BamToolsIndex::BTI_MAGIC, 4);
+    totalBytes += m_resources.Device->Write((const char*)&versionField, sizeof(versionField));
+    totalBytes += m_resources.Device->Write((const char*)&blockSizeField, sizeof(blockSizeField));
+    totalBytes +=
+        m_resources.Device->Write((const char*)&referenceCountField, sizeof(referenceCountField));
+
+    // check header written ok
+    const int expectedBytes =
+        4 + sizeof(versionField) + sizeof(blockSizeField) + sizeof(referenceCountField);
+    if (totalBytes != expectedBytes)
+        throw BamException("BamToolsIndex::WriteHeader", "could not write BTI header");
+}
+
+// writes one reference's index data: its block count followed by each block;
+// throws BamException if the count (or any block) cannot be written
+void BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry)
+{
+    // block count goes first
+    uint32_t blockCountField = refEntry.Blocks.size();
+    if (m_isBigEndian) SwapEndian_32(blockCountField);
+
+    const int64_t countBytes =
+        m_resources.Device->Write((const char*)&blockCountField, sizeof(blockCountField));
+    const bool countWritten = (countBytes == sizeof(blockCountField));
+    if (!countWritten)
+        throw BamException("BamToolsIndex::WriteReferenceEntry",
+                           "could not write number of blocks");
+
+    // then the block entries themselves
+    WriteBlocks(refEntry.Blocks);
+}
diff --git a/src/api/internal/index/BamToolsIndex_p.h b/src/api/internal/index/BamToolsIndex_p.h
new file mode 100644
index 0000000..909b164
--- /dev/null
+++ b/src/api/internal/index/BamToolsIndex_p.h
@@ -0,0 +1,195 @@
+// ***************************************************************************
+// BamToolsIndex.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides index operations for the BamTools index format (".bti")
+// ***************************************************************************
+
+#ifndef BAMTOOLS_INDEX_FORMAT_H
+#define BAMTOOLS_INDEX_FORMAT_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+
+#include <map>
+#include <string>
+#include <vector>
+#include "api/BamAux.h"
+#include "api/BamIndex.h"
+#include "api/IBamIODevice.h"
+
+namespace BamTools {
+namespace Internal {
+
+// contains data for each 'block' in a BTI index
+// (BamToolsIndex::Create() records one block per run of alignments on a reference)
+struct BtiBlock
+{
+
+ // data members
+ int32_t MaxEndPosition; // largest alignment end position seen in the block
+ int64_t StartOffset; // reader offset of the block's first alignment
+ int32_t StartPosition; // position of the block's first alignment
+
+ // ctor - all fields default to 0
+ BtiBlock(const int32_t& maxEndPosition = 0, const int64_t& startOffset = 0,
+ const int32_t& startPosition = 0)
+ : MaxEndPosition(maxEndPosition)
+ , StartOffset(startOffset)
+ , StartPosition(startPosition)
+ {}
+};
+
+// convenience typedef for describing a list of BTI blocks on a reference
+typedef std::vector<BtiBlock> BtiBlockVector;
+
+// contains all fields necessary for building, loading, & writing
+// full BTI index data for a single reference
+struct BtiReferenceEntry
+{
+
+ // data members
+ int32_t ID; // reference ID; -1 means "no reference"
+ BtiBlockVector Blocks; // this reference's blocks, in the order they were added
+
+ // ctor - starts with no blocks; ID defaults to -1
+ BtiReferenceEntry(const int& id = -1)
+ : ID(id)
+ {}
+};
+
+// provides (persistent) summary of BtiReferenceEntry's index data
+struct BtiReferenceSummary
+{
+
+ // data members
+ int NumBlocks; // number of blocks stored for the reference
+ uint64_t FirstBlockFilePosition; // index-file position of the reference's first block
+
+ // ctor - an empty summary: no blocks, position 0
+ BtiReferenceSummary()
+ : NumBlocks(0)
+ , FirstBlockFilePosition(0)
+ {}
+};
+
+// convenience typedef for describing a full BTI index file summary
+// (one entry per reference, indexed by reference ID)
+typedef std::vector<BtiReferenceSummary> BtiFileSummary;
+
+// BamIndex implementation for the BamTools index format (".bti")
+class BamToolsIndex : public BamIndex
+{
+
+ // keep a list of any supported versions here
+ // (might be useful later to handle any 'legacy' versions if the format changes)
+ // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
+ //
+ // so a change introduced in BTI_1_2 may be handled from then on by:
+ //
+ // if ( indexVersion >= BTI_1_2 )
+ // do something new
+ // else
+ // do the old thing
+ enum Version
+ {
+ BTI_1_0 = 1,
+ BTI_1_1,
+ BTI_1_2,
+ BTI_2_0
+ };
+
+ // ctor & dtor
+public:
+ BamToolsIndex(Internal::BamReaderPrivate* reader);
+ ~BamToolsIndex();
+
+ // BamIndex implementation
+public:
+ // builds index from associated BAM file & writes out to index file
+ bool Create();
+ // returns whether reference has alignments or no
+ bool HasAlignments(const int& referenceID) const;
+ // attempts to use index data to jump to @region, returns success/fail
+ // a "successful" jump indicates no error, but not whether this region has data
+ // * thus, the method sets a flag to indicate whether there are alignments
+ // available after the jump position
+ bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
+ // loads existing data from file into memory
+ bool Load(const std::string& filename);
+ // identifies this index as the BAMTOOLS type
+ BamIndex::IndexType Type() const
+ {
+ return BamIndex::BAMTOOLS;
+ }
+
+public:
+ // returns format's file extension
+ static const std::string Extension();
+
+ // internal methods
+ // (these report failure by throwing BamException)
+private:
+ // index file ops
+ void CheckMagicNumber();
+ void CheckVersion();
+ void CloseFile();
+ bool IsDeviceOpen() const;
+ void OpenFile(const std::string& filename, IBamIODevice::OpenMode mode);
+ void Seek(const int64_t& position, const int origin);
+ int64_t Tell() const;
+
+ // index-creation methods
+ void ClearReferenceEntry(BtiReferenceEntry& refEntry);
+ void WriteBlock(const BtiBlock& block);
+ void WriteBlocks(const BtiBlockVector& blocks);
+ void WriteHeader();
+ void WriteReferenceEntry(const BtiReferenceEntry& refEntry);
+
+ // random-access methods
+ void GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion);
+ void ReadBlock(BtiBlock& block);
+ void ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks);
+ void ReadReferenceEntry(BtiReferenceEntry& refEntry);
+
+ // BTI summary data methods
+ void InitializeFileSummary(const int& numReferences);
+ void LoadFileSummary();
+ void LoadHeader();
+ void LoadNumBlocks(int& numBlocks);
+ void LoadNumReferences(int& numReferences);
+ void LoadReferenceSummary(BtiReferenceSummary& refSummary);
+ void SkipBlocks(const int& numBlocks);
+
+ // data members
+private:
+ bool m_isBigEndian; // host byte order, determined in the ctor
+ BtiFileSummary m_indexFileSummary; // per-reference block counts & file positions
+ uint32_t m_blockSize; // alignments per block (default, or read from a loaded file)
+ int32_t m_inputVersion; // Version is serialized as int
+ Version m_outputVersion; // version written into new index files
+
+ // holds the index file's IO device; the destructor closes and deletes it
+ struct RaiiWrapper
+ {
+ IBamIODevice* Device;
+ RaiiWrapper();
+ ~RaiiWrapper();
+ };
+ RaiiWrapper m_resources;
+
+ // static constants
+private:
+ static const uint32_t DEFAULT_BLOCK_LENGTH;
+ static const std::string BTI_EXTENSION;
+ static const char* const BTI_MAGIC;
+ static const int SIZEOF_BLOCK;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMTOOLS_INDEX_FORMAT_H
diff --git a/src/api/internal/index/CMakeLists.txt b/src/api/internal/index/CMakeLists.txt
new file mode 100644
index 0000000..d6a7df6
--- /dev/null
+++ b/src/api/internal/index/CMakeLists.txt
@@ -0,0 +1,17 @@
+# ==========================
+# BamTools CMakeLists.txt
+# (c) 2011 Derek Barnett
+#
+# src/api/internal/index
+# ==========================
+
+# directory holding the internal index sources, relative to ${InternalDir}
+set( InternalIndexDir "${InternalDir}/index" )
+
+# list of index implementation sources; set in the parent scope so the
+# including CMakeLists.txt can see it
+set( InternalIndexSources
+ ${InternalIndexDir}/BamIndexFactory_p.cpp
+ ${InternalIndexDir}/BamStandardIndex_p.cpp
+ ${InternalIndexDir}/BamToolsIndex_p.cpp
+
+ PARENT_SCOPE # <-- leave this last
+)
+
diff --git a/src/api/internal/io/BamDeviceFactory_p.cpp b/src/api/internal/io/BamDeviceFactory_p.cpp
new file mode 100644
index 0000000..2844ab1
--- /dev/null
+++ b/src/api/internal/io/BamDeviceFactory_p.cpp
@@ -0,0 +1,34 @@
+// ***************************************************************************
+// BamDeviceFactory_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 September 2011 (DB)
+// ---------------------------------------------------------------------------
+// Creates built-in concrete implementations of IBamIODevices
+// ***************************************************************************
+
+#include "api/internal/io/BamDeviceFactory_p.h"
+#include "api/internal/io/BamFile_p.h"
+#include "api/internal/io/BamFtp_p.h"
+#include "api/internal/io/BamHttp_p.h"
+#include "api/internal/io/BamPipe_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <iostream>
+
+// picks the concrete IO device for @source and returns a newly allocated
+// instance: a pipe for "-", "stdin" or "stdout", an HTTP or FTP device for the
+// matching URL prefix, and a plain file device for anything else
+IBamIODevice* BamDeviceFactory::CreateDevice(const std::string& source)
+{
+    // the pipe names are matched exactly
+    const bool isPipe = (source == "-" || source == "stdin" || source == "stdout");
+    if (isPipe) return new BamPipe;
+
+    // remote sources are recognized by their scheme prefix
+    if (source.compare(0, 7, "http://") == 0) return new BamHttp(source);
+    if (source.compare(0, 6, "ftp://") == 0) return new BamFtp(source);
+
+    // otherwise assume a "normal" file
+    return new BamFile(source);
+}
diff --git a/src/api/internal/io/BamDeviceFactory_p.h b/src/api/internal/io/BamDeviceFactory_p.h
new file mode 100644
index 0000000..ddd93b8
--- /dev/null
+++ b/src/api/internal/io/BamDeviceFactory_p.h
@@ -0,0 +1,38 @@
+// ***************************************************************************
+// BamDeviceFactory_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Creates built-in concrete implementations of IBamIODevices
+// ***************************************************************************
+
+#ifndef BAMDEVICEFACTORY_P_H
+#define BAMDEVICEFACTORY_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <string>
+#include "api/IBamIODevice.h"
+
+namespace BamTools {
+namespace Internal {
+
+// factory for the built-in IBamIODevice implementations
+class BamDeviceFactory
+{
+public:
+ // returns a newly allocated device chosen from @source (pipe name, URL prefix
+ // or plain filename); the device is not opened here
+ static IBamIODevice* CreateDevice(const std::string& source);
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMDEVICEFACTORY_P_H
diff --git a/src/api/internal/io/BamFile_p.cpp b/src/api/internal/io/BamFile_p.cpp
new file mode 100644
index 0000000..4130bab
--- /dev/null
+++ b/src/api/internal/io/BamFile_p.cpp
@@ -0,0 +1,73 @@
+// ***************************************************************************
+// BamFile_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides BAM file-specific IO behavior
+// ***************************************************************************
+
+#include "api/internal/io/BamFile_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+#include <iostream>
+
+// ctor: remembers the filename; the file itself is not opened until Open()
+BamFile::BamFile(const std::string& filename)
+ : ILocalIODevice()
+ , m_filename(filename)
+{}
+
+// dtor: nothing to release at this level
+BamFile::~BamFile() {}
+
+// closes the file if it is open; the stored filename is cleared as part of
+// closing, and nothing happens when the file is not open
+void BamFile::Close()
+{
+    if (!IsOpen()) return;
+
+    m_filename.clear();
+    ILocalIODevice::Close();
+}
+
+// this device always reports itself as random-access
+bool BamFile::IsRandomAccess() const
+{
+ return true;
+}
+
+// opens the file named at construction in the requested @mode
+//
+// ReadOnly opens with "rb", WriteOnly with "wb" and ReadWrite with "w+b".
+// Returns true on success; on an unknown mode or a failed fopen() it sets the
+// error string and returns false.
+//
+// Any currently open stream is closed first. Close() clears m_filename when a
+// stream is open, so the name is saved and restored around that call -
+// otherwise re-opening an already open file would always fail with an empty
+// filename.
+bool BamFile::Open(const IBamIODevice::OpenMode mode)
+{
+    // make sure we're starting with a fresh file stream, keeping the filename
+    const std::string filename = m_filename;
+    Close();
+    m_filename = filename;
+
+    // attempt to open FILE* depending on requested openmode
+    if (mode == IBamIODevice::ReadOnly)
+        m_stream = fopen(m_filename.c_str(), "rb");
+    else if (mode == IBamIODevice::WriteOnly)
+        m_stream = fopen(m_filename.c_str(), "wb");
+    else if (mode == IBamIODevice::ReadWrite)
+        m_stream = fopen(m_filename.c_str(), "w+b");
+    else {
+        SetErrorString("BamFile::Open", "unknown open mode requested");
+        return false;
+    }
+
+    // check that we obtained a valid FILE*
+    if (m_stream == 0) {
+        const std::string message_base = std::string("could not open file handle for ");
+        const std::string message =
+            message_base + ((m_filename.empty()) ? "empty filename" : m_filename);
+        SetErrorString("BamFile::Open", message);
+        return false;
+    }
+
+    // store current IO mode & return success
+    m_mode = mode;
+    return true;
+}
+
+// repositions the file stream to @position relative to @origin; returns true
+// when the underlying seek reports success (a null stream trips the assertion)
+bool BamFile::Seek(const int64_t& position, const int origin)
+{
+    BT_ASSERT_X(m_stream, "BamFile::Seek() - null stream");
+
+    const bool seekSucceeded = (fseek64(m_stream, position, origin) == 0);
+    return seekSucceeded;
+}
diff --git a/src/api/internal/io/BamFile_p.h b/src/api/internal/io/BamFile_p.h
new file mode 100644
index 0000000..47119b3
--- /dev/null
+++ b/src/api/internal/io/BamFile_p.h
@@ -0,0 +1,52 @@
+// ***************************************************************************
+// BamFile_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides BAM file-specific IO behavior
+// ***************************************************************************
+
+#ifndef BAMFILE_P_H
+#define BAMFILE_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <string>
+#include "api/internal/io/ILocalIODevice_p.h"
+
+namespace BamTools {
+namespace Internal {
+
+// ILocalIODevice specialisation that operates on a file identified by name.
+class BamFile : public ILocalIODevice
+{
+public:
+    // construction & destruction
+    BamFile(const std::string& filename);
+    ~BamFile();
+
+    // ILocalIODevice implementation
+    void Close();
+    bool IsRandomAccess() const;
+    bool Open(const IBamIODevice::OpenMode mode);
+    bool Seek(const int64_t& position, const int origin = SEEK_SET);
+
+private:
+    // name of the file handed to fopen() by Open()
+    std::string m_filename;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMFILE_P_H
diff --git a/src/api/internal/io/BamFtp_p.cpp b/src/api/internal/io/BamFtp_p.cpp
new file mode 100644
index 0000000..43dade7
--- /dev/null
+++ b/src/api/internal/io/BamFtp_p.cpp
@@ -0,0 +1,491 @@
+// ***************************************************************************
+// BamFtp_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 8 December 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides reading/writing of BAM files on FTP server
+// ***************************************************************************
+
+#include "api/internal/io/BamFtp_p.h"
+#include "api/BamAux.h"
+#include "api/internal/io/TcpSocket_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cctype>
+#include <cstddef>
+#include <cstdlib>
+#include <sstream>
+#include <vector>
+
+namespace BamTools {
+namespace Internal {
+
+// -----------
+// constants
+// -----------
+
+// connection defaults & URL layout
+static const uint16_t FTP_PORT = 21;
+static const std::string FTP_PREFIX = "ftp://";
+static const std::size_t FTP_PREFIX_LENGTH = 6;
+static const std::string FTP_NEWLINE = "\r\n";
+
+// credentials used when none are supplied
+static const std::string DEFAULT_USER = "anonymous";
+static const std::string DEFAULT_PASS = "anonymous@";
+
+// FTP command verbs
+static const std::string ABOR_CMD = "ABOR";
+static const std::string USER_CMD = "USER";
+static const std::string PASS_CMD = "PASS";
+static const std::string PASV_CMD = "PASV";
+static const std::string REIN_CMD = "REIN";
+static const std::string REST_CMD = "REST";
+static const std::string RETR_CMD = "RETR";
+static const std::string TYPE_CMD = "TYPE";
+
+static const char CMD_SEPARATOR = ' ';
+static const char HOST_SEPARATOR = '/';
+static const char IP_SEPARATOR = '.';
+
+// 4th character of a reply line when more lines follow
+static const char MULTILINE_CONTINUE = '-';
+
+// delimiters of the "(h1,h2,h3,h4,p1,p2)" part of a PASV reply
+static const char PASV_REPLY_PREFIX = '(';
+static const char PASV_REPLY_SEPARATOR = ',';
+static const char PASV_REPLY_SUFFIX = ')';
+
+// -----------------
+// utility methods
+// -----------------
+
+// Splits `source` on `delim` using std::getline semantics: an empty source
+// yields no fields, and a trailing delimiter does not add an empty field.
+static inline std::vector<std::string> split(const std::string& source, const char delim)
+{
+    std::stringstream ss(source);
+    std::string field;
+    std::vector<std::string> fields;
+
+    while (std::getline(ss, field, delim))
+        fields.push_back(field);
+    return fields;
+}
+
+// Returns true if `source` begins with `pattern` (an empty pattern matches).
+// compare() only looks at the leading characters instead of searching the
+// whole string the way find() does.
+static inline bool startsWith(const std::string& source, const std::string& pattern)
+{
+    return (source.compare(0, pattern.size(), pattern) == 0);
+}
+
+// Returns a copy of `s` with every character lower-cased.
+static inline std::string toLower(const std::string& s)
+{
+    std::string out;
+    const std::size_t sSize = s.size();
+    out.resize(sSize);
+    for (std::size_t i = 0; i < sSize; ++i) {
+        // tolower() requires a value representable as unsigned char;
+        // passing a negative plain char is undefined behaviour
+        out[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(s[i])));
+    }
+    return out;
+}
+
+} // namespace Internal
+} // namespace BamTools
+
+// -----------------------
+// BamFtp implementation
+// -----------------------
+
+// Allocates the command & data sockets, installs the default port and
+// credentials, then parses `url` into host & filename via ParseUrl().
+// No connection is made here.
+BamFtp::BamFtp(const std::string& url)
+    : IBamIODevice(),
+      m_commandSocket(new TcpSocket),
+      m_dataSocket(new TcpSocket),
+      m_port(FTP_PORT),
+      m_dataPort(0),
+      m_username(DEFAULT_USER),
+      m_password(DEFAULT_PASS),
+      m_isUrlParsed(false),
+      m_filePosition(-1)
+{
+    ParseUrl(url);
+}
+
+// Closes the connection and releases both sockets.
+BamFtp::~BamFtp()
+{
+    Close();
+
+    // deleting a null pointer is a no-op, so no explicit checks are needed
+    delete m_commandSocket;
+    delete m_dataSocket;
+}
+
+// Disconnects both sockets and returns the device to its not-open state:
+// URL-parsed flag cleared, file position -1, default credentials restored,
+// passive-mode endpoint forgotten.
+void BamFtp::Close()
+{
+    // disconnect sockets
+    m_commandSocket->DisconnectFromHost();
+    m_dataSocket->DisconnectFromHost();
+
+    // reset state
+    m_isUrlParsed = false;
+    m_filePosition = -1;
+    m_username = DEFAULT_USER;
+    m_password = DEFAULT_PASS;
+    m_dataHostname.clear();
+    m_dataPort = 0;
+
+    // also drop the IO mode, as BamHttp::Close() does, so the base-class
+    // state agrees that the device is closed
+    m_mode = IBamIODevice::NotOpen;
+}
+
+// Connects the command socket to m_hostname:m_port, reads the server's
+// initial reply, then sends USER, PASS and TYPE I in that order.
+//
+// Returns true when every step succeeded. If reading the initial reply or
+// any command fails, Close() is called and false is returned.
+bool BamFtp::ConnectCommandSocket()
+{
+    BT_ASSERT_X(m_commandSocket, "null command socket?");
+
+    // connect to FTP server; include the socket's own error text so the
+    // message does not end in a dangling "- "
+    if (!m_commandSocket->ConnectToHost(m_hostname, m_port, m_mode)) {
+        const std::string message =
+            std::string("could not connect to host - ") + m_commandSocket->GetErrorString();
+        SetErrorString("BamFtp::ConnectCommandSocket", message);
+        return false;
+    }
+
+    // receive initial reply from host
+    if (!ReceiveReply()) {
+        Close();
+        return false;
+    }
+
+    // login sequence: USER, PASS, then TYPE I
+    const std::string loginCommands[] = {
+        USER_CMD + CMD_SEPARATOR + m_username + FTP_NEWLINE,
+        PASS_CMD + CMD_SEPARATOR + m_password + FTP_NEWLINE,
+        TYPE_CMD + CMD_SEPARATOR + 'I' + FTP_NEWLINE};
+    const std::size_t numCommands = sizeof(loginCommands) / sizeof(loginCommands[0]);
+
+    for (std::size_t i = 0; i < numCommands; ++i) {
+        if (!SendCommand(loginCommands[i], true)) {
+            Close();
+            return false;
+        }
+    }
+
+    // return success
+    return true;
+}
+
+// Opens a fresh data connection and starts retrieval of m_filename.
+//
+// Steps: ensure the command socket is connected, send PASV and parse the
+// endpoint from its reply, send REST when m_filePosition >= 0, send RETR,
+// connect the data socket, then read the server's reply to RETR.
+//
+// Returns true when the server answers with a 125 or 150 reply. Returns
+// false on any failure; steps that do not report their own error get an
+// error string set here.
+bool BamFtp::ConnectDataSocket()
+{
+    // failure if can't connect to command socket first
+    if (!m_commandSocket->IsConnected()) {
+        if (!ConnectCommandSocket()) return false;
+    }
+
+    // make sure we're starting with a fresh data channel
+    if (m_dataSocket->IsConnected()) m_dataSocket->DisconnectFromHost();
+
+    // send passive connection command (SendCommand sets its own error string)
+    const std::string passiveCommand = PASV_CMD + FTP_NEWLINE;
+    if (!SendCommand(passiveCommand, true)) return false;
+
+    // retrieve passive connection host & port
+    if (!ParsePassiveResponse()) {
+        SetErrorString("BamFtp::ConnectDataSocket",
+                       std::string("could not parse PASV reply: ") + m_response);
+        return false;
+    }
+
+    // set up restart command (tell server where to start fetching bytes from)
+    if (m_filePosition >= 0) {
+        std::stringstream fpStream;
+        fpStream << m_filePosition;
+        const std::string restartCommand =
+            REST_CMD + CMD_SEPARATOR + fpStream.str() + FTP_NEWLINE;
+        if (!SendCommand(restartCommand, true)) return false;
+    }
+
+    // main file retrieval request; its reply is read only after the data
+    // channel has been connected
+    const std::string retrieveCommand = RETR_CMD + CMD_SEPARATOR + m_filename + FTP_NEWLINE;
+    if (!SendCommand(retrieveCommand, false)) return false;
+
+    // make data channel connection
+    if (!m_dataSocket->ConnectToHost(m_dataHostname, m_dataPort)) {
+        SetErrorString("BamFtp::ConnectDataSocket", m_dataSocket->GetErrorString());
+        return false;
+    }
+
+    // fetch initial reply from server (ReceiveReply sets its own error string)
+    if (!ReceiveReply()) {
+        m_dataSocket->DisconnectFromHost();
+        return false;
+    }
+
+    // RETR is acknowledged with 150 (about to open data connection) or
+    // 125 (data connection already open, transfer starting)
+    if (!startsWith(m_response, "150") && !startsWith(m_response, "125")) {
+        SetErrorString("BamFtp::ConnectDataSocket",
+                       std::string("unexpected reply to RETR: ") + m_response);
+        m_dataSocket->DisconnectFromHost();
+        return false;
+    }
+
+    // return success
+    return true;
+}
+
+// Open means: the base class reports open AND the URL was parsed successfully.
+bool BamFtp::IsOpen() const
+{
+    if (!IBamIODevice::IsOpen()) return false;
+    return m_isUrlParsed;
+}
+
+// This device always reports itself as random-access.
+bool BamFtp::IsRandomAccess() const
+{
+    const bool randomAccess = true;
+    return randomAccess;
+}
+
+// Opens the FTP device for reading.
+//
+// mode: must be IBamIODevice::ReadOnly; any other mode is rejected.
+// Returns true once both the command and data connections are established.
+// On failure both sockets are disconnected, the device is left in the
+// not-open state and false is returned.
+bool BamFtp::Open(const IBamIODevice::OpenMode mode)
+{
+    // BamFtp only supports read-only access
+    if (mode != IBamIODevice::ReadOnly) {
+        SetErrorString("BamFtp::Open", "writing on this device is not supported");
+        return false;
+    }
+
+    // without a parsed URL there is no host/file to connect to, and IsOpen()
+    // could never become true anyway
+    if (!m_isUrlParsed) {
+        SetErrorString("BamFtp::Open", "could not parse FTP url");
+        return false;
+    }
+
+    // initialize basic valid state
+    m_mode = mode;
+    m_filePosition = 0;
+
+    // attempt connection to command & data sockets
+    if (ConnectCommandSocket() && ConnectDataSocket()) return true;
+
+    // failed: don't leave the device looking open or half-connected
+    m_dataSocket->DisconnectFromHost();
+    m_commandSocket->DisconnectFromHost();
+    m_mode = IBamIODevice::NotOpen;
+    m_filePosition = -1;
+    return false;
+}
+
+// Parses `text` as a decimal integer in the range [0, 255], allowing
+// surrounding whitespace. Stores it in `value` and returns true on success.
+static bool parsePasvOctet(const std::string& text, int& value)
+{
+    if (text.empty()) return false;
+
+    char* end = 0;
+    const long parsed = std::strtol(text.c_str(), &end, 10);
+    if (end == text.c_str()) return false;  // no digits consumed
+
+    // only whitespace may follow the number
+    while (*end != '\0' && std::isspace(static_cast<unsigned char>(*end)))
+        ++end;
+    if (*end != '\0') return false;
+
+    if (parsed < 0 || parsed > 255) return false;
+    value = static_cast<int>(parsed);
+    return true;
+}
+
+// Extracts the passive-mode endpoint from m_response, which is expected to
+// contain "(h1,h2,h3,h4,p1,p2)".
+//
+// On success m_dataHostname becomes "h1.h2.h3.h4", m_dataPort becomes
+// p1 * 256 + p2, and true is returned. Returns false, leaving both members
+// untouched, when the reply is empty, lacks a well-formed parenthesised
+// group, does not hold exactly six fields, or any field is not 0..255.
+bool BamFtp::ParsePassiveResponse()
+{
+    // fail if empty
+    if (m_response.empty()) return false;
+
+    // find parentheses; the closing one is searched for after the opening
+    // one so the extracted range below can never be reversed
+    const std::size_t leftParenFound = m_response.find(PASV_REPLY_PREFIX);
+    if (leftParenFound == std::string::npos) return false;
+    const std::size_t rightParenFound = m_response.find(PASV_REPLY_SUFFIX, leftParenFound + 1);
+    if (rightParenFound == std::string::npos) return false;
+
+    // grab everything between ( should be "h1,h2,h3,h4,p1,p2" )
+    const std::string hostAndPort =
+        m_response.substr(leftParenFound + 1, rightParenFound - leftParenFound - 1);
+
+    // parse into string fields
+    const std::vector<std::string> fields = split(hostAndPort, PASV_REPLY_SEPARATOR);
+    if (fields.size() != 6) return false;
+
+    // validate all six fields before touching any member
+    int octets[6];
+    for (std::size_t i = 0; i < 6; ++i) {
+        if (!parsePasvOctet(fields[i], octets[i])) return false;
+    }
+
+    // passive connection IP
+    std::stringstream host;
+    host << octets[0] << IP_SEPARATOR << octets[1] << IP_SEPARATOR << octets[2] << IP_SEPARATOR
+         << octets[3];
+    m_dataHostname = host.str();
+
+    // passive connection port
+    m_dataPort = static_cast<uint16_t>((octets[4] << 8) | octets[5]);
+
+    // return success
+    return true;
+}
+
+// Splits `url` ("ftp://host/path") into m_hostname and m_filename.
+//
+// m_isUrlParsed is set to true only when the URL starts with "ftp://"
+// (compared case-insensitively), names a non-empty host and contains a '/'
+// after the host. On any failure the flag stays false and the host/file
+// members are left untouched.
+void BamFtp::ParseUrl(const std::string& url)
+{
+    // clear flag to start
+    m_isUrlParsed = false;
+
+    // make sure url starts with "ftp://", case-insensitive. Only the prefix
+    // is lower-cased: host and path are taken from the original text, since
+    // paths on the server may be case-sensitive.
+    if (url.size() <= FTP_PREFIX_LENGTH) return;
+    if (toLower(url.substr(0, FTP_PREFIX_LENGTH)) != FTP_PREFIX) return;
+
+    // find end of host name portion (first '/' hit after the prefix);
+    // no slash means no filename was given, and substr(npos) would throw
+    const std::size_t firstSlashFound = url.find(HOST_SEPARATOR, FTP_PREFIX_LENGTH);
+    if (firstSlashFound == std::string::npos) return;
+
+    // fetch hostname (must be non-empty)
+    const std::string hostname =
+        url.substr(FTP_PREFIX_LENGTH, (firstSlashFound - FTP_PREFIX_LENGTH));
+    if (hostname.empty()) return;
+
+    // store host, default port & remainder of URL (starting at the slash)
+    m_hostname = hostname;
+    m_port = FTP_PORT;
+    m_filename = url.substr(firstSlashFound);
+
+    // set parsed OK flag
+    m_isUrlParsed = true;
+}
+
+// Reads up to `numBytes` bytes from the data connection into `data`.
+//
+// The data socket is (re)connected on demand, e.g. after Seek() has
+// disconnected it. m_filePosition advances by the number of bytes read.
+//
+// Returns the number of bytes actually read (less than `numBytes` when the
+// socket reports end of data), or -1 if the device is not open, the data
+// connection cannot be established, or a socket read fails.
+int64_t BamFtp::Read(char* data, const unsigned int numBytes)
+{
+    // if BamFtp not in a valid state
+    if (!IsOpen()) return -1;
+
+    // read until hit desired @numBytes
+    int64_t bytesReadSoFar = 0;
+    while (bytesReadSoFar < numBytes) {
+
+        // calculate number of bytes we're going to try to read this iteration
+        const std::size_t remainingBytes = (numBytes - bytesReadSoFar);
+
+        // if either disconnected somehow, or (more likely) we have seeked since last read
+        if (!m_dataSocket->IsConnected()) {
+            if (!ConnectDataSocket()) return -1;
+        }
+
+        // read bytes from data socket
+        const int64_t socketBytesRead = ReadDataSocket(data + bytesReadSoFar, remainingBytes);
+
+        // error: report the socket's message, as BamHttp::Read does
+        if (socketBytesRead < 0) {
+            SetErrorString("BamFtp::Read", m_dataSocket->GetErrorString());
+            return -1;
+        }
+
+        // EOF
+        if (socketBytesRead == 0) return bytesReadSoFar;
+
+        bytesReadSoFar += socketBytesRead;
+        m_filePosition += socketBytesRead;
+    }
+
+    // return actual number bytes successfully read
+    return bytesReadSoFar;
+}
+
+// Forwards to the command socket's Read(); returns whatever it returns.
+int64_t BamFtp::ReadCommandSocket(char* data, const unsigned int maxNumBytes)
+{
+    const int64_t bytesRead = m_commandSocket->Read(data, maxNumBytes);
+    return bytesRead;
+}
+
+// Forwards to the data socket's Read(); returns whatever it returns.
+int64_t BamFtp::ReadDataSocket(char* data, const unsigned int maxNumBytes)
+{
+    const int64_t bytesRead = m_dataSocket->Read(data, maxNumBytes);
+    return bytesRead;
+}
+
+// Reads one complete server reply from the command socket into m_response.
+//
+// Lines are appended until one starts with three digits followed by any
+// character other than '-' (the multi-line continuation marker).
+//
+// Returns true when such a terminating line was read. Returns false, with
+// an error string set, when the command socket is not connected or when a
+// read yields no data before the reply is complete.
+bool BamFtp::ReceiveReply()
+{
+    // failure if not connected
+    if (!m_commandSocket->IsConnected()) {
+        SetErrorString("BamFtp::ReceiveReply()", "command socket not connected");
+        return false;
+    }
+
+    m_response.clear();
+
+    for (;;) {
+
+        const std::string replyLine = m_commandSocket->ReadLine();
+
+        // An empty line means nothing more could be read (presumably the
+        // connection dropped or timed out - confirm against TcpSocket).
+        // Bail out rather than spinning forever waiting for a terminator.
+        if (replyLine.empty()) {
+            SetErrorString("BamFtp::ReceiveReply", "error reading server reply");
+            return false;
+        }
+        m_response += replyLine;
+
+        // if line is of form 'xyz ', quit reading lines
+        // (isdigit needs an unsigned char value; plain char may be negative)
+        const bool hasReplyCode = (replyLine.length() >= 4) &&
+                                  std::isdigit(static_cast<unsigned char>(replyLine[0])) &&
+                                  std::isdigit(static_cast<unsigned char>(replyLine[1])) &&
+                                  std::isdigit(static_cast<unsigned char>(replyLine[2]));
+        if (hasReplyCode && (replyLine[3] != MULTILINE_CONTINUE)) break;
+    }
+
+    // at least one non-empty line was stored, so m_response is non-empty here
+    return true;
+}
+
+// Moves the logical file position; the next Read() reconnects and resumes
+// from there.
+//
+// position: offset, relative to `origin`.
+// origin:   SEEK_SET or SEEK_CUR; anything else is rejected.
+//
+// Returns true on success. Returns false, with an error string set and
+// without touching the connection or position, when the device is not
+// open, the origin is unsupported, or the target position is negative.
+bool BamFtp::Seek(const int64_t& position, const int origin)
+{
+    // if FTP device not in a valid state
+    if (!IsOpen()) {
+        SetErrorString("BamFtp::Seek", "cannot seek on unopen connection");
+        return false;
+    }
+
+    // work out the target first, so a bad request leaves everything as it was
+    int64_t newPosition = 0;
+    if (origin == SEEK_CUR)
+        newPosition = m_filePosition + position;
+    else if (origin == SEEK_SET)
+        newPosition = position;
+    else {
+        SetErrorString("BamFtp::Seek", "unsupported seek origin");
+        return false;
+    }
+
+    // ConnectDataSocket() only sends a restart offset for positions >= 0;
+    // a negative one would silently restart from the beginning
+    if (newPosition < 0) {
+        SetErrorString("BamFtp::Seek", "cannot seek to negative file position");
+        return false;
+    }
+
+    // disconnect from server; the sockets are re-established on next read
+    m_dataSocket->DisconnectFromHost();
+    m_commandSocket->DisconnectFromHost();
+
+    // update file position & return success
+    m_filePosition = newPosition;
+    return true;
+}
+
+// Writes `command` to the command socket and, if requested, reads the reply.
+//
+// command:      full command text, including the line terminator.
+// waitForReply: when true, the result of ReceiveReply() is returned.
+//
+// Returns false (with an error string set) if the command socket is not
+// connected or the write reports -1.
+bool BamFtp::SendCommand(const std::string& command, bool waitForReply)
+{
+    // failure if not connected
+    if (!m_commandSocket->IsConnected()) {
+        SetErrorString("BamFtp::SendCommand", "command socket not connected");
+        return false;
+    }
+
+    // write command to 'command socket'
+    const int64_t writeResult = WriteCommandSocket(command.c_str(), command.length());
+    if (writeResult == -1) {
+        SetErrorString("BamFtp::SendCommand", "error writing to socket");
+        return false;
+    }
+
+    // commands that expect a response succeed only if the reply can be read
+    return waitForReply ? ReceiveReply() : true;
+}
+
+// Returns the current file position, or -1 when the device is not open.
+int64_t BamFtp::Tell() const
+{
+    if (!IsOpen()) return -1;
+    return m_filePosition;
+}
+
+int64_t BamFtp::Write(const char* data, const unsigned int numBytes)
+{
+ (void)data;
+ (void)numBytes;
+ BT_ASSERT_X(false, "BamFtp::Write : write-mode not supported on this device");
+ SetErrorString("BamFtp::Write", "write-mode not supported on this device");
+ return -1;
+}
+
+// Clears the command socket's buffer, then writes `numBytes` bytes of `data`
+// to it. Returns the socket's Write() result, or -1 when not connected.
+int64_t BamFtp::WriteCommandSocket(const char* data, const unsigned int numBytes)
+{
+    const bool connected = m_commandSocket->IsConnected();
+    if (!connected) return -1;
+
+    m_commandSocket->ClearBuffer();
+    const int64_t bytesWritten = m_commandSocket->Write(data, numBytes);
+    return bytesWritten;
+}
+
+int64_t BamFtp::WriteDataSocket(const char* data, const unsigned int numBytes)
+{
+ (void)data;
+ (void)numBytes;
+ BT_ASSERT_X(false, "BamFtp::WriteDataSocket: write-mode not supported on this device");
+ SetErrorString("BamFtp::Write", "write-mode not supported on this device");
+ return -1;
+}
diff --git a/src/api/internal/io/BamFtp_p.h b/src/api/internal/io/BamFtp_p.h
new file mode 100644
index 0000000..563299a
--- /dev/null
+++ b/src/api/internal/io/BamFtp_p.h
@@ -0,0 +1,91 @@
+// ***************************************************************************
+// BamFtp_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides reading/writing of BAM files on FTP server
+// ***************************************************************************
+
+#ifndef BAMFTP_P_H
+#define BAMFTP_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <string>
+#include "api/IBamIODevice.h"
+
+namespace BamTools {
+namespace Internal {
+
+class TcpSocket;
+
+// IBamIODevice implementation that talks to an FTP server over two
+// TcpSockets: one for commands, one for file data.
+class BamFtp : public IBamIODevice
+{
+public:
+    // construction & destruction
+    BamFtp(const std::string& url);
+    ~BamFtp();
+
+    // IBamIODevice implementation
+    void Close();
+    bool IsOpen() const;
+    bool IsRandomAccess() const;
+    bool Open(const IBamIODevice::OpenMode mode);
+    int64_t Read(char* data, const unsigned int numBytes);
+    bool Seek(const int64_t& position, const int origin = SEEK_SET);
+    int64_t Tell() const;
+    int64_t Write(const char* data, const unsigned int numBytes);
+
+private:
+    // connection set-up
+    bool ConnectCommandSocket();
+    bool ConnectDataSocket();
+    bool ParsePassiveResponse();
+    void ParseUrl(const std::string& url);
+
+    // raw socket IO & command/reply exchange
+    int64_t ReadCommandSocket(char* data, const unsigned int numBytes);
+    int64_t ReadDataSocket(char* data, const unsigned int numBytes);
+    bool ReceiveReply();
+    bool SendCommand(const std::string& command, bool waitForReply);
+    int64_t WriteCommandSocket(const char* data, const unsigned int numBytes);
+    int64_t WriteDataSocket(const char* data, const unsigned int numBytes);
+
+private:
+    // sockets, allocated in the constructor and deleted in the destructor
+    TcpSocket* m_commandSocket;
+    TcpSocket* m_dataSocket;
+
+    // connection data
+    std::string m_hostname;      // host taken from the URL
+    uint16_t m_port;             // command-channel port
+    std::string m_dataHostname;  // data-channel host from the PASV reply
+    uint16_t m_dataPort;         // data-channel port from the PASV reply
+    std::string m_filename;      // remote path taken from the URL
+
+    // login credentials
+    std::string m_username;
+    std::string m_password;
+
+    // text of the most recent server reply
+    std::string m_response;
+
+    // true once ParseUrl() has accepted the URL
+    bool m_isUrlParsed;
+
+    // current logical file position (-1 when not open)
+    int64_t m_filePosition;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMFTP_P_H
diff --git a/src/api/internal/io/BamHttp_p.cpp b/src/api/internal/io/BamHttp_p.cpp
new file mode 100644
index 0000000..81017be
--- /dev/null
+++ b/src/api/internal/io/BamHttp_p.cpp
@@ -0,0 +1,554 @@
+// ***************************************************************************
+// BamHttp_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 24 July 2013 (DB)
+// ---------------------------------------------------------------------------
+// Provides reading/writing of BAM files on HTTP server
+// ***************************************************************************
+
+#include "api/internal/io/BamHttp_p.h"
+#include "api/BamAux.h"
+#include "api/internal/io/HttpHeader_p.h"
+#include "api/internal/io/TcpSocket_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cstddef>
+#include <cstdlib>
+#include <sstream>
+
+namespace BamTools {
+namespace Internal {
+
+// -----------
+// constants
+// -----------
+
+// connection defaults & URL layout
+static const std::string HTTP_PORT = "80";
+static const std::string HTTP_PREFIX = "http://";
+static const std::size_t HTTP_PREFIX_LENGTH = 7;
+
+// marks the end of a response header
+static const std::string DOUBLE_NEWLINE = "\n\n";
+
+// request methods & header field names
+static const std::string GET_METHOD = "GET";
+static const std::string HEAD_METHOD = "HEAD";
+static const std::string HOST_HEADER = "Host";
+static const std::string RANGE_HEADER = "Range";
+static const std::string BYTES_PREFIX = "bytes=";
+static const std::string CONTENT_LENGTH_HEADER = "Content-Length";
+
+static const char HOST_SEPARATOR = '/';
+static const char PROXY_SEPARATOR = ':';
+
+// -----------------
+// utility methods
+// -----------------
+
+// Returns true if `source` ends with `pattern`.
+static inline bool endsWith(const std::string& source, const std::string& pattern)
+{
+    // a pattern longer than the source can never match; checking this first
+    // also avoids unsigned underflow in the offset computed below
+    if (pattern.length() > source.length()) return false;
+
+    const std::size_t offset = source.length() - pattern.length();
+    return (source.compare(offset, pattern.length(), pattern) == 0);
+}
+
+// Returns a copy of `s` with every character lower-cased.
+static inline std::string toLower(const std::string& s)
+{
+    std::string out;
+    const std::size_t sSize = s.size();
+
+    // resize(), not reserve(): the elements must exist before out[i] may be
+    // written, and the returned string must report the full length
+    out.resize(sSize);
+    for (std::size_t i = 0; i < sSize; ++i) {
+        // tolower() requires a value representable as unsigned char
+        out[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(s[i])));
+    }
+    return out;
+}
+
+} // namespace Internal
+} // namespace BamTools
+
+// ------------------------
+// BamHttp implementation
+// ------------------------
+
+// Allocates the socket, installs the default port, starts with no request
+// or response held and all positions at -1, then parses `url` via
+// ParseUrl(). No connection is made here.
+BamHttp::BamHttp(const std::string& url)
+    : IBamIODevice(),
+      m_socket(new TcpSocket),
+      m_port(HTTP_PORT),
+      m_request(0),
+      m_response(0),
+      m_isUrlParsed(false),
+      m_filePosition(-1),
+      m_fileEndPosition(-1),
+      m_rangeEndPosition(-1)
+{
+    ParseUrl(url);
+}
+
+// Closes the connection and releases the socket.
+BamHttp::~BamHttp()
+{
+    Close();
+
+    // deleting a null pointer is a no-op, so no explicit check is needed
+    delete m_socket;
+}
+
+// Discards the currently held response header, if any.
+void BamHttp::ClearResponse()
+{
+    // deleting a null pointer is a no-op
+    delete m_response;
+    m_response = 0;
+}
+
+// Disconnects the socket, drops any held request/response, and returns the
+// device to its not-open state (positions -1, URL-parsed flag cleared).
+void BamHttp::Close()
+{
+    DisconnectSocket();
+
+    // back to the not-open state
+    m_mode = IBamIODevice::NotOpen;
+    m_isUrlParsed = false;
+    m_filePosition = m_fileEndPosition = m_rangeEndPosition = -1;
+}
+
+// Connects the socket to m_hostname:m_port using the current IO mode.
+// Returns true on success; on failure the socket's error text is stored as
+// this device's error string and false is returned.
+bool BamHttp::ConnectSocket()
+{
+    BT_ASSERT_X(m_socket, "null socket?");
+
+    const bool connected = m_socket->ConnectToHost(m_hostname, m_port, m_mode);
+    if (!connected) SetErrorString("BamHttp::ConnectSocket", m_socket->GetErrorString());
+    return connected;
+}
+
+// Disconnects the socket and frees the held response and request headers.
+void BamHttp::DisconnectSocket()
+{
+    m_socket->DisconnectFromHost();
+    ClearResponse();
+
+    // deleting a null pointer is a no-op
+    delete m_request;
+    m_request = 0;
+}
+
+// Returns true if the socket is already connected; otherwise tries to
+// connect it and returns the outcome of that attempt.
+bool BamHttp::EnsureSocketConnection()
+{
+    return m_socket->IsConnected() || ConnectSocket();
+}
+
+// Open means: the base class reports open AND the URL was parsed successfully.
+bool BamHttp::IsOpen() const
+{
+    if (!IBamIODevice::IsOpen()) return false;
+    return m_isUrlParsed;
+}
+
+// This device always reports itself as random-access.
+bool BamHttp::IsRandomAccess() const
+{
+    const bool randomAccess = true;
+    return randomAccess;
+}
+
+// Opens the HTTP device for reading.
+//
+// mode: must be IBamIODevice::ReadOnly; any other mode is rejected.
+//
+// Connects the socket, zeroes the file positions and sends a HEAD request
+// to check the connection. Returns true on success. On failure an error
+// string is set, the IO mode is reset to NotOpen so that IsOpen() does not
+// report a failed device as open, and false is returned.
+bool BamHttp::Open(const IBamIODevice::OpenMode mode)
+{
+    // BamHttp only supports read-only access
+    if (mode != IBamIODevice::ReadOnly) {
+        SetErrorString("BamHttp::Open", "writing on this device is not supported");
+        return false;
+    }
+
+    // the mode must be set before connecting: ConnectSocket() passes it on
+    m_mode = mode;
+
+    // attempt connection to socket
+    if (!ConnectSocket()) {
+        SetErrorString("BamHttp::Open", m_socket->GetErrorString());
+        m_mode = IBamIODevice::NotOpen;
+        return false;
+    }
+
+    // initialize our file positions
+    m_filePosition = 0;
+    m_fileEndPosition = 0;
+    m_rangeEndPosition = 0;
+
+    // attempt to send initial request (just 'HEAD' to check connection)
+    if (!SendHeadRequest()) {
+        SetErrorString("BamHttp::Open", m_socket->GetErrorString());
+        m_mode = IBamIODevice::NotOpen;
+        return false;
+    }
+
+    // clear response from HEAD request, not needed
+    ClearResponse();
+
+    // return success
+    return true;
+}
+
+// Splits `url` ("http://host/path") into m_hostname and m_filename.
+//
+// m_isUrlParsed is set to true only when the URL starts with "http://"
+// (compared case-insensitively), names a non-empty host without an explicit
+// port, and contains a '/' after the host. On any failure the flag stays
+// false and the host/file members are left untouched.
+void BamHttp::ParseUrl(const std::string& url)
+{
+    // clear flag to start
+    m_isUrlParsed = false;
+
+    // make sure url starts with "http://", case-insensitive. Only the prefix
+    // is compared in lower case: host and path are taken from the original
+    // text, since paths on the server may be case-sensitive.
+    if (url.size() <= HTTP_PREFIX_LENGTH) return;
+    for (std::size_t i = 0; i < HTTP_PREFIX_LENGTH; ++i) {
+        const int lowered = std::tolower(static_cast<unsigned char>(url[i]));
+        if (lowered != static_cast<unsigned char>(HTTP_PREFIX[i])) return;
+    }
+
+    // find end of host name portion (first '/' hit after the prefix);
+    // no slash means no filename was given, and substr(npos) would throw
+    const std::size_t firstSlashFound = url.find(HOST_SEPARATOR, HTTP_PREFIX_LENGTH);
+    if (firstSlashFound == std::string::npos) return;
+
+    // fetch hostname (must be non-empty)
+    const std::string hostname =
+        url.substr(HTTP_PREFIX_LENGTH, (firstSlashFound - HTTP_PREFIX_LENGTH));
+    if (hostname.empty()) return;
+
+    // TODO: handle "host:port". Until then such URLs are rejected instead of
+    // being flagged as parsed while no hostname has been stored.
+    if (hostname.find(PROXY_SEPARATOR) != std::string::npos) return;
+
+    // store host, default port & remainder of URL (starting at the slash)
+    m_hostname = hostname;
+    m_port = HTTP_PORT;
+    m_filename = url.substr(firstSlashFound);
+
+    // set parsed OK flag
+    m_isUrlParsed = true;
+}
+
+// Reads up to `numBytes` bytes into `data`, issuing GET requests as needed.
+//
+// m_filePosition advances by the number of bytes read from the socket.
+// Returns the number of bytes actually read, or -1 if the device is not
+// open, a GET request fails, a socket read fails, or the held response has
+// a status code other than 200 or 206.
+int64_t BamHttp::Read(char* data, const unsigned int numBytes)
+{
+
+ // if BamHttp not in a valid state
+ if (!IsOpen()) return -1;
+
+ int64_t numBytesReadSoFar = 0;
+ while (numBytesReadSoFar < numBytes) {
+
+ const std::size_t remaining = static_cast<std::size_t>(numBytes - numBytesReadSoFar);
+
+ // if we're not holding a valid GET response, get one
+ if (m_response == 0) {
+ if (!SendGetRequest(remaining)) return -1;
+ }
+ BT_ASSERT_X(m_response, "null HTTP response");
+
+ // check response status code
+ const int statusCode = m_response->GetStatusCode();
+
+ // if we received full file contents in response
+ if (statusCode == 200) {
+
+ // try to read 'remaining' bytes from socket
+ const int64_t socketBytesRead = ReadFromSocket(data + numBytesReadSoFar, remaining);
+
+ // if error
+ if (socketBytesRead < 0) {
+ SetErrorString("BamHttp::Read", m_socket->GetErrorString());
+ return -1;
+ }
+
+ // EOF
+ else if (socketBytesRead == 0)
+ return numBytesReadSoFar;
+
+ // update counters
+ numBytesReadSoFar += socketBytesRead;
+ m_filePosition += socketBytesRead;
+
+ }
+
+ // else if we received a range of bytes in response
+ else if (statusCode == 206) {
+
+ // if we've exhausted the last request, ask for the next range
+ // (nothing is read this iteration; the loop comes back around)
+ if (m_filePosition == m_rangeEndPosition) {
+ if (!SendGetRequest(remaining)) return -1;
+ }
+
+ else {
+
+ // try to read 'remaining' bytes from socket
+ const int64_t socketBytesRead = ReadFromSocket(data + numBytesReadSoFar, remaining);
+
+ // if error
+ if (socketBytesRead < 0) {
+ SetErrorString("BamHttp::Read", m_socket->GetErrorString());
+ return -1;
+ }
+
+ // maybe EOF
+ else if (socketBytesRead == 0) {
+
+ // if we know we're not at end position, fire off a new request
+ if (m_fileEndPosition > 0 && m_filePosition < m_fileEndPosition) {
+ if (!SendGetRequest()) return -1;
+ } else
+ return numBytesReadSoFar;
+ }
+
+ // update counters (adds zero when a new request was just sent above)
+ numBytesReadSoFar += socketBytesRead;
+ m_filePosition += socketBytesRead;
+ }
+ }
+
+ // else some other HTTP status
+ else {
+ SetErrorString("BamHttp::Read", "unsupported status code in response");
+ return -1;
+ }
+ }
+
+ // return actual number of bytes read
+ return numBytesReadSoFar;
+}
+
+// Forwards to the socket's Read(); returns whatever it returns.
+int64_t BamHttp::ReadFromSocket(char* data, const unsigned int maxNumBytes)
+{
+    const int64_t bytesRead = m_socket->Read(data, maxNumBytes);
+    return bytesRead;
+}
+
+// Reads a response header from the socket, line by line, until the text
+// ends in a double newline, and stores the parsed header in m_response.
+//
+// Returns false if waiting for a line fails. If the header is empty or
+// cannot be parsed, an error string is set, Close() is called and false is
+// returned. Returns true otherwise.
+bool BamHttp::ReceiveResponse()
+{
+    // accumulate header lines up to (and including) the blank line
+    std::string headerText;
+    for (;;) {
+
+        // make sure we can read a line
+        if (!m_socket->WaitForReadLine()) return false;
+
+        headerText += m_socket->ReadLine();
+        if (endsWith(headerText, DOUBLE_NEWLINE)) break;
+    }
+
+    // sanity check
+    if (headerText.empty()) {
+        SetErrorString("BamHttp::ReceiveResponse", "empty HTTP response");
+        Close();
+        return false;
+    }
+
+    // create response from header text
+    m_response = new HttpResponseHeader(headerText);
+    const bool parsedOk = m_response->IsValid();
+    if (!parsedOk) {
+        SetErrorString("BamHttp::ReceiveResponse", "could not parse HTTP response");
+        Close();
+    }
+    return parsedOk;
+}
+
+// Moves the logical file position and resets the connection so the next
+// Read() issues a fresh GET from there.
+//
+// position: offset, relative to `origin`.
+// origin:   SEEK_SET or SEEK_CUR; anything else is rejected.
+//
+// Returns true on success. Returns false with an error string set when the
+// device is not open, the origin is unsupported, the target position is
+// negative, or the socket cannot be reconnected. The position is only
+// updated on success.
+bool BamHttp::Seek(const int64_t& position, const int origin)
+{
+    // if HTTP device not in a valid state
+    if (!IsOpen()) {
+        SetErrorString("BamHttp::Seek", "cannot seek on unopen connection");
+        return false;
+    }
+
+    // work out the target first, so a bad request does not tear down a
+    // perfectly good connection
+    int64_t newPosition = 0;
+    switch (origin) {
+        case SEEK_CUR:
+            newPosition = m_filePosition + position;
+            break;
+        case SEEK_SET:
+            newPosition = position;
+            break;
+        default:
+            SetErrorString("BamHttp::Seek", "unsupported seek origin");
+            return false;
+    }
+
+    // a negative offset cannot be expressed in the byte range of a GET
+    if (newPosition < 0) {
+        SetErrorString("BamHttp::Seek", "cannot seek to negative file position");
+        return false;
+    }
+
+    // reset the connection
+    DisconnectSocket();
+    if (!ConnectSocket()) {
+        SetErrorString("BamHttp::Seek", m_socket->GetErrorString());
+        return false;
+    }
+
+    // update file position & return success
+    m_filePosition = newPosition;
+    return true;
+}
+
+// Sends a ranged GET for m_filename starting at m_filePosition and reads
+// the response header into m_response.
+//
+// numBytes: number of bytes wanted; the requested range spans at least
+//           0x10000 bytes.
+//
+// Returns true when a 206 response arrived (m_rangeEndPosition is updated
+// if the response carries Content-Length), or when a 200 response arrived
+// and its body could be skipped forward to m_filePosition. Returns false
+// otherwise; on failures after the request was written, Close() is called.
+bool BamHttp::SendGetRequest(const std::size_t numBytes)
+{
+    // clear previous data. m_request must be nulled after deletion: an early
+    // return below would otherwise leave a dangling pointer to be deleted
+    // again later.
+    ClearResponse();
+    delete m_request;
+    m_request = 0;
+    m_socket->ClearBuffer();
+
+    // make sure we're connected
+    if (!EnsureSocketConnection()) return false;
+
+    // create range string
+    const int64_t endPosition =
+        m_filePosition + std::max(static_cast<std::size_t>(0x10000), numBytes);
+    std::stringstream range;
+    range << BYTES_PREFIX << m_filePosition << '-' << endPosition;
+
+    // create request
+    m_request = new HttpRequestHeader(GET_METHOD, m_filename);
+    m_request->SetField(HOST_HEADER, m_hostname);
+    m_request->SetField(RANGE_HEADER, range.str());
+
+    // send request
+    const std::string requestHeader = m_request->ToString();
+    const int64_t headerSize = requestHeader.size();
+    if (WriteToSocket(requestHeader.c_str(), headerSize) != headerSize) {
+        SetErrorString("BamHttp::SendGetRequest", m_socket->GetErrorString());
+        return false;
+    }
+
+    // ensure clean buffer
+    m_socket->ClearBuffer();
+
+    // wait for response
+    if (!ReceiveResponse()) {
+        SetErrorString("BamHttp::SendGetRequest", m_socket->GetErrorString());
+        Close();
+        return false;
+    }
+    BT_ASSERT_X(m_response, "BamHttp::SendGetRequest : null HttpResponse");
+    BT_ASSERT_X(m_response->IsValid(), "BamHttp::SendGetRequest : invalid HttpResponse");
+
+    // check response status code
+    const int statusCode = m_response->GetStatusCode();
+    switch (statusCode) {
+
+        // ranged response, as requested
+        case 206:
+            // get content length if available
+            if (m_response->ContainsKey(CONTENT_LENGTH_HEADER)) {
+                const std::string contentLengthString = m_response->GetValue(CONTENT_LENGTH_HEADER);
+                m_rangeEndPosition = m_filePosition + std::atoi(contentLengthString.c_str());
+            }
+            return true;
+
+        // full contents, not range
+        case 200: {
+            // skip up to current file position
+            RaiiBuffer tmp(0x8000);
+            int64_t numBytesRead = 0;
+            while (numBytesRead < m_filePosition) {
+
+                // read data from response, at most one buffer's worth at a time
+                const int64_t remaining = m_filePosition - numBytesRead;
+                const std::size_t bytesToRead =
+                    static_cast<std::size_t>((remaining > 0x8000) ? 0x8000 : remaining);
+                const int64_t socketBytesRead = ReadFromSocket(tmp.Buffer, bytesToRead);
+
+                // if error
+                if (socketBytesRead < 0) {
+                    SetErrorString("BamHttp::SendGetRequest", m_socket->GetErrorString());
+                    Close();
+                    return false;
+                }
+
+                // else if EOF
+                else if (socketBytesRead == 0 && m_socket->BufferBytesAvailable() == 0)
+                    break;
+
+                // update byte counter
+                numBytesRead += socketBytesRead;
+            }
+
+            // success only if we managed to skip all the way to the file position
+            return (numBytesRead == m_filePosition);
+        }
+
+        // any other status codes
+        default:
+            break;
+    }
+
+    // fail on unexpected status code
+    SetErrorString("BamHttp::SendGetRequest", "unsupported status code in response");
+    Close();
+    return false;
+}
+
+// Sends a HEAD request for m_filename and reads the response header into
+// m_response. If the response carries Content-Length, m_fileEndPosition is
+// set to that length minus one.
+//
+// Returns false if the socket cannot be connected, the request cannot be
+// written in full, or no response can be read (Close() is called in that
+// last case). Otherwise returns whether the socket reports no error.
+bool BamHttp::SendHeadRequest()
+{
+    // ensure clean slate. m_request must be nulled after deletion: an early
+    // return below would otherwise leave a dangling pointer to be deleted
+    // again later.
+    ClearResponse();
+    delete m_request;
+    m_request = 0;
+    m_socket->ClearBuffer();
+
+    // make sure we're connected
+    if (!EnsureSocketConnection()) return false;
+
+    // create request
+    m_request = new HttpRequestHeader(HEAD_METHOD, m_filename);
+    m_request->SetField(HOST_HEADER, m_hostname);
+
+    // send request
+    const std::string requestHeader = m_request->ToString();
+    const int64_t headerSize = requestHeader.size();
+    if (WriteToSocket(requestHeader.c_str(), headerSize) != headerSize) {
+        SetErrorString("BamHttp::SendHeadRequest", m_socket->GetErrorString());
+        return false;
+    }
+
+    m_socket->ClearBuffer();
+
+    // wait for response from server
+    if (!ReceiveResponse()) {
+        SetErrorString("BamHttp::SendHeadRequest", m_socket->GetErrorString());
+        Close();
+        return false;
+    }
+    BT_ASSERT_X(m_response, "BamHttp::SendHeadRequest : null HttpResponse");
+    BT_ASSERT_X(m_response->IsValid(), "BamHttp::SendHeadRequest : invalid HttpResponse");
+
+    // get content length if available
+    if (m_response->ContainsKey(CONTENT_LENGTH_HEADER)) {
+        const std::string contentLengthString = m_response->GetValue(CONTENT_LENGTH_HEADER);
+
+        // parse as 64-bit: atoi() overflows for files of 2 GiB or more.
+        // An unparsable value is treated as 0, as atoi() did.
+        int64_t contentLength = 0;
+        std::istringstream lengthParser(contentLengthString);
+        if (!(lengthParser >> contentLength)) contentLength = 0;
+        m_fileEndPosition = contentLength - 1;
+    }
+
+    // return whether we found any errors
+    return m_socket->GetError() == TcpSocket::NoError;
+}
+
+// Returns the current file position, or -1 when the device is not open.
+int64_t BamHttp::Tell() const
+{
+    if (!IsOpen()) return -1;
+    return m_filePosition;
+}
+
+int64_t BamHttp::Write(const char* data, const unsigned int numBytes)
+{
+ (void)data;
+ (void)numBytes;
+ BT_ASSERT_X(false, "BamHttp::Write : write-mode not supported on this device");
+ SetErrorString("BamHttp::Write", "write-mode not supported on this device");
+ return -1;
+}
+
+// Clears the socket's buffer, then writes `numBytes` bytes of `data` to it.
+// Returns the socket's Write() result, or -1 when not connected.
+int64_t BamHttp::WriteToSocket(const char* data, const unsigned int numBytes)
+{
+    const bool connected = m_socket->IsConnected();
+    if (!connected) return -1;
+
+    m_socket->ClearBuffer();
+    const int64_t bytesWritten = m_socket->Write(data, numBytes);
+    return bytesWritten;
+}
diff --git a/src/api/internal/io/BamHttp_p.h b/src/api/internal/io/BamHttp_p.h
new file mode 100644
index 0000000..62d0d7b
--- /dev/null
+++ b/src/api/internal/io/BamHttp_p.h
@@ -0,0 +1,92 @@
+// ***************************************************************************
+// BamHttp_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides reading of BAM files on HTTP server (write-mode is not supported)
+// ***************************************************************************
+
+#ifndef BAMHTTP_P_H
+#define BAMHTTP_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <cstddef>
+#include <string>
+#include "api/IBamIODevice.h"
+
+namespace BamTools {
+namespace Internal {
+
+class HttpRequestHeader;
+class HttpResponseHeader;
+class TcpSocket;
+
+// IBamIODevice implementation that talks to an HTTP server through a
+// TcpSocket, using HttpRequestHeader / HttpResponseHeader objects for the
+// most recent request & response.
+class BamHttp : public IBamIODevice
+{
+
+ // ctor & dtor
+public:
+ BamHttp(const std::string& url);
+ ~BamHttp();
+
+ // IBamIODevice implementation
+public:
+ void Close();
+ bool IsOpen() const;
+ bool IsRandomAccess() const;
+ bool Open(const IBamIODevice::OpenMode mode);
+ int64_t Read(char* data, const unsigned int numBytes);
+ bool Seek(const int64_t& position, const int origin = SEEK_SET);
+ // returns the current file position, or -1 if the device is not open
+ int64_t Tell() const;
+ // unsupported on this device: sets an error string and returns -1
+ int64_t Write(const char* data, const unsigned int numBytes);
+
+ // internal methods
+private:
+ void ClearResponse();
+ bool ConnectSocket();
+ void DisconnectSocket();
+ bool EnsureSocketConnection();
+ void ParseUrl(const std::string& url);
+ int64_t ReadFromSocket(char* data, const unsigned int numBytes);
+ bool ReceiveResponse();
+ // 'numBytes' defaults to 0x10000 (64 KiB)
+ bool SendGetRequest(const std::size_t numBytes = 0x10000);
+ // issues a HEAD request; when the response carries a content length,
+ // m_fileEndPosition is set to that length minus one
+ bool SendHeadRequest();
+ // returns -1 if the socket is not connected, else the socket's write result
+ int64_t WriteToSocket(const char* data, const unsigned int numBytes);
+
+ // data members
+private:
+ // our main socket
+ TcpSocket* m_socket;
+
+ // our connection data
+ std::string m_hostname;
+ std::string m_port;
+ std::string m_filename;
+
+ // our last (active) request & response info
+ // NOTE(review): raw pointers; m_request is deleted & re-created with 'new'
+ // when a HEAD request is sent - copy semantics of this class are not
+ // visible here, confirm the object is never copied
+ HttpRequestHeader* m_request;
+ HttpResponseHeader* m_response;
+
+ // internal state flags
+ bool m_isUrlParsed;
+
+ // file position
+ int64_t m_filePosition;
+ int64_t m_fileEndPosition;
+ int64_t m_rangeEndPosition;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMHTTP_P_H
diff --git a/src/api/internal/io/BamPipe_p.cpp b/src/api/internal/io/BamPipe_p.cpp
new file mode 100644
index 0000000..3dd2c94
--- /dev/null
+++ b/src/api/internal/io/BamPipe_p.cpp
@@ -0,0 +1,73 @@
+// ***************************************************************************
+// BamPipe_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 18 October 2012 (DB)
+// ---------------------------------------------------------------------------
+// Provides BAM pipe-specific IO behavior
+// ***************************************************************************
+
+#include "api/internal/io/BamPipe_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+#include <iostream>
+
+// Constructs a pipe device. Does nothing beyond default-constructing the
+// ILocalIODevice base; the stream itself is chosen later, in Open().
+BamPipe::BamPipe()
+ : ILocalIODevice()
+{}
+
+// Destructor: no pipe-specific cleanup is performed here.
+BamPipe::~BamPipe() {}
+
+// A pipe can only be consumed sequentially, so this always returns false.
+bool BamPipe::IsRandomAccess() const
+{
+ return false;
+}
+
+// Attaches this device to stdin (ReadOnly) or stdout (WriteOnly).
+//
+// Any previously opened stream is closed first. Returns true on success.
+// Returns false, with an error string set, when 'mode' is neither ReadOnly
+// nor WriteOnly or when no valid FILE* could be obtained.
+bool BamPipe::Open(const IBamIODevice::OpenMode mode)
+{
+
+ // make sure we're starting with a fresh pipe
+ Close();
+
+ // open stdin/stdout depending on requested openmode
+ // NOTE: both preprocessor branches end in an 'else if' without a trailing
+ // 'else'; the 'else' that follows #endif completes whichever chain is compiled
+#if defined(SYSTEM_NODEJS) && SYSTEM_NODEJS == 1
+ // use the standard streams as-is
+ if (mode == IBamIODevice::ReadOnly)
+ m_stream = stdin;
+ else if (mode == IBamIODevice::WriteOnly)
+ m_stream = stdout;
+#else
+ // a null filename asks freopen to change the mode of the existing
+ // stream - here, to binary
+ if (mode == IBamIODevice::ReadOnly)
+ m_stream = freopen(0, "rb", stdin);
+ else if (mode == IBamIODevice::WriteOnly)
+ m_stream = freopen(0, "wb", stdout);
+#endif // SYSTEM_NODEJS
+
+ // any other mode: ReadWrite is reported as "unsupported", anything else as "unknown"
+ else {
+ const std::string errorType =
+ std::string((mode == IBamIODevice::ReadWrite) ? "unsupported" : "unknown");
+ const std::string message = errorType + " open mode requested";
+ SetErrorString("BamPipe::Open", message);
+ return false;
+ }
+
+ // check that we obtained a valid FILE*
+ if (m_stream == 0) {
+ const std::string message_base = std::string("could not open handle on ");
+ const std::string message =
+ message_base + ((mode == IBamIODevice::ReadOnly) ? "stdin" : "stdout");
+ SetErrorString("BamPipe::Open", message);
+ return false;
+ }
+
+ // store current IO mode & return success
+ m_mode = mode;
+ return true;
+}
+
+// Seeking is not possible on a pipe: both arguments are ignored, an error
+// string is recorded, and false is always returned.
+bool BamPipe::Seek(const int64_t&, const int)
+{
+ SetErrorString("BamPipe::Seek", "random access not allowed in FIFO pipe");
+ return false;
+}
diff --git a/src/api/internal/io/BamPipe_p.h b/src/api/internal/io/BamPipe_p.h
new file mode 100644
index 0000000..764823d
--- /dev/null
+++ b/src/api/internal/io/BamPipe_p.h
@@ -0,0 +1,47 @@
+// ***************************************************************************
+// BamPipe_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides BAM pipe-specific IO behavior
+// ***************************************************************************
+
+#ifndef BAMPIPE_P_H
+#define BAMPIPE_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <string>
+#include "api/internal/io/ILocalIODevice_p.h"
+
+namespace BamTools {
+namespace Internal {
+
+// ILocalIODevice specialization for stdin/stdout pipes.
+// Overrides only the behavior that differs for a pipe: it is never
+// random-access, Open() selects stdin or stdout, and Seek() always fails.
+class BamPipe : public ILocalIODevice
+{
+
+ // ctor & dtor
+public:
+ BamPipe();
+ ~BamPipe();
+
+ // IBamIODevice implementation
+public:
+ // always false
+ bool IsRandomAccess() const;
+ // ReadOnly -> stdin, WriteOnly -> stdout; other modes fail
+ bool Open(const IBamIODevice::OpenMode mode);
+ // always fails (sets an error string and returns false)
+ bool Seek(const int64_t& position, const int origin = SEEK_SET);
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMPIPE_P_H
diff --git a/src/api/internal/io/BgzfStream_p.cpp b/src/api/internal/io/BgzfStream_p.cpp
new file mode 100644
index 0000000..1adf87e
--- /dev/null
+++ b/src/api/internal/io/BgzfStream_p.cpp
@@ -0,0 +1,468 @@
+// ***************************************************************************
+// BgzfStream_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 17 January 2012(DB)
+// ---------------------------------------------------------------------------
+// Based on BGZF routines developed at the Broad Institute.
+// Provides the basic functionality for reading & writing BGZF files
+// Replaces the old BGZF.* files to avoid clashing with other toolkits
+// ***************************************************************************
+
+#include "api/internal/io/BgzfStream_p.h"
+#include "api/BamAux.h"
+#include "api/BamConstants.h"
+#include "api/internal/io/BamDeviceFactory_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <zlib.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+
+// ---------------------------
+// BgzfStream implementation
+// ---------------------------
+
+// constructor
+// Starts with no IO device attached, all block counters at zero and
+// compressed output enabled. The two block buffers are constructed with
+// BGZF_DEFAULT_BLOCK_SIZE (uncompressed) and BGZF_MAX_BLOCK_SIZE (compressed).
+BgzfStream::BgzfStream()
+ : m_blockLength(0)
+ , m_blockOffset(0)
+ , m_blockAddress(0)
+ , m_isWriteCompressed(true)
+ , m_device(0)
+ , m_uncompressedBlock(Constants::BGZF_DEFAULT_BLOCK_SIZE)
+ , m_compressedBlock(Constants::BGZF_MAX_BLOCK_SIZE)
+{}
+
+// destructor
+// Closes the stream, which in write mode flushes pending data first.
+//
+// Close() can throw BamException (through FlushBlock / DeflateBlock). An
+// exception escaping a destructor terminates the program, so any failure is
+// swallowed here. Callers that need to observe flush errors should call
+// Close() explicitly before the stream is destroyed.
+BgzfStream::~BgzfStream()
+{
+    try {
+        Close();
+    } catch (...) {
+        // best effort only: an error cannot be reported from a destructor
+    }
+}
+
+// checks BGZF block header
+// Inspects bytes 0-15 of 'header' and returns true only if every fixed
+// field carries the value expected for a BGZF block.
+bool BgzfStream::CheckBlockHeader(char* header)
+{
+    // gzip magic bytes & compression method
+    if (header[0] != Constants::GZIP_ID1) return false;
+    if (header[1] != Constants::GZIP_ID2) return false;
+    if (header[2] != Z_DEFLATED) return false;
+
+    // the 'extra field' flag has to be set
+    if ((header[3] & Constants::FLG_FEXTRA) == 0) return false;
+
+    // extra-field length, BGZF subfield identifiers & subfield length
+    if (BamTools::UnpackUnsignedShort(&header[10]) != Constants::BGZF_XLEN) return false;
+    if (header[12] != Constants::BGZF_ID1) return false;
+    if (header[13] != Constants::BGZF_ID2) return false;
+    return BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN;
+}
+
+// closes BGZF file
+// In write mode, flushes buffered data and appends an empty block as EOF
+// marker. Then closes & deletes the IO device, clears both block buffers
+// and resets the stream state. Does nothing if no device is attached.
+void BgzfStream::Close()
+{
+    // nothing to close without a device
+    if (!m_device) return;
+
+    // when writing: emit what is still buffered, followed by the
+    // empty block that marks end-of-file
+    const bool isWriting =
+        m_device->IsOpen() && (m_device->Mode() == IBamIODevice::WriteOnly);
+    if (isWriting) {
+        FlushBlock();
+        const std::size_t eofBlockSize = DeflateBlock(0);
+        m_device->Write(m_compressedBlock.Buffer, eofBlockSize);
+    }
+
+    // shut down & release the device
+    m_device->Close();
+    delete m_device;
+    m_device = 0;
+
+    // wipe the block buffers
+    m_uncompressedBlock.Clear();
+    m_compressedBlock.Clear();
+
+    // back to the freshly-constructed state
+    m_isWriteCompressed = true;
+    m_blockAddress = 0;
+    m_blockOffset = 0;
+    m_blockLength = 0;
+}
+
+// compresses the current block
+//
+// Deflates the first 'blockLength' bytes of m_uncompressedBlock into
+// m_compressedBlock, framed by a BGZF (gzip) header and footer.
+//
+// If the compressed output does not fit, the input is shortened in
+// 1024-byte steps and compression is retried. Bytes that were left out are
+// moved to the front of m_uncompressedBlock, and m_blockOffset is set to
+// their count (0 when everything was compressed).
+//
+// Returns the total size of the compressed block (header + data + footer).
+// Throws BamException if zlib reports an error, if the input cannot be
+// reduced far enough, or if the leftover is larger than what was compressed.
+std::size_t BgzfStream::DeflateBlock(int32_t blockLength)
+{
+
+    // initialize the gzip header
+    char* buffer = m_compressedBlock.Buffer;
+    memset(buffer, 0, 18);
+    buffer[0] = Constants::GZIP_ID1;
+    buffer[1] = Constants::GZIP_ID2;
+    buffer[2] = Constants::CM_DEFLATE;
+    buffer[3] = Constants::FLG_FEXTRA;
+    buffer[9] = Constants::OS_UNKNOWN;
+    buffer[10] = Constants::BGZF_XLEN;
+    buffer[12] = Constants::BGZF_ID1;
+    buffer[13] = Constants::BGZF_ID2;
+    buffer[14] = Constants::BGZF_LEN;
+
+    // set compression level (0 = store without compressing)
+    const int compressionLevel = (m_isWriteCompressed ? Z_DEFAULT_COMPRESSION : 0);
+
+    // loop to retry for blocks that do not compress enough
+    int inputLength = blockLength;
+    std::size_t compressedLength = 0;
+    const unsigned int bufferSize = Constants::BGZF_MAX_BLOCK_SIZE;
+
+    while (true) {
+
+        // initialize zstream values
+        // (zlib requires zalloc, zfree AND opaque to be set before init)
+        z_stream zs;
+        zs.zalloc = NULL;
+        zs.zfree = NULL;
+        zs.opaque = NULL;
+        zs.next_in = (Bytef*)m_uncompressedBlock.Buffer;
+        zs.avail_in = inputLength;
+        zs.next_out = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH];
+        zs.avail_out =
+            bufferSize - Constants::BGZF_BLOCK_HEADER_LENGTH - Constants::BGZF_BLOCK_FOOTER_LENGTH;
+
+        // initialize the zlib compression algorithm
+        int status = deflateInit2(&zs, compressionLevel, Z_DEFLATED, Constants::GZIP_WINDOW_BITS,
+                                  Constants::Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+        if (status != Z_OK)
+            throw BamException("BgzfStream::DeflateBlock", "zlib deflateInit2 failed");
+
+        // compress the data
+        status = deflate(&zs, Z_FINISH);
+
+        // if not at stream end
+        if (status != Z_STREAM_END) {
+
+            // release zlib state before retrying or throwing
+            deflateEnd(&zs);
+
+            // there was not enough space available in buffer
+            // try to reduce the input length & re-start loop
+            if (status == Z_OK) {
+                inputLength -= 1024;
+                if (inputLength < 0)
+                    throw BamException("BgzfStream::DeflateBlock", "input reduction failed");
+                continue;
+            }
+
+            throw BamException("BgzfStream::DeflateBlock", "zlib deflate failed");
+        }
+
+        // finalize the compression routine
+        status = deflateEnd(&zs);
+        if (status != Z_OK)
+            throw BamException("BgzfStream::DeflateBlock", "zlib deflateEnd failed");
+
+        // update compressedLength
+        compressedLength = zs.total_out + Constants::BGZF_BLOCK_HEADER_LENGTH +
+                           Constants::BGZF_BLOCK_FOOTER_LENGTH;
+        if (compressedLength > Constants::BGZF_MAX_BLOCK_SIZE)
+            throw BamException("BgzfStream::DeflateBlock", "deflate overflow");
+
+        // quit while loop
+        break;
+    }
+
+    // store the compressed length (the header field holds 'total size - 1')
+    BamTools::PackUnsignedShort(&buffer[16], static_cast<uint16_t>(compressedLength - 1));
+
+    // store the CRC32 checksum & uncompressed size in the 8-byte footer
+    uint32_t crc = crc32(0, NULL, 0);
+    crc = crc32(crc, (Bytef*)m_uncompressedBlock.Buffer, inputLength);
+    BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc);
+    BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
+
+    // ensure that we have less than a block of data left
+    int remaining = blockLength - inputLength;
+    if (remaining > 0) {
+        if (remaining > inputLength)
+            throw BamException("BgzfStream::DeflateBlock", "after deflate, remainder too large");
+        // remaining <= inputLength, so source & destination cannot overlap
+        memcpy(m_uncompressedBlock.Buffer, m_uncompressedBlock.Buffer + inputLength, remaining);
+    }
+
+    // update block data
+    m_blockOffset = remaining;
+
+    // return result
+    return compressedLength;
+}
+
+// flushes the data in the BGZF block
+// Compresses buffered data and writes it to the device until nothing is
+// pending. Throws BamException on a device error or an incomplete write.
+void BgzfStream::FlushBlock()
+{
+    BT_ASSERT_X(m_device, "BgzfStream::FlushBlock() - attempting to flush to null device");
+
+    // DeflateBlock() sets m_blockOffset to the number of bytes it could not
+    // fit, so keep going until that reaches zero
+    while (m_blockOffset > 0) {
+
+        // compress the pending bytes, then hand the result to the device
+        const std::size_t compressedSize = DeflateBlock(m_blockOffset);
+        const int64_t written = m_device->Write(m_compressedBlock.Buffer, compressedSize);
+
+        // a negative count signals a device-level failure
+        if (written < 0) {
+            throw BamException("BgzfStream::FlushBlock",
+                               std::string("device error: ") + m_device->GetErrorString());
+        }
+
+        // anything short of a complete write is an error as well
+        if (written != static_cast<int64_t>(compressedSize)) {
+            std::stringstream report;
+            report << "expected to write " << compressedSize
+                   << " bytes during flushing, but wrote " << written;
+            throw BamException("BgzfStream::FlushBlock", report.str());
+        }
+
+        // advance the block address past the block just written
+        m_blockAddress += compressedSize;
+    }
+}
+
+// decompresses the current block
+//
+// Inflates the block held in m_compressedBlock ('blockLength' bytes in
+// total, header included) into m_uncompressedBlock.
+// Returns the number of uncompressed bytes produced.
+// Throws BamException if any zlib call fails.
+std::size_t BgzfStream::InflateBlock(const std::size_t& blockLength)
+{
+
+    // setup zlib stream object
+    // (zlib requires zalloc, zfree AND opaque to be set before init)
+    z_stream zs;
+    zs.zalloc = NULL;
+    zs.zfree = NULL;
+    zs.opaque = NULL;
+    // compressed payload starts right after the 18-byte block header
+    zs.next_in = (Bytef*)m_compressedBlock.Buffer + 18;
+    // NOTE(review): starting at offset 18, 'blockLength - 16' bytes reach two
+    // bytes past the end of the block; inflate stops at the end of the deflate
+    // stream, but confirm whether 'blockLength - 18 - 8' was intended
+    zs.avail_in = blockLength - 16;
+    zs.next_out = (Bytef*)m_uncompressedBlock.Buffer;
+    zs.avail_out = Constants::BGZF_DEFAULT_BLOCK_SIZE;
+
+    // initialize
+    int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS);
+    if (status != Z_OK) throw BamException("BgzfStream::InflateBlock", "zlib inflateInit failed");
+
+    // decompress (the whole block must inflate in this single call)
+    status = inflate(&zs, Z_FINISH);
+    if (status != Z_STREAM_END) {
+        inflateEnd(&zs);
+        throw BamException("BgzfStream::InflateBlock", "zlib inflate failed");
+    }
+
+    // finalize
+    // (inflateEnd has already been called on this stream at this point,
+    //  so it must not be called a second time when it reports failure)
+    status = inflateEnd(&zs);
+    if (status != Z_OK) throw BamException("BgzfStream::InflateBlock", "zlib inflateEnd failed");
+
+    // return result
+    return zs.total_out;
+}
+
+// Returns true when an IO device is attached and that device reports open.
+bool BgzfStream::IsOpen() const
+{
+    return (m_device != 0) && m_device->IsOpen();
+}
+
+// Opens 'filename' in the requested mode.
+// Any currently attached device is closed first; a new device is then
+// obtained from BamDeviceFactory and opened.
+// Throws BamException (carrying the device's error string) if the device
+// fails to open.
+void BgzfStream::Open(const std::string& filename, const IBamIODevice::OpenMode mode)
+{
+    // start from a clean slate
+    Close();
+    BT_ASSERT_X((m_device == 0),
+                "BgzfStream::Open() - unable to properly close previous IO device");
+
+    // let the factory pick the device type for this filename
+    m_device = BamDeviceFactory::CreateDevice(filename);
+    BT_ASSERT_X(m_device, "BgzfStream::Open() - unable to create IO device from filename");
+
+    // done, if the device opens cleanly
+    if (m_device->Open(mode)) return;
+
+    // otherwise pass the device's own error text along
+    std::string message("could not open BGZF stream: \n\t");
+    message += m_device->GetErrorString();
+    throw BamException("BgzfStream::Open", message);
+}
+
+// reads BGZF data into a byte buffer
+//
+// Copies up to 'dataLength' uncompressed bytes into 'data', loading and
+// inflating further blocks through ReadBlock() whenever the current block
+// is used up.
+// Returns the number of bytes actually copied. This is less than
+// 'dataLength' when no more data is available, and 0 when 'dataLength' is 0
+// or the device is not open in read-only mode.
+// Exceptions thrown by ReadBlock() propagate to the caller.
+std::size_t BgzfStream::Read(char* data, const std::size_t dataLength)
+{
+
+ if (dataLength == 0) return 0;
+
+ // if stream not open for reading
+ BT_ASSERT_X(m_device, "BgzfStream::Read() - trying to read from null device");
+ if (!m_device->IsOpen() || (m_device->Mode() != IBamIODevice::ReadOnly)) return 0;
+
+ // read blocks as needed until desired data length is retrieved
+ char* output = data;
+ std::size_t numBytesRead = 0;
+ while (numBytesRead < dataLength) {
+
+ // determine bytes available in current block
+ int bytesAvailable = m_blockLength - m_blockOffset;
+
+ // read (and decompress) next block if needed
+ if (bytesAvailable <= 0) {
+ ReadBlock();
+ bytesAvailable = m_blockLength - m_blockOffset;
+ // still nothing available after loading a block: stop with a short read
+ if (bytesAvailable <= 0) break;
+ }
+
+ // copy data from uncompressed source buffer into data destination buffer
+ const std::size_t copyLength =
+ std::min((dataLength - numBytesRead), static_cast<std::size_t>(bytesAvailable));
+ memcpy(output, m_uncompressedBlock.Buffer + m_blockOffset, copyLength);
+
+ // update counters
+ m_blockOffset += copyLength;
+ output += copyLength;
+ numBytesRead += copyLength;
+ }
+
+ // update block data
+ // (current block fully consumed: record the device position as the next
+ // block address and mark the uncompressed buffer as empty)
+ if (m_blockOffset == m_blockLength) {
+ m_blockAddress = m_device->Tell();
+ m_blockOffset = 0;
+ m_blockLength = 0;
+ }
+
+ // return actual number of bytes read
+ return numBytesRead;
+}
+
+// reads a BGZF block
+//
+// Reads the next block (header first, then the remainder) from the device
+// into m_compressedBlock, inflates it into m_uncompressedBlock and updates
+// m_blockAddress & m_blockLength. When the device has no more data,
+// m_blockLength is set to 0 and the function returns quietly.
+//
+// Throws BamException on device errors, a truncated or malformed header, a
+// block length that cannot be valid, a short read, or a zlib failure (via
+// InflateBlock).
+void BgzfStream::ReadBlock()
+{
+
+    BT_ASSERT_X(m_device, "BgzfStream::ReadBlock() - trying to read from null IO device");
+
+    // store block's starting address
+    const int64_t blockAddress = m_device->Tell();
+
+    // read block header from file
+    char header[Constants::BGZF_BLOCK_HEADER_LENGTH];
+    int64_t numBytesRead = m_device->Read(header, Constants::BGZF_BLOCK_HEADER_LENGTH);
+
+    // check for device error
+    if (numBytesRead < 0) {
+        const std::string message = std::string("device error: ") + m_device->GetErrorString();
+        throw BamException("BgzfStream::ReadBlock", message);
+    }
+
+    // if block header empty
+    if (numBytesRead == 0) {
+        m_blockLength = 0;
+        return;
+    }
+
+    // if block header invalid size
+    // (compare in the type of 'numBytesRead' rather than narrowing to int8_t)
+    if (numBytesRead != static_cast<int64_t>(Constants::BGZF_BLOCK_HEADER_LENGTH))
+        throw BamException("BgzfStream::ReadBlock", "invalid block header size");
+
+    // validate block header contents
+    if (!BgzfStream::CheckBlockHeader(header))
+        throw BamException("BgzfStream::ReadBlock", "invalid block header contents");
+
+    // total block size, as recorded in the header (stored as 'size - 1')
+    const std::size_t blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1;
+
+    // the length comes straight from the file, so validate it before use:
+    // it must cover at least header + footer (otherwise 'remaining' below would
+    // wrap around) and must fit into the compressed-block buffer
+    const std::size_t headerLength = static_cast<std::size_t>(Constants::BGZF_BLOCK_HEADER_LENGTH);
+    const std::size_t footerLength = static_cast<std::size_t>(Constants::BGZF_BLOCK_FOOTER_LENGTH);
+    const std::size_t maxBlockLength = static_cast<std::size_t>(Constants::BGZF_MAX_BLOCK_SIZE);
+    if (blockLength < headerLength + footerLength || blockLength > maxBlockLength)
+        throw BamException("BgzfStream::ReadBlock", "invalid block length");
+
+    // copy header contents to compressed buffer
+    memcpy(m_compressedBlock.Buffer, header, Constants::BGZF_BLOCK_HEADER_LENGTH);
+
+    // read remainder of block
+    const std::size_t remaining = blockLength - headerLength;
+    numBytesRead =
+        m_device->Read(&m_compressedBlock.Buffer[Constants::BGZF_BLOCK_HEADER_LENGTH], remaining);
+
+    // check for device error
+    if (numBytesRead < 0) {
+        const std::string message = std::string("device error: ") + m_device->GetErrorString();
+        throw BamException("BgzfStream::ReadBlock", message);
+    }
+
+    // check that we read in expected numBytes
+    if (numBytesRead != static_cast<int64_t>(remaining))
+        throw BamException("BgzfStream::ReadBlock", "could not read data from block");
+
+    // decompress block data
+    const std::size_t newBlockLength = InflateBlock(blockLength);
+
+    // update block data
+    // (Seek() sets m_blockLength to 0 and stores the requested in-block offset
+    //  in m_blockOffset, so the offset is only reset when a previous block
+    //  was actually loaded)
+    if (m_blockLength != 0) m_blockOffset = 0;
+    m_blockAddress = blockAddress;
+    m_blockLength = newBlockLength;
+}
+
+// seek to position in BGZF file
+// 'position' is a combined offset: the bits above the lowest 16 give the
+// address of the block in the underlying device, the lowest 16 bits give
+// the offset inside the uncompressed block.
+// Does nothing if the stream is not open. Throws BamException if the device
+// is not random-access or its seek fails.
+void BgzfStream::Seek(const int64_t& position)
+{
+    BT_ASSERT_X(m_device, "BgzfStream::Seek() - trying to seek on null IO device");
+
+    // nothing to do on a closed stream
+    if (!IsOpen()) return;
+
+    // split the combined position into its two parts
+    const int64_t targetAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
+    const int targetOffset = (position & 0xFFFF);
+
+    // give up if the device cannot seek, or fails to
+    if (!m_device->IsRandomAccess() || !m_device->Seek(targetAddress)) {
+        std::stringstream report;
+        report << "unable to seek to position: " << position;
+        throw BamException("BgzfStream::Seek", report.str());
+    }
+
+    // no block is loaded after a seek; remember where to resume
+    m_blockLength = 0;
+    m_blockAddress = targetAddress;
+    m_blockOffset = targetOffset;
+}
+
+// Enables (true) or disables (false) compression of written blocks.
+// DeflateBlock() uses compression level 0 when this is disabled.
+void BgzfStream::SetWriteCompressed(bool ok)
+{
+ m_isWriteCompressed = ok;
+}
+
+// get file position in BGZF file
+// Returns the block address shifted left by 16 bits, combined with the
+// lowest 16 bits of the in-block offset. Returns 0 if the stream is not open.
+int64_t BgzfStream::Tell() const
+{
+    if (!IsOpen()) return 0;
+
+    const int64_t addressBits = (m_blockAddress << 16);
+    const int32_t offsetBits = (m_blockOffset & 0xFFFF);
+    return (addressBits | offsetBits);
+}
+
+// writes the supplied data into the BGZF buffer
+//
+// Appends 'dataLength' bytes from 'data' to the uncompressed block buffer.
+// Each time the buffer reaches BGZF_DEFAULT_BLOCK_SIZE it is compressed and
+// written out through FlushBlock(); a partially filled buffer stays pending.
+// Returns the number of bytes accepted, or 0 if the stream is not open.
+// Exceptions thrown by FlushBlock() propagate to the caller.
+std::size_t BgzfStream::Write(const char* data, const std::size_t dataLength)
+{
+
+ BT_ASSERT_X(m_device, "BgzfStream::Write() - trying to write to null IO device");
+ BT_ASSERT_X((m_device->Mode() == IBamIODevice::WriteOnly),
+ "BgzfStream::Write() - trying to write to non-writable IO device");
+
+ // skip if file not open for writing
+ if (!IsOpen()) return 0;
+
+ // write blocks as needed til all data is written
+ std::size_t numBytesWritten = 0;
+ const char* input = data;
+ const std::size_t blockLength = Constants::BGZF_DEFAULT_BLOCK_SIZE;
+ while (numBytesWritten < dataLength) {
+
+ // copy data contents to uncompressed output buffer
+ // (limited by the free space left in the buffer and by the input left)
+ unsigned int copyLength =
+ std::min(blockLength - m_blockOffset, dataLength - numBytesWritten);
+ char* buffer = m_uncompressedBlock.Buffer;
+ memcpy(buffer + m_blockOffset, input, copyLength);
+
+ // update counter
+ m_blockOffset += copyLength;
+ input += copyLength;
+ numBytesWritten += copyLength;
+
+ // flush (& compress) output buffer when full
+ if (m_blockOffset == static_cast<int32_t>(blockLength)) FlushBlock();
+ }
+
+ // return actual number of bytes written
+ return numBytesWritten;
+}
diff --git a/src/api/internal/io/BgzfStream_p.h b/src/api/internal/io/BgzfStream_p.h
new file mode 100644
index 0000000..abf3290
--- /dev/null
+++ b/src/api/internal/io/BgzfStream_p.h
@@ -0,0 +1,95 @@
+// ***************************************************************************
+// BgzfStream_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 17 January 2012(DB)
+// ---------------------------------------------------------------------------
+// Based on BGZF routines developed at the Broad Institute.
+// Provides the basic functionality for reading & writing BGZF files
+// Replaces the old BGZF.* files to avoid clashing with other toolkits
+// ***************************************************************************
+
+#ifndef BGZFSTREAM_P_H
+#define BGZFSTREAM_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <cstddef>
+#include <string>
+#include "api/BamAux.h"
+#include "api/IBamIODevice.h"
+#include "api/api_global.h"
+
+namespace BamTools {
+namespace Internal {
+
+// Reads & writes BGZF data, one block at a time, on top of an IBamIODevice.
+// Most failures are reported by throwing BamException.
+class BgzfStream
+{
+
+ // constructor & destructor
+public:
+ BgzfStream();
+ ~BgzfStream();
+
+ // main interface methods
+public:
+ // closes BGZF file
+ void Close();
+ // returns true if BgzfStream open for IO
+ bool IsOpen() const;
+ // opens the BGZF file
+ void Open(const std::string& filename, const IBamIODevice::OpenMode mode);
+ // reads BGZF data into a byte buffer
+ std::size_t Read(char* data, const std::size_t dataLength);
+ // seek to position in BGZF file
+ void Seek(const int64_t& position);
+ // sets IO device (closes previous, if any, but does not attempt to open)
+ // NOTE(review): no definition of this method is visible in
+ // BgzfStream_p.cpp - confirm it is defined elsewhere
+ void SetIODevice(IBamIODevice* device);
+ // enable/disable compressed output
+ void SetWriteCompressed(bool ok);
+ // get file position in BGZF file
+ int64_t Tell() const;
+ // writes the supplied data into the BGZF buffer
+ std::size_t Write(const char* data, const std::size_t dataLength);
+
+ // internal methods
+private:
+ // compresses the current block
+ std::size_t DeflateBlock(int32_t blockLength);
+ // flushes the data in the BGZF block
+ void FlushBlock();
+ // de-compresses the current block
+ std::size_t InflateBlock(const std::size_t& blockLength);
+ // reads a BGZF block
+ void ReadBlock();
+
+ // static 'utility' methods
+public:
+ // checks BGZF block header
+ static bool CheckBlockHeader(char* header);
+
+ // data members
+ // NOTE(review): these are public; m_device is deleted in Close(), so a
+ // copied BgzfStream would delete the same device twice - confirm that
+ // instances are never copied
+public:
+ // number of uncompressed bytes in the current block (0 = none loaded)
+ int32_t m_blockLength;
+ // read/write offset inside the uncompressed block
+ int32_t m_blockOffset;
+ // address of the current block in the underlying device
+ int64_t m_blockAddress;
+
+ // false = blocks are written with compression level 0
+ bool m_isWriteCompressed;
+ // IO device; deleted in Close()
+ IBamIODevice* m_device;
+
+ // block buffers (uncompressed & compressed form of the current block)
+ RaiiBuffer m_uncompressedBlock;
+ RaiiBuffer m_compressedBlock;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BGZFSTREAM_P_H
diff --git a/src/api/internal/io/ByteArray_p.cpp b/src/api/internal/io/ByteArray_p.cpp
new file mode 100644
index 0000000..c8d3e8f
--- /dev/null
+++ b/src/api/internal/io/ByteArray_p.cpp
@@ -0,0 +1,120 @@
+// ***************************************************************************
+// ByteArray_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides a dynamic, variable-length byte buffer
+// ***************************************************************************
+
+#include "api/internal/io/ByteArray_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+
+// --------------------------
+// ByteArray implementation
+// --------------------------
+
+// Constructs an empty byte array.
+ByteArray::ByteArray()
+ : m_data()
+{}
+
+// Constructs a byte array holding a copy of the characters in 'value'.
+ByteArray::ByteArray(const std::string& value)
+ : m_data(value.begin(), value.end())
+{}
+
+// Constructs a byte array holding a copy of the bytes in 'value'.
+ByteArray::ByteArray(const std::vector<char>& value)
+ : m_data(value)
+{}
+
+// Constructs a byte array holding a copy of the 'n' bytes starting at 'value'.
+// The bytes are copied straight into the vector; embedded '\0' bytes are
+// preserved, exactly as with the (pointer, length) std::string constructor.
+ByteArray::ByteArray(const char* value, std::size_t n)
+    : m_data(value, value + n)
+{}
+
+// Copy constructor: duplicates the bytes held by 'other'.
+ByteArray::ByteArray(const ByteArray& other)
+ : m_data(other.m_data)
+{}
+
+// Destructor: nothing to do explicitly, the vector member releases its own storage.
+ByteArray::~ByteArray() {}
+
+// Copy assignment: replaces the contents with a copy of the bytes in 'other'.
+// Returns a reference to this array.
+ByteArray& ByteArray::operator=(const ByteArray& other)
+{
+ m_data = other.m_data;
+ return *this;
+}
+
+// Removes all bytes; Size() is 0 afterwards.
+void ByteArray::Clear()
+{
+ m_data.clear();
+}
+
+// Returns a read-only pointer to the first byte, or a null pointer if the
+// array is empty (indexing element 0 of an empty vector is undefined).
+// The pointer is invalidated by any operation that resizes the array.
+const char* ByteArray::ConstData() const
+{
+    return m_data.empty() ? 0 : &m_data[0];
+}
+
+// Returns a writable pointer to the first byte, or a null pointer if the
+// array is empty (indexing element 0 of an empty vector is undefined).
+// The pointer is invalidated by any operation that resizes the array.
+char* ByteArray::Data()
+{
+    return m_data.empty() ? 0 : &m_data[0];
+}
+
+// Read-only access to the byte at index 'i'. No bounds checking is done.
+const char& ByteArray::operator[](std::size_t i) const
+{
+ return m_data[i];
+}
+
+// Writable access to the byte at index 'i'. No bounds checking is done.
+char& ByteArray::operator[](std::size_t i)
+{
+ return m_data[i];
+}
+
+// Returns the index of the first occurrence of 'c' in the range [from, to).
+// A 'to' of 0 means "search through the end of the array".
+// Returns Size() when 'c' is not found. Element access is bounds-checked,
+// so a 'to' beyond the array size can raise std::out_of_range.
+std::size_t ByteArray::IndexOf(const char c, const std::size_t from, const std::size_t to) const
+{
+    // resolve the end of the search range
+    std::size_t stop = to;
+    if (stop == 0) stop = m_data.size();
+
+    // linear scan
+    std::size_t pos = from;
+    while (pos < stop) {
+        if (m_data.at(pos) == c) return pos;
+        ++pos;
+    }
+
+    // not found
+    return m_data.size();
+}
+
+// Removes 'n' bytes starting at index 'from' and returns a reference to
+// this array.
+//  - 'from' at or beyond the end: nothing is changed
+//  - range reaching (or exceeding) the end: the array is truncated at 'from'
+//  - otherwise: the tail is shifted down over the removed range
+ByteArray& ByteArray::Remove(std::size_t from, std::size_t n)
+{
+
+    // if 'from' outside range, just return
+    const std::size_t originalSize = m_data.size();
+    if (from >= originalSize) return *this;
+
+    // number of bytes from 'from' through the end (at least 1 here);
+    // comparing 'n' against it avoids computing 'from + n', which can
+    // wrap around for very large 'n'
+    const std::size_t tailLength = originalSize - from;
+
+    // if asked to clip from 'from' to end (or beyond), simply resize
+    if (n >= tailLength) {
+        Resize(from);
+        return *this;
+    }
+
+    // otherwise, shift data & resize
+    memmove(&m_data[from], &m_data[from + n], tailLength - n);
+    Resize(originalSize - n);
+
+    // return reference to modified byte array
+    return *this;
+}
+
+// Resizes the array to 'n' bytes; any newly added bytes are set to 0.
+void ByteArray::Resize(std::size_t n)
+{
+ m_data.resize(n, 0);
+}
+
+// Returns the number of bytes currently stored.
+std::size_t ByteArray::Size() const
+{
+ return m_data.size();
+}
+
+// Replaces the internal vector with a fresh copy of itself. The contents are
+// unchanged; the copy typically carries no spare capacity, which is the
+// point of the exercise.
+void ByteArray::Squeeze()
+{
+    std::vector<char>(m_data).swap(m_data);
+}
diff --git a/src/api/internal/io/ByteArray_p.h b/src/api/internal/io/ByteArray_p.h
new file mode 100644
index 0000000..9f0f527
--- /dev/null
+++ b/src/api/internal/io/ByteArray_p.h
@@ -0,0 +1,70 @@
+// ***************************************************************************
+// ByteArray_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides a dynamic, variable-length byte buffer
+// ***************************************************************************
+
+#ifndef BYTEARRAY_P_H
+#define BYTEARRAY_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <cstddef>
+#include <string>
+#include <vector>
+#include "api/api_global.h"
+
+namespace BamTools {
+namespace Internal {
+
+// provides a wrapper around a byte vector
+// (a resizable buffer of 'char' with a few search & edit helpers)
+class ByteArray
+{
+
+ // ctors & dtor
+public:
+ // empty array
+ ByteArray();
+ // copies of the given bytes
+ ByteArray(const std::string& value);
+ ByteArray(const std::vector<char>& value);
+ // copy of the 'n' bytes starting at 'value'
+ ByteArray(const char* value, std::size_t n);
+ ByteArray(const ByteArray& other);
+ ~ByteArray();
+
+ ByteArray& operator=(const ByteArray& other);
+
+ // ByteArray interface
+public:
+ // data access
+ // (pointer to the first byte / unchecked access to the byte at index 'i')
+ const char* ConstData() const;
+ char* Data();
+ const char& operator[](std::size_t i) const;
+ char& operator[](std::size_t i);
+
+ // byte array manipulation
+ void Clear();
+ // index of first 'c' in [from, to); 'to' == 0 means "to the end";
+ // returns Size() if not found
+ std::size_t IndexOf(const char c, const std::size_t from = 0, const std::size_t to = 0) const;
+ // removes 'n' bytes starting at 'from'; returns *this
+ ByteArray& Remove(std::size_t from, std::size_t n);
+ // new bytes (if any) are zero-filled
+ void Resize(std::size_t n);
+ std::size_t Size() const;
+ // rebuilds the internal vector from a copy of itself
+ void Squeeze();
+
+ // data members
+private:
+ std::vector<char> m_data;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BYTEARRAY_P_H
diff --git a/src/api/internal/io/CMakeLists.txt b/src/api/internal/io/CMakeLists.txt
new file mode 100644
index 0000000..28153d5
--- /dev/null
+++ b/src/api/internal/io/CMakeLists.txt
@@ -0,0 +1,48 @@
+# ==========================
+# BamTools CMakeLists.txt
+# (c) 2011 Derek Barnett
+#
+# src/api/internal/io
+# ==========================
+
+set( InternalIODir "${InternalDir}/io" )
+
+#--------------------------
+# platform-independent IO
+#--------------------------
+set( CommonIOSources
+ ${InternalIODir}/BamDeviceFactory_p.cpp
+ ${InternalIODir}/BamFile_p.cpp
+ ${InternalIODir}/BamFtp_p.cpp
+ ${InternalIODir}/BamHttp_p.cpp
+ ${InternalIODir}/BamPipe_p.cpp
+ ${InternalIODir}/BgzfStream_p.cpp
+ ${InternalIODir}/ByteArray_p.cpp
+ ${InternalIODir}/HostAddress_p.cpp
+ ${InternalIODir}/HostInfo_p.cpp
+ ${InternalIODir}/HttpHeader_p.cpp
+ ${InternalIODir}/ILocalIODevice_p.cpp
+ ${InternalIODir}/RollingBuffer_p.cpp
+ ${InternalIODir}/TcpSocket_p.cpp
+ ${InternalIODir}/TcpSocketEngine_p.cpp
+)
+
+#------------------------
+# platform-dependent IO
+#------------------------
+if( WIN32 )
+ set( PlatformIOSources ${InternalIODir}/TcpSocketEngine_win_p.cpp )
+else()
+ set( PlatformIOSources ${InternalIODir}/TcpSocketEngine_unix_p.cpp )
+endif()
+
+#---------------------------
+# make build-specific list
+#---------------------------
+set( InternalIOSources
+ ${CommonIOSources}
+ ${PlatformIOSources}
+
+ PARENT_SCOPE # <-- leave this last
+)
+
diff --git a/src/api/internal/io/HostAddress_p.cpp b/src/api/internal/io/HostAddress_p.cpp
new file mode 100644
index 0000000..3a3f43e
--- /dev/null
+++ b/src/api/internal/io/HostAddress_p.cpp
@@ -0,0 +1,393 @@
+// ***************************************************************************
+// HostAddress_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 8 December 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides a generic IP address container
+// ***************************************************************************
+
+#include "api/internal/io/HostAddress_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cctype>
+#include <cstddef>
+#include <cstdlib>
+#include <sstream>
+#include <vector>
+
+// ------------------------
+// static utility methods
+// ------------------------
+
+namespace BamTools {
+namespace Internal {
+
+// break @source apart at every @delim character and return the pieces in order
+// (extraction is done with std::getline, so a trailing delimiter adds no empty piece)
+static inline std::vector<std::string> Split(const std::string& source, char delim)
+{
+    std::vector<std::string> pieces;
+    std::stringstream input(source);
+    for (std::string piece; std::getline(input, piece, delim);)
+        pieces.push_back(piece);
+    return pieces;
+}
+
+// count how many times @pattern appears in @source
+// (the search resumes one character after each hit, so overlapping hits are counted)
+static inline uint8_t CountHits(const std::string& source, const std::string& pattern)
+{
+    uint8_t hits(0);
+    for (std::size_t pos = source.find(pattern); pos != std::string::npos;
+         pos = source.find(pattern, pos + 1)) {
+        ++hits;
+    }
+    return hits;
+}
+
+// parse dotted-decimal IPv4 text ("a.b.c.d") into a 32-bit value
+//
+// @address   text to parse
+// @maybeIp4  receives the address, first field in the most-significant byte;
+//            only written on success
+//
+// returns true iff @address consists of exactly four '.'-separated fields, each made of
+// 1-3 decimal digits with a value in [0,255]
+static bool ParseIp4(const std::string& address, uint32_t& maybeIp4)
+{
+    // Split() silently drops an empty field after a trailing '.', so catch "1.2.3.4." here
+    if (address.empty() || address[address.size() - 1] == '.') return false;
+
+    // split IP address into string fields
+    const std::vector<std::string> addressFields = Split(address, '.');
+    if (addressFields.size() != 4) return false;
+
+    // convert each field to integer value
+    uint32_t ipv4(0);
+    for (std::size_t i = 0; i < 4; ++i) {
+        const std::string& field = addressFields[i];
+
+        // reject empty fields ("1..2.3") and over-long ones; the length cap also
+        // guarantees atoi() below never sees a value it cannot represent
+        const std::size_t fieldSize = field.size();
+        if (fieldSize == 0 || fieldSize > 3) return false;
+
+        // digits only (cast first: passing a negative char to isdigit() is undefined)
+        for (std::size_t j = 0; j < fieldSize; ++j) {
+            if (!isdigit(static_cast<unsigned char>(field[j]))) return false;
+        }
+
+        const int value = std::atoi(field.c_str());
+        if (value > 255) return false;
+
+        // append byte value
+        ipv4 = (ipv4 << 8) | static_cast<uint32_t>(value);
+    }
+
+    // store 32-bit IP address & return success
+    maybeIp4 = ipv4;
+    return true;
+}
+
+// parse one side of an IPv6 address (the text before or after the "::", or the whole
+// address when there is no "::") into bytes, most-significant byte of each group first
+//
+// @part          ':'-separated fields; an empty string is valid and yields no bytes
+// @allowIp4Tail  whether the final field may be dotted-decimal IPv4 (mixed notation)
+// @bytes         output buffer with room for 16 bytes
+// @numBytes      receives the number of bytes written
+//
+// returns false on any empty field, non-hex character, field longer than 4 hex digits,
+// misplaced/invalid IPv4 field, or if more than 16 bytes would be produced
+static bool ParseIp6Fields(const std::string& part, const bool allowIp4Tail, uint8_t* bytes,
+                           std::size_t& numBytes)
+{
+    numBytes = 0;
+    if (part.empty()) return true;
+
+    std::size_t fieldBegin = 0;
+    for (;;) {
+
+        // carve out next field
+        const std::size_t colonFound = part.find(':', fieldBegin);
+        const bool isLastField = (colonFound == std::string::npos);
+        const std::string field =
+            isLastField ? part.substr(fieldBegin) : part.substr(fieldBegin, colonFound - fieldBegin);
+
+        // a stray ':' at either end of @part shows up as an empty field
+        if (field.empty()) return false;
+
+        // mixed IPv4/6 notation - IPv4 section must be the very last field
+        if (field.find('.') != std::string::npos) {
+            if (!allowIp4Tail || !isLastField) return false;
+            if (numBytes + 4 > 16) return false;
+
+            uint32_t maybeIp4(0);
+            if (!ParseIp4(field, maybeIp4)) return false;
+
+            bytes[numBytes++] = static_cast<uint8_t>((maybeIp4 >> 24) & 0xff);
+            bytes[numBytes++] = static_cast<uint8_t>((maybeIp4 >> 16) & 0xff);
+            bytes[numBytes++] = static_cast<uint8_t>((maybeIp4 >> 8) & 0xff);
+            bytes[numBytes++] = static_cast<uint8_t>(maybeIp4 & 0xff);
+        }
+
+        // plain 16-bit group: 1-4 hex digits
+        else {
+            if (field.size() > 4 || numBytes + 2 > 16) return false;
+            for (std::size_t j = 0; j < field.size(); ++j) {
+                if (!isxdigit(static_cast<unsigned char>(field[j]))) return false;
+            }
+
+            const unsigned long value = strtoul(field.c_str(), 0, 16);
+            bytes[numBytes++] = static_cast<uint8_t>((value >> 8) & 0xff);
+            bytes[numBytes++] = static_cast<uint8_t>(value & 0xff);
+        }
+
+        if (isLastField) return true;
+        fieldBegin = colonFound + 1;
+    }
+}
+
+// parse IPv6 text into 16 bytes
+//
+// accepts full notation ("1:2:3:4:5:6:7:8"), a single "::" standing in for one or more
+// all-zero groups ("::1", "fe80::1", "::"), and mixed notation whose last 32 bits are
+// written as dotted-decimal IPv4 ("::ffff:1.2.3.4"); anything after a '%' (link-local
+// zone index) is ignored
+//
+// @address   text to parse
+// @maybeIp6  caller-provided buffer of at least 16 bytes; all 16 are written on success,
+//            none on failure
+//
+// returns true iff @address is well-formed
+static bool ParseIp6(const std::string& address, uint8_t* maybeIp6)
+{
+
+    // look for '%' char (if found, lop off that part of address)
+    // we're going to ignore any link-local zone index, for now at least
+    // (substr() keeps the whole string when rfind() reports 'not found')
+    const std::string tmp = address.substr(0, address.rfind('%'));
+
+    // at most one '::' separator is allowed
+    const std::size_t gapFound = tmp.find("::");
+    const bool hasGap = (gapFound != std::string::npos);
+    if (hasGap && tmp.find("::", gapFound + 1) != std::string::npos) return false;
+
+    // fields before and after the '::' (everything counts as 'before' if there is none)
+    const std::string head = hasGap ? tmp.substr(0, gapFound) : tmp;
+    const std::string tail = hasGap ? tmp.substr(gapFound + 2) : std::string();
+
+    // parse both sides into scratch buffers, so @maybeIp6 is untouched on failure
+    // (an IPv4 section may only end the address, so 'head' can carry one only without '::')
+    uint8_t headBytes[16];
+    uint8_t tailBytes[16];
+    std::size_t headSize = 0;
+    std::size_t tailSize = 0;
+    if (!ParseIp6Fields(head, !hasGap, headBytes, headSize)) return false;
+    if (!ParseIp6Fields(tail, true, tailBytes, tailSize)) return false;
+
+    // without '::' all 16 bytes must be spelled out;
+    // with '::' at least one 16-bit group must be left for it to represent
+    const std::size_t used = headSize + tailSize;
+    if (hasGap ? (used > 14) : (used != 16)) return false;
+
+    // assemble: head bytes at the front, tail bytes at the back, 0s in between
+    for (std::size_t i = 0; i < 16; ++i)
+        maybeIp6[i] = 0;
+    for (std::size_t i = 0; i < headSize; ++i)
+        maybeIp6[i] = headBytes[i];
+    for (std::size_t i = 0; i < tailSize; ++i)
+        maybeIp6[16 - tailSize + i] = tailBytes[i];
+
+    // should have parsed OK, return success
+    return true;
+}
+
+} // namespace Internal
+} // namespace BamTools
+
+// ----------------------------
+// HostAddress implementation
+// ----------------------------
+
+// null address: unknown protocol, zeroed IPv4 value (m_ip6Address default-constructs to 0s)
+HostAddress::HostAddress()
+    : m_protocol(HostAddress::UnknownNetworkProtocol)
+    , m_ip4Address(0)
+    , m_hasIpAddress(true)
+{}
+
+// IPv4 address from its 32-bit value
+HostAddress::HostAddress(const uint32_t ip4Address)
+    : m_protocol(HostAddress::IPv4Protocol)
+    , m_ip4Address(ip4Address)
+    , m_hasIpAddress(true)
+{}
+
+// IPv6 address from raw bytes; reads ip6Address[0..15]
+HostAddress::HostAddress(const uint8_t* ip6Address)
+    : m_protocol(HostAddress::IPv6Protocol)
+    , m_ip4Address(0)
+    , m_hasIpAddress(true)
+{
+    for (uint8_t i = 0; i < 16; ++i)
+        m_ip6Address[i] = ip6Address[i];
+}
+
+// IPv6 address from an IPv6Address container
+HostAddress::HostAddress(const IPv6Address& ip6Address)
+    : m_protocol(HostAddress::IPv6Protocol)
+    , m_ip4Address(0)
+    , m_ip6Address(ip6Address)
+    , m_hasIpAddress(true)
+{}
+
+// address from text; SetAddress() decides protocol and sets m_hasIpAddress
+HostAddress::HostAddress(const std::string& address)
+    : m_protocol(HostAddress::UnknownNetworkProtocol)
+    , m_ip4Address(0)
+{
+    SetAddress(address);
+}
+
+// member-wise copy
+HostAddress::HostAddress(const HostAddress& other)
+    : m_protocol(other.m_protocol)
+    , m_ip4Address(other.m_ip4Address)
+    , m_ip6Address(other.m_ip6Address)
+    , m_ipString(other.m_ipString)
+    , m_hasIpAddress(other.m_hasIpAddress)
+{}
+
+HostAddress::~HostAddress() {}
+
+// equal when both sides use the same protocol and, for IPv4/IPv6, hold the same address
+// value; the stored text (m_ipString) takes no part in the comparison
+bool HostAddress::operator==(const HostAddress& other) const
+{
+    // differing protocols can never match
+    if (m_protocol != other.m_protocol) return false;
+
+    switch (m_protocol) {
+        case HostAddress::IPv4Protocol:
+            return m_ip4Address == other.m_ip4Address;
+        case HostAddress::IPv6Protocol:
+            return memcmp(&m_ip6Address, &other.m_ip6Address, sizeof(IPv6Address)) == 0;
+        default:
+            // no address to compare, and the protocols already agree
+            return true;
+    }
+}
+
+// ordering: first by protocol enum value, then (within IPv4 / IPv6) by address value
+bool HostAddress::operator<(const HostAddress& other) const
+{
+    // different protocols - compare protocol types
+    if (m_protocol != other.m_protocol) return m_protocol < other.m_protocol;
+
+    // both IPv4 - compare 32-bit values
+    if (m_protocol == HostAddress::IPv4Protocol) return m_ip4Address < other.m_ip4Address;
+
+    // both IPv6 - compare bytes, front to back
+    if (m_protocol == HostAddress::IPv6Protocol)
+        return (memcmp(&m_ip6Address, &other.m_ip6Address, sizeof(IPv6Address)) < 0);
+
+    // same non-IP protocol on both sides - neither sorts first
+    return false;
+}
+
+// reset to the same state a default-constructed HostAddress has
+void HostAddress::Clear()
+{
+    m_ipString.clear();
+    m_ip6Address = IPv6Address();  // default-constructed container is all 0 bytes
+    m_ip4Address = 0;
+    m_protocol = HostAddress::UnknownNetworkProtocol;
+
+    // this may feel funny, but cleared IP (equivalent to '0.0.0.0') is technically valid
+    // and that's not really what this flag is checking anyway
+    //
+    // this flag is false *iff* the string passed in is a 'plain-text' hostname (www.foo.bar)
+    m_hasIpAddress = true;
+}
+
+// returns m_hasIpAddress: false only after SetAddress(const std::string&) was handed
+// text that could not be parsed as an IPv4/IPv6 address
+bool HostAddress::HasIPAddress() const
+{
+ return m_hasIpAddress;
+}
+
+// true when no protocol is set (m_protocol is UnknownNetworkProtocol)
+bool HostAddress::IsNull() const
+{
+ return m_protocol == HostAddress::UnknownNetworkProtocol;
+}
+
+// returns the stored 32-bit IPv4 value (no check that the protocol is IPv4)
+uint32_t HostAddress::GetIPv4Address() const
+{
+ return m_ip4Address;
+}
+
+// returns a copy of the stored IPv6 bytes (no check that the protocol is IPv6)
+IPv6Address HostAddress::GetIPv6Address() const
+{
+ return m_ip6Address;
+}
+
+// format the stored address as text
+//   IPv4: four decimal fields joined by '.'
+//   IPv6: eight hex groups joined by ':' (no zero padding, no '::' compression)
+//   unknown protocol: empty string
+std::string HostAddress::GetIPString() const
+{
+    std::stringstream ss;
+
+    switch (m_protocol) {
+
+        // IPv4 format - most-significant byte first
+        case HostAddress::IPv4Protocol:
+            for (int shift = 24; shift >= 0; shift -= 8) {
+                if (shift != 24) ss << '.';
+                ss << ((m_ip4Address >> shift) & 0xff);
+            }
+            break;
+
+        // IPv6 format - each group built from two consecutive bytes
+        case HostAddress::IPv6Protocol:
+            ss << std::hex;
+            for (uint8_t group = 0; group < 8; ++group) {
+                if (group != 0) ss << ':';
+                const uint16_t highByte = m_ip6Address[2 * group];
+                const uint16_t lowByte = m_ip6Address[2 * group + 1];
+                ss << ((highByte << 8) | lowByte);
+            }
+            break;
+
+        // unknown protocol - leave stream empty
+        default:
+            break;
+    }
+
+    return ss.str();
+}
+
+// returns the protocol currently associated with this address
+// (UnknownNetworkProtocol for a null/cleared address or an unparsed host name)
+HostAddress::NetworkProtocol HostAddress::GetProtocol() const
+{
+ return m_protocol;
+}
+
+// try to interpret m_ipString as an IP address: IPv6 first (when it contains ':'),
+// then IPv4 (when it contains '.')
+//
+// on success the parsed address & protocol are stored and true is returned;
+// otherwise the protocol is set to UnknownNetworkProtocol and false is returned
+bool HostAddress::ParseAddress()
+{
+    const std::string& text = m_ipString;
+
+    // all IPv6 addresses should have a ':'
+    if (text.find(':') != std::string::npos) {
+        uint8_t ip6Bytes[16];
+        if (ParseIp6(text, ip6Bytes)) {
+            SetAddress(ip6Bytes);
+            m_protocol = HostAddress::IPv6Protocol;
+            return true;
+        }
+    }
+
+    // all IPv4 addresses should have a '.'
+    if (text.find('.') != std::string::npos) {
+        uint32_t ip4Value(0);
+        if (ParseIp4(text, ip4Value)) {
+            SetAddress(ip4Value);
+            m_protocol = HostAddress::IPv4Protocol;
+            return true;
+        }
+    }
+
+    // else likely just a plain-text host name "www.foo.bar"
+    // will need to look up IP address info later
+    m_protocol = HostAddress::UnknownNetworkProtocol;
+    return false;
+}
+
+// become an IPv4 address with the given 32-bit value
+// (stored IPv6 bytes and text are left as they were)
+void HostAddress::SetAddress(const uint32_t ip4Address)
+{
+    m_protocol = HostAddress::IPv4Protocol;
+    m_hasIpAddress = true;
+    m_ip4Address = ip4Address;
+}
+
+// become an IPv6 address, copying the 16 bytes at ip6Address[0..15]
+//
+// the IPv4 value is reset to 0, matching SetAddress(const IPv6Address&), so that
+// GetIPv4Address() never reports a value left over from an earlier IPv4 assignment
+void HostAddress::SetAddress(const uint8_t* ip6Address)
+{
+    for (uint8_t i = 0; i < 16; ++i)
+        m_ip6Address[i] = ip6Address[i];
+    m_ip4Address = 0;
+    m_protocol = HostAddress::IPv6Protocol;
+    m_hasIpAddress = true;
+}
+
+// become an IPv6 address holding a copy of @ip6Address; the IPv4 value is reset to 0
+void HostAddress::SetAddress(const IPv6Address& ip6Address)
+{
+    m_protocol = HostAddress::IPv6Protocol;
+    m_hasIpAddress = true;
+    m_ip4Address = 0;
+    m_ip6Address = ip6Address;
+}
+
+// store @address as text, then try to parse it as an IPv4/IPv6 address;
+// m_hasIpAddress records whether that parse succeeded
+void HostAddress::SetAddress(const std::string& address)
+{
+ // must be assigned first - ParseAddress() reads m_ipString
+ m_ipString = address;
+ m_hasIpAddress = ParseAddress();
+}
diff --git a/src/api/internal/io/HostAddress_p.h b/src/api/internal/io/HostAddress_p.h
new file mode 100644
index 0000000..c330200
--- /dev/null
+++ b/src/api/internal/io/HostAddress_p.h
@@ -0,0 +1,117 @@
+// ***************************************************************************
+// HostAddress_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides a generic IP address container
+// ***************************************************************************
+
+#ifndef HOSTADDRESS_P_H
+#define HOSTADDRESS_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <cstddef>
+#include <cstring>
+#include <string>
+#include "api/api_global.h"
+
+namespace BamTools {
+namespace Internal {
+
+// plain 16-byte container for an IPv6 address
+struct IPv6Address
+{
+
+    // data - the raw address bytes
+    uint8_t data[16];
+
+    // ctor - every byte starts out as 0
+    inline IPv6Address()
+    {
+        memset(data, 0, sizeof(data));
+    }
+
+    // data access (no bounds checking) - mutable reference
+    inline uint8_t& operator[](std::size_t index)
+    {
+        return data[index];
+    }
+
+    // data access (no bounds checking) - value copy
+    inline uint8_t operator[](std::size_t index) const
+    {
+        return data[index];
+    }
+};
+
+// Holds one network address: an IPv4 value, an IPv6 value, or (when text could not be
+// parsed as either) just the original string with an 'unknown' protocol.
+class HostAddress
+{
+
+ // enums
+public:
+ enum NetworkProtocol
+ {
+ UnknownNetworkProtocol = -1,
+ IPv4Protocol = 0,
+ IPv6Protocol
+ };
+
+ // ctors & dtor
+public:
+ HostAddress();
+ explicit HostAddress(const uint32_t ip4Address);
+ explicit HostAddress(const uint8_t* ip6Address);
+ explicit HostAddress(const IPv6Address& ip6Address);
+ explicit HostAddress(const std::string& address);
+ HostAddress(const HostAddress& other);
+ ~HostAddress();
+
+ // HostAddress interface
+public:
+ void Clear();
+ bool HasIPAddress() const; // returns whether string address could be converted to IP address
+ bool IsNull() const;
+
+ // getters return the stored value as-is; they do not check the current protocol
+ uint32_t GetIPv4Address() const;
+ IPv6Address GetIPv6Address() const;
+ std::string GetIPString() const;
+ HostAddress::NetworkProtocol GetProtocol() const;
+
+ // the numeric overloads set the protocol directly; the string overload parses the text
+ void SetAddress(const uint32_t ip4Address);
+ void SetAddress(const uint8_t* ip6Address);
+ void SetAddress(const IPv6Address& ip6Address);
+ void SetAddress(const std::string& address);
+
+ // HostAddress comparison operators
+public:
+ bool operator==(const HostAddress& other) const;
+ bool operator!=(const HostAddress& other) const
+ {
+ return !(operator==(other));
+ }
+ bool operator<(const HostAddress& other) const;
+
+ // internal methods
+private:
+ bool ParseAddress();
+
+ // data members
+ // (constructor initializer lists follow this declaration order)
+private:
+ HostAddress::NetworkProtocol m_protocol;
+ uint32_t m_ip4Address;
+ IPv6Address m_ip6Address;
+ std::string m_ipString;
+ bool m_hasIpAddress; // true until string passed in, then signifies whether string was an IP
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // HOSTADDRESS_P_H
diff --git a/src/api/internal/io/HostInfo_p.cpp b/src/api/internal/io/HostInfo_p.cpp
new file mode 100644
index 0000000..56b5165
--- /dev/null
+++ b/src/api/internal/io/HostInfo_p.cpp
@@ -0,0 +1,229 @@
+// ***************************************************************************
+// HostInfo_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 8 December 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides DNS lookup functionality for hostname & its discovered addresses
+// ***************************************************************************
+
+#include "api/internal/io/HostInfo_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+// platform-specifics
+#ifdef _WIN32
+#include "api/internal/io/NetWin_p.h"
+#else
+#include "api/internal/io/NetUnix_p.h"
+#endif
+
+// standard C++ includes
+#include <cstdlib>
+#include <cstring>
+#include <set>
+
+// -------------------------
+// HostInfo implementation
+// -------------------------
+
+// empty result: no host name, no addresses, no error
+HostInfo::HostInfo()
+ : m_error(HostInfo::NoError)
+{}
+
+// member-wise copy
+HostInfo::HostInfo(const HostInfo& other)
+ : m_hostName(other.m_hostName)
+ , m_addresses(other.m_addresses)
+ , m_error(other.m_error)
+ , m_errorString(other.m_errorString)
+{}
+
+HostInfo::~HostInfo() {}
+
+// returns a copy of the stored address list
+std::vector<HostAddress> HostInfo::Addresses() const
+{
+ return m_addresses;
+}
+
+// returns the stored error code (NoError unless SetError() was called)
+HostInfo::ErrorType HostInfo::GetError() const
+{
+ return m_error;
+}
+
+// returns the stored human-readable error description
+std::string HostInfo::GetErrorString() const
+{
+ return m_errorString;
+}
+
+// returns the stored host name
+std::string HostInfo::HostName() const
+{
+ return m_hostName;
+}
+
+// replaces the stored address list with a copy of @addresses
+void HostInfo::SetAddresses(const std::vector<HostAddress>& addresses)
+{
+ m_addresses = addresses;
+}
+
+// stores the error code
+void HostInfo::SetError(const HostInfo::ErrorType error)
+{
+ m_error = error;
+}
+
+// stores the human-readable error description
+void HostInfo::SetErrorString(const std::string& errorString)
+{
+ m_errorString = errorString;
+}
+
+// stores the host name
+void HostInfo::SetHostName(const std::string& name)
+{
+ m_hostName = name;
+}
+
+// ---------------------------------
+// HostInfo::Lookup(host, port)
+// - the real "heavy-lifter" here
+// ---------------------------------
+
+// Resolves @hostname into a HostInfo (host name + list of unique addresses).
+//
+// @hostname  literal IPv4/IPv6 address text, or a domain name
+// @port      port/service string: handed to getaddrinfo() for domain names, converted
+//            with atoi() for the reverse lookup of a literal address
+//
+// Lookup failures are reported through GetError() / GetErrorString() on the result.
+HostInfo HostInfo::Lookup(const std::string& hostname, const std::string& port)
+{
+
+    HostInfo result;
+    result.SetHostName(hostname);
+    std::set<HostAddress> uniqueAddresses;
+
+#ifdef _WIN32
+    WindowsSockInit init;
+#endif
+
+    HostAddress address;
+    address.SetAddress(hostname);
+
+    // if hostname is an IP string ('0.0.0.0' or IPv6 format)
+    // do reverse lookup for host domain name
+    //
+    // TODO: might just remove this... not sure if proper 'hostname' from IP string is needed
+    //
+    //       so far, haven't been able to successfully fetch a domain name with reverse DNS
+    //       getnameinfo() on test sites just returns original IP string. BUT this is likely a rare
+    //       case that client code tries to use an IP string and the connection should work fine
+    //       anyway. GetHostName() just won't quite show what I was hoping for. :(
+    if (address.HasIPAddress()) {
+
+        const uint16_t portNum = static_cast<uint16_t>(std::atoi(port.c_str()));
+
+        sockaddr_in sa4;
+        sockaddr_in6 sa6;
+        sockaddr* sa = 0;
+        BT_SOCKLEN_T saSize = 0;
+
+        // IPv4
+        if (address.GetProtocol() == HostAddress::IPv4Protocol) {
+            sa = (sockaddr*)&sa4;
+            saSize = sizeof(sa4);
+            memset(&sa4, 0, sizeof(sa4));
+            sa4.sin_family = AF_INET;
+            sa4.sin_addr.s_addr = htonl(address.GetIPv4Address());
+            sa4.sin_port = htons(portNum);
+        }
+
+        // IPv6
+        // (this branch must test for IPv6Protocol - testing IPv4Protocol again would make
+        // it unreachable and send every IPv6 literal to the assert below)
+        else if (address.GetProtocol() == HostAddress::IPv6Protocol) {
+            sa = (sockaddr*)&sa6;
+            saSize = sizeof(sa6);
+            memset(&sa6, 0, sizeof(sa6));
+            sa6.sin6_family = AF_INET6;
+            memcpy(sa6.sin6_addr.s6_addr, address.GetIPv6Address().data,
+                   sizeof(sa6.sin6_addr.s6_addr));
+            sa6.sin6_port = htons(portNum);
+        }
+
+        // unknown (should be unreachable)
+        else
+            BT_ASSERT_X(false, "HostInfo::Lookup: unknown network protocol");
+
+        // lookup name for IP
+        char hbuf[NI_MAXHOST];
+        char serv[NI_MAXSERV];
+        if (sa && (getnameinfo(sa, saSize, hbuf, sizeof(hbuf), serv, sizeof(serv), 0) == 0))
+            result.SetHostName(std::string(hbuf));
+
+        // if no domain name found, just use the original address's IP string
+        if (result.HostName().empty()) result.SetHostName(address.GetIPString());
+
+        // store address in HostInfo
+        uniqueAddresses.insert(address);
+    }
+
+    // otherwise, hostname is a domain name ('www.foo.bar')
+    // do 'normal' lookup
+    else {
+
+        // setup address lookup 'hints'
+        addrinfo hints;
+        memset(&hints, 0, sizeof(hints));
+        hints.ai_family = AF_UNSPEC;      // allow either IPv4 or IPv6
+        hints.ai_socktype = SOCK_STREAM;  // for TCP
+        hints.ai_protocol = IPPROTO_TCP;
+
+        // fetch addresses for requested hostname/port
+        addrinfo* res = NULL;
+        const int status = getaddrinfo(hostname.c_str(), port.c_str(), &hints, &res);
+
+        // if everything OK
+        if (status == 0) {
+
+            // iterate over all IP addresses found
+            addrinfo* p = res;
+            for (; p != NULL; p = p->ai_next) {
+
+                // IPv4
+                if (p->ai_family == AF_INET) {
+                    sockaddr_in* ipv4 = (sockaddr_in*)p->ai_addr;
+                    HostAddress a(ntohl(ipv4->sin_addr.s_addr));
+                    uniqueAddresses.insert(a);
+                }
+
+                // IPv6
+                else if (p->ai_family == AF_INET6) {
+                    sockaddr_in6* ipv6 = (sockaddr_in6*)p->ai_addr;
+                    HostAddress a(ipv6->sin6_addr.s6_addr);
+                    uniqueAddresses.insert(a);
+                }
+            }
+
+            // if we iterated, but no addresses were stored
+            if (uniqueAddresses.empty() && (p == NULL)) {
+                result.SetError(HostInfo::UnknownError);
+                result.SetErrorString("HostInfo: unknown address types found");
+            }
+
+            // cleanup - the list only exists (and may only be freed) after a successful call
+            freeaddrinfo(res);
+        }
+
+        // handle error cases
+        else if (
+#ifndef _WIN32
+            status == EAI_NONAME || status == EAI_FAIL
+#ifdef EAI_NODATA
+            || status == EAI_NODATA  // officially deprecated, but just in case we happen to hit it
+#endif  // EAI_NODATA
+
+#else  // _WIN32
+            WSAGetLastError() == WSAHOST_NOT_FOUND || WSAGetLastError() == WSANO_DATA ||
+            WSAGetLastError() == WSANO_RECOVERY
+#endif  // _WIN32
+        ) {
+            result.SetError(HostInfo::HostNotFound);
+            result.SetErrorString("HostInfo: host not found");
+        } else {
+            result.SetError(HostInfo::UnknownError);
+            result.SetErrorString("HostInfo: unknown error encountered");
+        }
+    }
+
+    // store fetched addresses (converting set -> vector) in result & return
+    result.SetAddresses(std::vector<HostAddress>(uniqueAddresses.begin(), uniqueAddresses.end()));
+    return result;
+}
diff --git a/src/api/internal/io/HostInfo_p.h b/src/api/internal/io/HostInfo_p.h
new file mode 100644
index 0000000..677073a
--- /dev/null
+++ b/src/api/internal/io/HostInfo_p.h
@@ -0,0 +1,78 @@
+// ***************************************************************************
+// HostInfo_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides DNS lookup functionality for hostname/IP addresses
+// ***************************************************************************
+
+#ifndef HOSTINFO_P_H
+#define HOSTINFO_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <string>
+#include <vector>
+#include "api/internal/io/HostAddress_p.h"
+
+namespace BamTools {
+namespace Internal {
+
+// Result of a host lookup: the host name, the addresses found for it, and an
+// error code/description describing how the lookup went.
+class HostInfo
+{
+
+public:
+ enum ErrorType
+ {
+ NoError = 0,
+ HostNotFound,
+ UnknownError
+ };
+
+ // ctors & dtor
+public:
+ HostInfo();
+ HostInfo(const HostInfo& other);
+ ~HostInfo();
+
+ // HostInfo interface
+public:
+ std::string HostName() const;
+ void SetHostName(const std::string& name);
+
+ std::vector<HostAddress> Addresses() const;
+ void SetAddresses(const std::vector<HostAddress>& addresses);
+
+ HostInfo::ErrorType GetError() const;
+ std::string GetErrorString() const;
+
+ // internal methods
+ // (error state is private to set; Lookup() is the code in this class that sets it)
+private:
+ void SetError(const HostInfo::ErrorType error);
+ void SetErrorString(const std::string& errorString);
+
+ // static methods
+public:
+ // performs the lookup and returns a populated HostInfo
+ static HostInfo Lookup(const std::string& hostname, const std::string& port);
+
+ // data members
+private:
+ std::string m_hostName;
+ std::vector<HostAddress> m_addresses;
+ HostInfo::ErrorType m_error;
+ std::string m_errorString;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // HOSTINFO_P_H
diff --git a/src/api/internal/io/HttpHeader_p.cpp b/src/api/internal/io/HttpHeader_p.cpp
new file mode 100644
index 0000000..bd25f2e
--- /dev/null
+++ b/src/api/internal/io/HttpHeader_p.cpp
@@ -0,0 +1,403 @@
+// ***************************************************************************
+// HttpHeader_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 13 January 2012 (DB)
+// ---------------------------------------------------------------------------
+// Provides a generic interface for parsing/generating HTTP headers, along
+// with specialized request & response header types
+// ***************************************************************************
+
+#include "api/internal/io/HttpHeader_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstddef>
+#include <cstdlib>
+#include <sstream>
+#include <vector>
+
+namespace BamTools {
+
+// -----------
+// constants
+// -----------
+
+// characters and strings used when parsing & formatting HTTP header text
+namespace Constants {
+
+// single characters
+static const char CAR_RET_CHAR = '\r';
+static const char COLON_CHAR = ':';
+static const char DOT_CHAR = '.';
+static const char NEWLINE_CHAR = '\n';
+static const char SPACE_CHAR = ' ';
+
+// multi-character tokens
+static const std::string FIELD_NEWLINE = "\r\n";
+static const std::string FIELD_SEPARATOR = ": ";
+static const std::string HTTP_STRING = "HTTP/";
+
+} // namespace Constants
+
+// ------------------------
+// static utility methods
+// ------------------------
+
+namespace Internal {
+
+// returns true for the characters Trim() should strip: the blank (' ', 32),
+// the ASCII whitespace controls '\t' '\n' '\v' '\f' '\r' (9-13), and NUL (0)
+//
+// the blank matters most: it is what separates "Key:" from its value in a header line
+static inline bool IsSpace(const char c)
+{
+    const int n = static_cast<int>(c);
+    return (n == 0 || n == 32 || (n <= 13 && n >= 9));
+}
+
+// cut @source at each occurrence of @delim and return the pieces in order
+// (extraction is done with std::getline, so a trailing delimiter adds no empty piece)
+static std::vector<std::string> Split(const std::string& source, const char delim)
+{
+    std::vector<std::string> pieces;
+    std::stringstream input(source);
+    for (std::string piece; std::getline(input, piece, delim);)
+        pieces.push_back(piece);
+    return pieces;
+}
+
+// returns a copy of @source without leading/trailing whitespace (as defined by IsSpace)
+//
+// empty or all-whitespace input yields an empty string
+static std::string Trim(const std::string& source)
+{
+    const std::size_t size = source.size();
+
+    // skip leading whitespace
+    std::size_t start = 0;
+    while (start < size && IsSpace(source[start]))
+        ++start;
+
+    // nothing left (empty or all-whitespace input)
+    if (start == size) return std::string();
+
+    // skip trailing whitespace
+    // source[start] is known to be non-space, so 'end' can never move below 'start'
+    std::size_t end = size - 1;
+    while (IsSpace(source[end]))
+        --end;
+
+    // return result
+    return source.substr(start, (end - start) + 1);
+}
+
+} // namespace Internal
+} // namespace BamTools
+
+// ---------------------------
+// HttpHeader implementation
+// ---------------------------
+
+// empty, valid header using HTTP version 1.1
+HttpHeader::HttpHeader()
+ : m_isValid(true)
+ , m_majorVersion(1)
+ , m_minorVersion(1)
+{}
+
+// header parsed from text
+// NOTE: Parse() calls the virtual ParseLine(); while this base-class constructor runs,
+// that call resolves to HttpHeader::ParseLine, never to a derived override
+HttpHeader::HttpHeader(const std::string& s)
+ : m_isValid(true)
+ , m_majorVersion(1)
+ , m_minorVersion(1)
+{
+ Parse(s);
+}
+
+HttpHeader::~HttpHeader() {}
+
+// true if a field named exactly @key is stored (std::map lookup, so case-sensitive)
+bool HttpHeader::ContainsKey(const std::string& key) const
+{
+ return (m_fields.find(key) != m_fields.end());
+}
+
+// HTTP major version (the X in "HTTP/X.Y")
+int HttpHeader::GetMajorVersion() const
+{
+ return m_majorVersion;
+}
+
+// HTTP minor version (the Y in "HTTP/X.Y")
+int HttpHeader::GetMinorVersion() const
+{
+ return m_minorVersion;
+}
+
+// returns the value stored for @key, or an empty string if there is no such field
+// (a missing key is not inserted)
+std::string HttpHeader::GetValue(const std::string& key)
+{
+    const std::map<std::string, std::string>::const_iterator fieldFound = m_fields.find(key);
+    return (fieldFound == m_fields.end()) ? std::string() : fieldFound->second;
+}
+
+// false once Parse() has hit a line ParseLine() rejected, or after SetValid(false)
+bool HttpHeader::IsValid() const
+{
+ return m_isValid;
+}
+
+// parse header text: one ParseLine() call per non-empty line, numbered from 0
+//
+// lines are separated by '\n'; a trailing '\r' on a line is dropped, and lines that are
+// empty (before or after dropping the '\r') are skipped without consuming a line number
+//
+// the header is marked invalid, and parsing stops, at the first line ParseLine() rejects
+void HttpHeader::Parse(const std::string& s)
+{
+
+    // trim whitespace from input string
+    const std::string trimmed = Trim(s);
+
+    // split into list of header lines
+    const std::vector<std::string> rawFields = Split(trimmed, Constants::NEWLINE_CHAR);
+
+    int lineNumber = 0;
+    std::vector<std::string>::const_iterator rawFieldIter = rawFields.begin();
+    std::vector<std::string>::const_iterator rawFieldEnd = rawFields.end();
+    for (; rawFieldIter != rawFieldEnd; ++rawFieldIter) {
+        std::string field = (*rawFieldIter);
+
+        // remove windows-style carriage return first, so that a line consisting of
+        // only '\r' is recognized as empty below
+        if (!field.empty() && field[field.size() - 1] == Constants::CAR_RET_CHAR)
+            field.resize(field.size() - 1);
+
+        // skip empty fields
+        if (field.empty()) continue;
+
+        // parse header line
+        if (!ParseLine(field, lineNumber)) {
+            m_isValid = false;
+            return;
+        }
+        ++lineNumber;
+    }
+}
+
+// parse one "key: value" line and store it (replacing any earlier value for the key)
+//
+// the line is cut at its first ':'; both halves are stored with surrounding whitespace
+// removed by Trim(). The line number is not used here.
+//
+// returns false if the line contains no ':'
+bool HttpHeader::ParseLine(const std::string& line, int)
+{
+    const std::size_t separator = line.find(Constants::COLON_CHAR);
+    if (separator == std::string::npos) return false;
+
+    const std::string fieldName = Trim(line.substr(0, separator));
+    const std::string fieldValue = Trim(line.substr(separator + 1));
+    m_fields[fieldName] = fieldValue;
+    return true;
+}
+
+// removes the field named @key (no effect if it is not present)
+void HttpHeader::RemoveField(const std::string& key)
+{
+ m_fields.erase(key);
+}
+
+// adds the field @key, or replaces its value if it already exists
+void HttpHeader::SetField(const std::string& key, const std::string& value)
+{
+ m_fields[key] = value;
+}
+
+// overrides the validity flag reported by IsValid()
+void HttpHeader::SetValid(bool ok)
+{
+ m_isValid = ok;
+}
+
+// sets the HTTP version numbers reported by GetMajorVersion()/GetMinorVersion()
+void HttpHeader::SetVersion(int major, int minor)
+{
+ m_majorVersion = major;
+ m_minorVersion = minor;
+}
+
+// formats all stored fields as "key: value\r\n" lines, in the map's (sorted-by-key) order
+//
+// returns an empty string if the header is not valid
+std::string HttpHeader::ToString() const
+{
+    std::string text;
+    if (!m_isValid) return text;
+
+    typedef std::map<std::string, std::string>::const_iterator FieldIterator;
+    for (FieldIterator field = m_fields.begin(); field != m_fields.end(); ++field) {
+        text += field->first;
+        text += Constants::FIELD_SEPARATOR;
+        text += field->second;
+        text += Constants::FIELD_NEWLINE;
+    }
+    return text;
+}
+
+// ----------------------------------
+// HttpRequestHeader implementation
+// ----------------------------------
+
+// request header for "<method> <resource> HTTP/<majorVersion>.<minorVersion>"
+HttpRequestHeader::HttpRequestHeader(const std::string& method, const std::string& resource,
+ int majorVersion, int minorVersion)
+ : HttpHeader()
+ , m_method(method)
+ , m_resource(resource)
+{
+ SetVersion(majorVersion, minorVersion);
+}
+
+HttpRequestHeader::~HttpRequestHeader() {}
+
+// returns the request method text as stored (e.g. the "GET" of a request line)
+std::string HttpRequestHeader::GetMethod() const
+{
+ return m_method;
+}
+
+// returns the requested resource text as stored (e.g. "/path/to/resource")
+std::string HttpRequestHeader::GetResource() const
+{
+ return m_resource;
+}
+
+// parse one header line
+//
+// line 0 is the request line, "<method> <resource> HTTP/<d>.<d>" (fields separated by one
+// or more blanks, <d> a single decimal digit); every other line is an ordinary
+// "key: value" field handled by the base class
+//
+// returns false on a malformed line; method, resource & version are only updated when the
+// whole request line is well-formed
+bool HttpRequestHeader::ParseLine(const std::string& line, int lineNumber)
+{
+
+    // if not 'request line', just let base class parse
+    if (lineNumber != 0) return HttpHeader::ParseLine(line, lineNumber);
+
+    // fail if empty line
+    if (line.empty()) return false;
+
+    // walk through request line, storing positions
+    //    GET /path/to/resource HTTP/1.1
+    //    ^  ^^                ^^
+    const std::size_t foundMethod =
+        line.find_first_not_of(Constants::SPACE_CHAR);  // skip any leading whitespace
+    if (foundMethod == std::string::npos) return false;
+    const std::size_t foundFirstSpace = line.find(Constants::SPACE_CHAR, foundMethod + 1);
+    if (foundFirstSpace == std::string::npos) return false;
+    const std::size_t foundResource =
+        line.find_first_not_of(Constants::SPACE_CHAR, foundFirstSpace + 1);
+    if (foundResource == std::string::npos) return false;
+    const std::size_t foundSecondSpace = line.find(Constants::SPACE_CHAR, foundResource + 1);
+    if (foundSecondSpace == std::string::npos) return false;
+    const std::size_t foundVersion =
+        line.find_first_not_of(Constants::SPACE_CHAR, foundSecondSpace + 1);
+    if (foundVersion == std::string::npos) return false;
+
+    // validate version text: exactly "HTTP/" + digit + '.' + digit
+    const std::string version = line.substr(foundVersion);
+    if (version.size() != 8) return false;
+    if (version.compare(0, Constants::HTTP_STRING.size(), Constants::HTTP_STRING) != 0)
+        return false;
+    const char majorChar = version[5];
+    const char minorChar = version[7];
+    if (version[6] != Constants::DOT_CHAR) return false;
+    if (majorChar < '0' || majorChar > '9' || minorChar < '0' || minorChar > '9') return false;
+
+    // everything checked out - store method, resource & version numbers
+    m_method = line.substr(foundMethod, foundFirstSpace - foundMethod);
+    m_resource = line.substr(foundResource, foundSecondSpace - foundResource);
+    SetVersion(static_cast<int>(majorChar - '0'), static_cast<int>(minorChar - '0'));
+
+    // if we get here, return success
+    return true;
+}
+
+// formats the complete request header: request line, all fields, terminating blank line
+std::string HttpRequestHeader::ToString() const
+{
+    std::stringstream out;
+
+    // request line: "<method> <resource> HTTP/<major>.<minor>"
+    out << m_method << Constants::SPACE_CHAR;
+    out << m_resource << Constants::SPACE_CHAR;
+    out << Constants::HTTP_STRING << GetMajorVersion() << Constants::DOT_CHAR << GetMinorVersion();
+    out << Constants::FIELD_NEWLINE;
+
+    // header fields, followed by the empty line that ends the header
+    out << HttpHeader::ToString();
+    out << Constants::FIELD_NEWLINE;
+
+    return out.str();
+}
+
+// -----------------------------------
+// HttpResponseHeader implementation
+// -----------------------------------
+
+// response header for "HTTP/<majorVersion>.<minorVersion> <statusCode> <reason>"
+HttpResponseHeader::HttpResponseHeader(const int statusCode, const std::string& reason,
+ int majorVersion, int minorVersion)
+
+ : HttpHeader()
+ , m_statusCode(statusCode)
+ , m_reason(reason)
+{
+ SetVersion(majorVersion, minorVersion);
+}
+
+// response header parsed from text; check IsValid() afterwards
+// (Parse() is called from this class's own constructor body, so its virtual ParseLine()
+// call resolves to HttpResponseHeader::ParseLine and the status line is handled)
+HttpResponseHeader::HttpResponseHeader(const std::string& s)
+ : HttpHeader()
+ , m_statusCode(0)
+{
+ Parse(s);
+}
+
+HttpResponseHeader::~HttpResponseHeader() {}
+
+// returns the reason phrase text as stored (e.g. the "OK" of a status line)
+std::string HttpResponseHeader::GetReason() const
+{
+ return m_reason;
+}
+
+// returns the numeric status code (0 if constructed from text that had no valid status line)
+int HttpResponseHeader::GetStatusCode() const
+{
+ return m_statusCode;
+}
+
+// parse one header line
+//
+// line 0 is the status line, "HTTP/<d>.<d> <ddd> [<reason>]" (fields separated by one or
+// more blanks, <d> a single decimal digit, <ddd> exactly three); the reason phrase may be
+// empty or missing. Every other line is an ordinary "key: value" field handled by the
+// base class.
+//
+// returns false on a malformed line; version, status code & reason are only updated when
+// the whole status line is well-formed
+bool HttpResponseHeader::ParseLine(const std::string& line, int lineNumber)
+{
+
+    // if not 'status line', just let base class parse
+    if (lineNumber != 0) return HttpHeader::ParseLine(line, lineNumber);
+
+    // fail if empty line
+    if (line.empty()) return false;
+
+    // walk through status line, storing positions
+    //    HTTP/1.1 200 OK
+    //    ^       ^^  ^^
+    const std::size_t foundVersion =
+        line.find_first_not_of(Constants::SPACE_CHAR);  // skip any leading whitespace
+    if (foundVersion == std::string::npos) return false;
+    const std::size_t foundFirstSpace = line.find(Constants::SPACE_CHAR, foundVersion + 1);
+    if (foundFirstSpace == std::string::npos) return false;
+    const std::size_t foundStatusCode =
+        line.find_first_not_of(Constants::SPACE_CHAR, foundFirstSpace + 1);
+    if (foundStatusCode == std::string::npos) return false;
+
+    // reason phrase is optional, so neither position below is required to exist
+    const std::size_t foundSecondSpace = line.find(Constants::SPACE_CHAR, foundStatusCode + 1);
+    const std::size_t foundReason =
+        (foundSecondSpace == std::string::npos)
+            ? std::string::npos
+            : line.find_first_not_of(Constants::SPACE_CHAR, foundSecondSpace + 1);
+
+    // validate version text: exactly "HTTP/" + digit + '.' + digit
+    const std::string version = line.substr(foundVersion, foundFirstSpace - foundVersion);
+    if (version.size() != 8) return false;
+    if (version.compare(0, Constants::HTTP_STRING.size(), Constants::HTTP_STRING) != 0)
+        return false;
+    const char majorChar = version[5];
+    const char minorChar = version[7];
+    if (version[6] != Constants::DOT_CHAR) return false;
+    if (majorChar < '0' || majorChar > '9' || minorChar < '0' || minorChar > '9') return false;
+
+    // validate status code: exactly three decimal digits
+    const std::string code =
+        (foundSecondSpace == std::string::npos)
+            ? line.substr(foundStatusCode)
+            : line.substr(foundStatusCode, foundSecondSpace - foundStatusCode);
+    if (code.size() != 3) return false;
+    for (std::size_t i = 0; i < 3; ++i) {
+        if (code[i] < '0' || code[i] > '9') return false;
+    }
+
+    // everything checked out - store version, status code & reason phrase
+    // (reason phrase is everything left on the line, possibly nothing)
+    SetVersion(static_cast<int>(majorChar - '0'), static_cast<int>(minorChar - '0'));
+    m_statusCode = std::atoi(code.c_str());
+    m_reason = (foundReason == std::string::npos) ? std::string() : line.substr(foundReason);
+
+    // if we get here, return success
+    return true;
+}
+
+// formats the complete response header: status line, all fields, terminating blank line
+std::string HttpResponseHeader::ToString() const
+{
+    std::stringstream out;
+
+    // status line: "HTTP/<major>.<minor> <status code> <reason>"
+    out << Constants::HTTP_STRING << GetMajorVersion() << Constants::DOT_CHAR << GetMinorVersion();
+    out << Constants::SPACE_CHAR << m_statusCode;
+    out << Constants::SPACE_CHAR << m_reason;
+    out << Constants::FIELD_NEWLINE;
+
+    // header fields, followed by the empty line that ends the header
+    out << HttpHeader::ToString();
+    out << Constants::FIELD_NEWLINE;
+
+    return out.str();
+}
diff --git a/src/api/internal/io/HttpHeader_p.h b/src/api/internal/io/HttpHeader_p.h
new file mode 100644
index 0000000..c7c4617
--- /dev/null
+++ b/src/api/internal/io/HttpHeader_p.h
@@ -0,0 +1,136 @@
+// ***************************************************************************
+// HttpHeader_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 13 January 2012 (DB)
+// ---------------------------------------------------------------------------
+// Provides a generic interface for parsing/generating HTTP headers, along
+// with specialized request & response header types
+// ***************************************************************************
+
+#ifndef HTTP_HEADER_P_H
+#define HTTP_HEADER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <map>
+#include <string>
+#include "api/api_global.h"
+
+namespace BamTools {
+namespace Internal {
+
+class HttpHeader
+{
+
+ // ctors & dtor (dtor is virtual: HttpRequestHeader & HttpResponseHeader derive from this class)
+public:
+ HttpHeader();
+ HttpHeader(const std::string& s); // NOTE(review): presumably parses @s as raw header text via Parse() - confirm in .cpp
+ virtual ~HttpHeader();
+
+ // HttpHeader interface
+public:
+ // header field=>value access (fields are stored in the m_fields string map)
+ bool ContainsKey(const std::string& key) const;
+ std::string GetValue(const std::string& key); // NOTE(review): the only non-const accessor here - confirm whether lookup can modify m_fields
+ void RemoveField(const std::string& key);
+ void SetField(const std::string& key, const std::string& value);
+
+ // get formatted header string (virtual: both derived header types declare their own ToString)
+ virtual std::string ToString() const;
+
+ // query HTTP version used
+ int GetMajorVersion() const;
+ int GetMinorVersion() const;
+
+ // see if header was parsed OK
+ bool IsValid() const;
+
+ // internal methods (accessible to the derived header types)
+protected:
+ void Parse(const std::string& s);
+ virtual bool ParseLine(const std::string& line, int lineNumber); // per-line parsing hook; both derived header types declare their own ParseLine
+ void SetValid(bool ok);
+ void SetVersion(int major, int minor);
+
+ // data members
+private:
+ std::map<std::string, std::string> m_fields; // header field name => value
+
+ bool m_isValid; // should usually be true, only false if error processing a header line
+ int m_majorVersion;
+ int m_minorVersion;
+};
+
+class HttpRequestHeader : public HttpHeader
+{
+
+ // ctor & dtor (version arguments default to 1 & 1)
+public:
+ HttpRequestHeader(const std::string& method, // "GET", "HEAD", ...
+ const std::string& resource, // filename
+ int majorVersion = 1, // version info
+ int minorVersion = 1);
+ ~HttpRequestHeader();
+
+ // HttpRequestHeader interface
+public:
+ std::string GetMethod() const; // returned by value
+ std::string GetResource() const; // returned by value
+
+ // HttpHeader implementation (same signature as the virtual HttpHeader::ToString)
+public:
+ std::string ToString() const;
+
+protected:
+ bool ParseLine(const std::string& line, int lineNumber); // same signature as the virtual HttpHeader::ParseLine
+
+ // data members
+private:
+ std::string m_method;
+ std::string m_resource;
+};
+
+class HttpResponseHeader : public HttpHeader
+{
+
+ // ctor & dtor
+public:
+ HttpResponseHeader(const int statusCode, // 200, 404, etc
+ const std::string& reason = std::string(), // 'reason phrase' for code
+ int majorVersion = 1, // version info
+ int minorVersion = 1);
+ HttpResponseHeader(const std::string& s); // NOTE(review): presumably parses @s as raw response text - confirm in .cpp; not 'explicit', so std::string converts implicitly
+ ~HttpResponseHeader();
+
+ // HttpResponseHeader interface
+public:
+ std::string GetReason() const;
+ int GetStatusCode() const;
+
+ // HttpHeader implementation (same signature as the virtual HttpHeader::ToString)
+public:
+ std::string ToString() const;
+
+protected:
+ bool ParseLine(const std::string& line, int lineNumber); // same signature as the virtual HttpHeader::ParseLine
+
+ // data members
+private:
+ int m_statusCode;
+ std::string m_reason;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // HTTP_HEADER_P_H
diff --git a/src/api/internal/io/ILocalIODevice_p.cpp b/src/api/internal/io/ILocalIODevice_p.cpp
new file mode 100644
index 0000000..9e81eeb
--- /dev/null
+++ b/src/api/internal/io/ILocalIODevice_p.cpp
@@ -0,0 +1,61 @@
+// ***************************************************************************
+// ILocalIODevice_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 27 July 2012 (DB)
+// ---------------------------------------------------------------------------
+// Provides shared behavior for files & pipes
+// ***************************************************************************
+
+#include "api/internal/io/ILocalIODevice_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+
+// Starts out with no FILE* attached (null m_stream).
+ILocalIODevice::ILocalIODevice()
+    : IBamIODevice(), m_stream(0)
+{
+}
+
+// Closes the stream (if still open) when the device goes away.
+ILocalIODevice::~ILocalIODevice()
+{
+    this->Close();
+}
+
+// Flushes & closes the underlying FILE*, then marks the device as not open.
+// Does nothing when the device is not open.
+void ILocalIODevice::Close()
+{
+    if (IsOpen()) {
+
+        // flush, close and forget the FILE*
+        fflush(m_stream);
+        fclose(m_stream);
+        m_stream = 0;
+
+        // device is no longer open in any mode
+        m_mode = IBamIODevice::NotOpen;
+    }
+}
+
+// Reads up to @numBytes chars from the stream into @data via fread().
+// Returns the element count reported by fread().
+int64_t ILocalIODevice::Read(char* data, const unsigned int numBytes)
+{
+    BT_ASSERT_X(m_stream, "ILocalIODevice::Read: trying to read from null stream");
+    BT_ASSERT_X((m_mode & IBamIODevice::ReadOnly),
+                "ILocalIODevice::Read: device not in read-able mode");
+
+    const std::size_t numRead = fread(data, sizeof(char), numBytes, m_stream);
+    return static_cast<int64_t>(numRead);
+}
+
+// Returns the current position of the underlying stream, as reported by ftell64().
+int64_t ILocalIODevice::Tell() const
+{
+    BT_ASSERT_X(m_stream, "ILocalIODevice::Tell: trying to get file position from null stream");
+    return ftell64(m_stream);
+}
+
+// Writes @numBytes chars from @data to the stream via fwrite().
+// Returns the element count reported by fwrite().
+int64_t ILocalIODevice::Write(const char* data, const unsigned int numBytes)
+{
+    BT_ASSERT_X(m_stream, "ILocalIODevice::Write: trying to write to null stream");
+    BT_ASSERT_X((m_mode & IBamIODevice::WriteOnly),
+                "ILocalIODevice::Write: device not in write-able mode");
+    return static_cast<int64_t>(fwrite(data, sizeof(char), numBytes, m_stream));
+}
diff --git a/src/api/internal/io/ILocalIODevice_p.h b/src/api/internal/io/ILocalIODevice_p.h
new file mode 100644
index 0000000..64fc634
--- /dev/null
+++ b/src/api/internal/io/ILocalIODevice_p.h
@@ -0,0 +1,51 @@
+// ***************************************************************************
+// ILocalIODevice_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides shared behavior for files & pipes
+// ***************************************************************************
+
+#ifndef ILOCALIODEVICE_P_H
+#define ILOCALIODEVICE_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/IBamIODevice.h"
+
+namespace BamTools {
+namespace Internal {
+
+class ILocalIODevice : public IBamIODevice
+{
+
+ // ctor & dtor (ctor leaves m_stream null; dtor calls Close())
+public:
+ ILocalIODevice();
+ virtual ~ILocalIODevice();
+
+ // IBamIODevice implementation (all operate on the shared FILE* m_stream)
+public:
+ virtual void Close(); // flushes & closes m_stream; no-op when not open
+ virtual int64_t Read(char* data, const unsigned int numBytes); // fread() wrapper, returns fread's count
+ virtual int64_t Tell() const; // current stream position
+ virtual int64_t Write(const char* data, const unsigned int numBytes); // fwrite() wrapper, returns fwrite's count
+
+ // data members (NOTE(review): FILE is not declared by any include visible in this header - presumably comes in via api/IBamIODevice.h)
+protected:
+ FILE* m_stream;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // ILOCALIODEVICE_P_H
diff --git a/src/api/internal/io/NetUnix_p.h b/src/api/internal/io/NetUnix_p.h
new file mode 100644
index 0000000..bb13cef
--- /dev/null
+++ b/src/api/internal/io/NetUnix_p.h
@@ -0,0 +1,43 @@
+// ***************************************************************************
+// NetUnix_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides common networking-related includes, etc. for all UNIX-like systems
+// ***************************************************************************
+
+#ifndef NETUNIX_P_H
+#define NETUNIX_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#ifndef _WIN32 // <-- source files only include the proper Net*_p.h, but this is a double-check
+
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#ifdef __FreeBSD__
+#include <netinet/in.h>
+#endif
+
+#ifndef BT_SOCKLEN_T
+#define BT_SOCKLEN_T socklen_t
+#endif
+
+#endif // _WIN32
+#endif // NETUNIX_P_H
diff --git a/src/api/internal/io/NetWin_p.h b/src/api/internal/io/NetWin_p.h
new file mode 100644
index 0000000..909b254
--- /dev/null
+++ b/src/api/internal/io/NetWin_p.h
@@ -0,0 +1,62 @@
+// ***************************************************************************
+// NetWin_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 8 December 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides common networking-related includes, etc. for Windows systems
+//
+// Note: requires Windows XP or later
+// ***************************************************************************
+
+#ifndef NETWIN_P_H
+#define NETWIN_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#ifdef _WIN32 // <-- source files only include the proper Net*_p.h, but this is a double-check
+
+#include <Ws2tcpip.h>
+#include <winsock2.h> // <-- should bring 'windows.h' along with it
+
+#ifndef BT_SOCKLEN_T
+#define BT_SOCKLEN_T int
+#endif
+
+#ifdef _MSC_VER
+#pragma comment(lib, "ws2_32.lib")
+#endif
+
+namespace BamTools {
+namespace Internal {
+
+// use RAII to ensure WSA is initialized
+class WindowsSockInit
+{
+public:
+    // Requests Winsock 2.2 and remembers whether the request succeeded.
+    WindowsSockInit()
+        : m_isInitialized(false)
+    {
+        Startup();
+    }
+
+    // Each copy performs (and later balances) its own startup call.
+    WindowsSockInit(const WindowsSockInit&)
+        : m_isInitialized(false)
+    {
+        Startup();
+    }
+
+    // Assignment must not transfer the flag: every object keeps track of its own startup call.
+    WindowsSockInit& operator=(const WindowsSockInit&)
+    {
+        return *this;
+    }
+
+    // Releases Winsock only if this object's startup call succeeded, so that a failed
+    // startup is never followed by an unmatched WSACleanup().
+    ~WindowsSockInit()
+    {
+        if (m_isInitialized) WSACleanup();
+    }
+
+    // Lets callers find out whether Winsock is usable.
+    bool IsInitialized() const
+    {
+        return m_isInitialized;
+    }
+
+private:
+    void Startup()
+    {
+        WSAData wsadata;
+        // WSAStartup returns zero on success
+        m_isInitialized = (WSAStartup(MAKEWORD(2, 2), &wsadata) == 0);
+    }
+
+private:
+    bool m_isInitialized;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // _WIN32
+
+#endif // NETWIN_P_H
diff --git a/src/api/internal/io/RollingBuffer_p.cpp b/src/api/internal/io/RollingBuffer_p.cpp
new file mode 100644
index 0000000..3cbfd1a
--- /dev/null
+++ b/src/api/internal/io/RollingBuffer_p.cpp
@@ -0,0 +1,317 @@
+// ***************************************************************************
+// RollingBuffer_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 8 December 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides a dynamic I/O FIFO byte queue, which removes bytes as they are
+// read from the front of the buffer and grows to accept bytes being written
+// to buffer end.
+//
+// implementation note: basically a 'smart' wrapper around 1..* ByteArrays
+// ***************************************************************************
+
+#include "api/internal/io/RollingBuffer_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <algorithm>
+#include <climits>
+#include <cstddef>
+#include <cstring>
+#include <string>
+
+// ------------------------------
+// RollingBuffer implementation
+// ------------------------------
+
+// Creates an empty buffer; @growth is remembered as the size used for new byte arrays.
+RollingBuffer::RollingBuffer(std::size_t growth)
+    : m_bufferGrowth(growth)
+{
+    // Clear() indexes m_data[0], so there must always be one (possibly empty) byte array
+    m_data.push_back(ByteArray());
+
+    // Clear() also gives the head/tail/size markers their initial values
+    Clear();
+}
+
+RollingBuffer::~RollingBuffer()
+{
+    // nothing to do explicitly - the deque of byte arrays is destroyed with the object
+}
+
+// Returns how many bytes can be read contiguously from the first byte array.
+std::size_t RollingBuffer::BlockSize() const
+{
+    // several byte arrays: the block runs from 'head' to the end of the first array
+    if (m_tailBufferIndex != 0) return m_data.front().Size() - m_head;
+
+    // single byte array: the block runs from 'head' to 'tail'
+    return m_tail - m_head;
+}
+
+// True when the buffered data contains at least one '\n'.
+bool RollingBuffer::CanReadLine() const
+{
+    const std::size_t newlinePos = IndexOf('\n');
+    return (newlinePos != std::string::npos);
+}
+
+// Discards @n bytes from the END of the buffer (most recently reserved bytes first).
+// Chopping more than is stored simply empties the buffer.
+void RollingBuffer::Chop(std::size_t n)
+{
+
+    // update buffer size (clamped at zero)
+    if (n > m_totalBufferSize)
+        m_totalBufferSize = 0;
+    else
+        m_totalBufferSize -= n;
+
+    // walk backwards from the tail array until @n bytes have been dropped
+    for (;;) {
+
+        // only one array left: pull 'tail' back toward 'head'
+        if (m_tailBufferIndex == 0) {
+
+            // the markers are unsigned, so test BEFORE subtracting: a plain 'm_tail -= n'
+            // wraps around when @n exceeds the tail position and leaves bogus markers
+            if (n >= m_tail || (m_tail - n) <= m_head) {
+                // all data chopped
+                m_head = 0;
+                m_tail = 0;
+            } else
+                m_tail -= n;
+            break;
+        }
+
+        // if there's room in last byte array to 'chop', just decrement tail
+        if (n <= m_tail) {
+            m_tail -= n;
+            break;
+        }
+
+        // otherwise we're going to overlap our internal byte arrays
+        // reduce our chop amount by the amount of data in the last byte array
+        n -= m_tail;
+
+        // remove last byte array & set tail to its end
+        m_data.pop_back();
+        --m_tailBufferIndex;
+        m_tail = m_data.at(m_tailBufferIndex).Size();
+    }
+
+    // if buffer is now empty, reset state & clear up memory
+    // (the loop exits with 'break' so that this cleanup is actually reached, as in Free())
+    if (IsEmpty()) Clear();
+}
+
+// Resets the buffer to its empty state: one zero-length byte array, all markers at 0.
+void RollingBuffer::Clear()
+{
+    // drop every byte array except the first
+    while (m_data.size() > 1)
+        m_data.pop_back();
+
+    // empty the remaining array and squeeze it
+    ByteArray& first = m_data[0];
+    first.Resize(0);
+    first.Squeeze();
+
+    // reset index & size markers
+    m_totalBufferSize = 0;
+    m_tailBufferIndex = 0;
+    m_tail = 0;
+    m_head = 0;
+}
+
+void RollingBuffer::Free(std::size_t n) // discards @n bytes from the FRONT (read side) of the buffer
+{
+
+ // update buffer size (clamped at zero when @n exceeds the stored byte count)
+ if (n > m_totalBufferSize)
+ m_totalBufferSize = 0;
+ else
+ m_totalBufferSize -= n;
+
+ // loop until target case hit (each pass either finishes inside the first array or drops it)
+ for (;;) {
+
+ const std::size_t blockSize = BlockSize(); // bytes still readable from the first array
+
+ // if there's room in current array (strictly fewer bytes requested than it still holds)
+ if (n < blockSize) {
+
+ // shift 'head' over @n bytes
+ m_head += n;
+
+ // check for emptied, single byte array
+ if (m_head == m_tail && m_tailBufferIndex == 0) {
+ m_head = 0;
+ m_tail = 0;
+ }
+
+ break;
+ }
+
+ // otherwise we need to check next byte array
+ // first update amount to remove (the whole remainder of the first array is consumed)
+ n -= blockSize;
+
+ // special case - there was only 1 array; NOTE(review): this leaves head = tail = 0, so IsEmpty() holds below and Clear() resizes the array to 0 again - the Resize here looks redundant
+ if (m_data.size() == 1) {
+ if (m_data.at(0).Size() != m_bufferGrowth) m_data[0].Resize(m_bufferGrowth);
+ m_head = 0;
+ m_tail = 0;
+ m_tailBufferIndex = 0;
+ break;
+ }
+
+ // otherwise, remove first array and move to next iteration (tail index shifts down with it)
+ m_data.pop_front();
+ --m_tailBufferIndex;
+ m_head = 0;
+ }
+
+ // if buffer is now empty, reset state & clear up memory
+ if (IsEmpty()) Clear();
+}
+
+// Returns the offset (counted from the current read position) of the first
+// occurrence of @c, or std::string::npos if @c is not in the buffer.
+std::size_t RollingBuffer::IndexOf(char c) const
+{
+    // nothing to search in an empty buffer
+    if (IsEmpty()) return std::string::npos;
+
+    // offset of the byte under inspection, relative to the read position
+    std::size_t offset = 0;
+
+    const std::size_t arrayCount = m_data.size();
+    for (std::size_t arrayIndex = 0; arrayIndex < arrayCount; ++arrayIndex) {
+        const ByteArray& bytes = m_data.at(arrayIndex);
+
+        // first array is scanned from 'head'; the tail array only up to 'tail'
+        const std::size_t first = (arrayIndex == 0) ? m_head : 0;
+        const std::size_t last = (arrayIndex == m_tailBufferIndex) ? m_tail : bytes.Size();
+
+        const char* raw = bytes.ConstData();
+        for (std::size_t pos = first; pos < last; ++pos, ++offset) {
+            if (raw[pos] == c) return offset;
+        }
+    }
+
+    // no match found
+    return std::string::npos;
+}
+
+// Empty means: a single byte array whose tail marker sits at 0.
+bool RollingBuffer::IsEmpty() const
+{
+    if (m_tailBufferIndex != 0) return false;
+    return (m_tail == 0);
+}
+
+// Removes up to @max bytes from the front of the buffer, copying them to @dest
+// unless @dest is null (in which case the bytes are just discarded).
+// Returns the number of bytes removed.
+std::size_t RollingBuffer::Read(char* dest, std::size_t max)
+{
+    // never hand out more than what is stored
+    std::size_t remaining = std::min(Size(), max);
+    std::size_t total = 0;
+
+    // consume one contiguous block per pass
+    while (remaining > 0) {
+        const std::size_t chunk = std::min(remaining, BlockSize());
+        if (dest) memcpy(dest + total, ReadPointer(), chunk);
+        Free(chunk);
+        total += chunk;
+        remaining -= chunk;
+    }
+
+    return total;
+}
+
+// Reads one line (up to and including its '\n') from the front of the buffer into @dest,
+// copying at most @max - 1 bytes, and null-terminates @dest.
+// Returns the number of bytes removed from the buffer (terminator not counted);
+// returns 0 when @max is 0 or no newline is buffered.
+std::size_t RollingBuffer::ReadLine(char* dest, std::size_t max)
+{
+
+    // if we can't read line or if max is 0
+    if (max == 0) return 0;
+
+    // locate the newline ONCE: IndexOf() is relative to the current read position, which moves
+    // with every Free() below, so recomputing it per pass mixes it up with the running byte
+    // count and copies past the newline when a line spans several byte arrays
+    const std::size_t index = IndexOf('\n');
+    if (index == std::string::npos) return 0;
+
+    // line length includes the newline; leave room for the null terminator
+    const std::size_t bytesWanted = std::min(index + 1, max - 1);
+
+    // copy block by block until the line (or the size limit) is reached
+    std::size_t bytesReadSoFar = 0;
+    while (bytesReadSoFar < bytesWanted) {
+        const std::size_t bytesToRead = std::min(bytesWanted - bytesReadSoFar, BlockSize());
+        if (bytesToRead == 0) break;  // no progress possible, don't spin
+        memcpy(dest + bytesReadSoFar, ReadPointer(), bytesToRead);
+        bytesReadSoFar += bytesToRead;
+        Free(bytesToRead);
+    }
+
+    // null terminate 'dest' & return numBytesRead
+    dest[bytesReadSoFar] = '\0';
+    return bytesReadSoFar;
+}
+
+// Returns a pointer to the next unread byte (first byte array + 'head' offset),
+// or null if there are no byte arrays at all.
+const char* RollingBuffer::ReadPointer() const
+{
+    return m_data.empty() ? 0 : (m_data.front().ConstData() + m_head);
+}
+
+// Makes room for @n more bytes at the end of the buffer, counts them as stored, and
+// returns a pointer to the start of that new region (i.e. the old 'tail' position).
+// The caller is expected to fill the region, or Chop() whatever it did not use.
+char* RollingBuffer::Reserve(std::size_t n)
+{
+
+    // if empty buffer
+    if (m_totalBufferSize == 0) {
+        m_data[0].Resize(std::max(m_bufferGrowth, n));
+        m_totalBufferSize += n;
+        m_tail = n;
+        return m_data[m_tailBufferIndex].Data();
+    }
+
+    // increment buffer's byte count
+    m_totalBufferSize += n;
+
+    // if buffer already contains enough space to fit @n more bytes
+    if ((m_tail + n) <= m_data.at(m_tailBufferIndex).Size()) {
+
+        // fetch write pointer at current 'tail', increment tail by @n & return
+        // (must be offset by 'tail' - the array start still holds unread bytes)
+        char* ptr = m_data[m_tailBufferIndex].Data() + m_tail;
+        m_tail += n;
+        return ptr;
+    }
+
+    // if last byte array isn't half full
+    if (m_tail < m_data.at(m_tailBufferIndex).Size() / 2) {
+
+        // we'll allow simple resize
+        m_data[m_tailBufferIndex].Resize(m_tail + n);
+
+        // fetch write pointer at current 'tail' (after the resize, which may have moved
+        // the storage), increment tail by @n & return
+        char* ptr = m_data[m_tailBufferIndex].Data() + m_tail;
+        m_tail += n;
+        return ptr;
+    }
+
+    // otherwise, shrink last byte array to current used size
+    m_data[m_tailBufferIndex].Resize(m_tail);
+
+    // then append new byte array
+    m_data.push_back(ByteArray());
+    ++m_tailBufferIndex;
+    m_data[m_tailBufferIndex].Resize(std::max(m_bufferGrowth, n));
+    m_tail = n;
+
+    // return write-able pointer on new array (the new region starts at its beginning)
+    return m_data[m_tailBufferIndex].Data();
+}
+
+// Returns the running count of stored bytes.
+std::size_t RollingBuffer::Size() const
+{
+    return this->m_totalBufferSize;
+}
+
+// Reserves @n bytes in the buffer and copies @n bytes from @src into the reserved region.
+void RollingBuffer::Write(const char* src, std::size_t n)
+{
+    memcpy(Reserve(n), src, n);
+}
diff --git a/src/api/internal/io/RollingBuffer_p.h b/src/api/internal/io/RollingBuffer_p.h
new file mode 100644
index 0000000..2e22426
--- /dev/null
+++ b/src/api/internal/io/RollingBuffer_p.h
@@ -0,0 +1,88 @@
+// ***************************************************************************
+// RollingBuffer_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 7 December 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides a dynamic I/O FIFO byte queue, which removes bytes as they are
+// read from the front of the buffer and grows to accept bytes being written
+// to buffer end.
+//
+// implementation note: basically a 'smart' wrapper around 1..* ByteArrays
+// ***************************************************************************
+
+#ifndef ROLLINGBUFFER_P_H
+#define ROLLINGBUFFER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <cstddef>
+#include <deque>
+#include <string>
+#include "api/api_global.h"
+#include "api/internal/io/ByteArray_p.h"
+
+namespace BamTools {
+namespace Internal {
+
+class RollingBuffer
+{
+
+ // ctors & dtor
+public:
+ RollingBuffer(std::size_t growth); // @growth is stored as m_bufferGrowth
+ ~RollingBuffer();
+
+ // RollingBuffer interface
+public:
+ // returns number of bytes readable from the first byte array (the current contiguous block)
+ std::size_t BlockSize() const;
+ // checks buffer for new line
+ bool CanReadLine() const;
+ // frees @n bytes from end of buffer
+ void Chop(std::size_t n);
+ // clears entire buffer structure
+ void Clear();
+ // frees @n bytes from front of buffer
+ void Free(std::size_t n);
+ // checks buffer for @c; returns its offset from the read position, or std::string::npos if absent
+ std::size_t IndexOf(char c) const;
+ // returns true if buffer holds no data (single byte array with tail marker at 0)
+ bool IsEmpty() const;
+ // reads up to @max bytes into @dest (a null @dest just discards the bytes)
+ // returns exactly how many bytes were read from buffer
+ std::size_t Read(char* dest, std::size_t max);
+ // reads until newline (or up to @max - 1 bytes) and null-terminates @dest
+ // returns exactly how many bytes were read from buffer (0 if no newline is buffered or @max is 0)
+ std::size_t ReadLine(char* dest, std::size_t max);
+ // returns a C-fxn compatible char* to byte data (at the current read position)
+ const char* ReadPointer() const;
+ // ensures that buffer contains space for @n incoming bytes, returns write-able char*
+ char* Reserve(std::size_t n);
+ // returns current number of bytes stored in buffer
+ std::size_t Size() const;
+ // reserves space for @n bytes, then appends contents of @src to buffer
+ void Write(const char* src, std::size_t n);
+
+ // data members
+private:
+ std::size_t m_head; // index into current data (next char)
+ std::size_t m_tail; // index into last data position
+ std::size_t m_tailBufferIndex; // m_data::size() - 1
+ std::size_t m_totalBufferSize; // total buffer size
+ std::size_t m_bufferGrowth; // new buffers are typically initialized with this size
+ std::deque<ByteArray> m_data; // basic 'buffer of buffers'
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // ROLLINGBUFFER_P_H
diff --git a/src/api/internal/io/TcpSocketEngine_p.cpp b/src/api/internal/io/TcpSocketEngine_p.cpp
new file mode 100644
index 0000000..de373c4
--- /dev/null
+++ b/src/api/internal/io/TcpSocketEngine_p.cpp
@@ -0,0 +1,212 @@
+// ***************************************************************************
+// TcpSocketEngine_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 8 December 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides low-level implementation of TCP I/O
+// ***************************************************************************
+
+// N.B. - this file contains the top-level, platform-independent logic. "Native" methods
+// are called as needed from the TcpSocketEngine_<X>.cpp files. Selection of the proper
+// native method file should have been handled at build-time by CMake.
+
+#include "api/internal/io/TcpSocketEngine_p.h"
+#include "api/internal/io/HostInfo_p.h"
+
+#include <cstddef>
+
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+// Creates an engine with no socket: invalid descriptor, port 0, unconnected state.
+TcpSocketEngine::TcpSocketEngine()
+    : m_socketDescriptor(-1)  // -1 is what IsValid() treats as "no descriptor"
+    , m_remotePort(0)
+    , m_socketError(TcpSocket::UnknownSocketError)
+    , m_socketState(TcpSocket::UnconnectedState)
+{
+    // m_remoteAddress & m_errorString are default-constructed;
+    // local port tracking (m_localPort) is currently disabled
+}
+
+TcpSocketEngine::TcpSocketEngine(const TcpSocketEngine& other) // member-wise copy of @other's descriptor, remote endpoint, error & state
+ : m_socketDescriptor(other.m_socketDescriptor) // NOTE(review): the raw descriptor is shared, not duplicated - both objects' destructors call Close(), which calls nativeClose() on it
+ // , m_localAddress(other.m_localAddress)
+ , m_remoteAddress(other.m_remoteAddress)
+ // , m_localPort(other.m_localPort)
+ , m_remotePort(other.m_remotePort)
+ , m_socketError(other.m_socketError)
+ , m_socketState(other.m_socketState)
+ , m_errorString(other.m_errorString)
+{}
+
+// Closes the socket (if a descriptor is held) on destruction.
+TcpSocketEngine::~TcpSocketEngine()
+{
+    this->Close();
+}
+
+// Closes the native socket (when one is held) and resets the connection bookkeeping.
+void TcpSocketEngine::Close()
+{
+    // release the native socket, but only if we actually hold a descriptor
+    const bool hasDescriptor = (m_socketDescriptor != -1);
+    if (hasDescriptor) {
+        nativeClose();
+        m_socketDescriptor = -1;
+    }
+
+    // fall back to the unconnected state & forget the remote endpoint
+    // (local address/port bookkeeping is currently disabled)
+    m_socketState = TcpSocket::UnconnectedState;
+    m_remoteAddress.Clear();
+    m_remotePort = 0;
+}
+
+// Connects the socket to @address on @port.
+// Fails if there is no valid descriptor, the socket is already in the connected state,
+// or the native connect fails. On success, the remote address & port are remembered.
+bool TcpSocketEngine::Connect(const HostAddress& address, const uint16_t port)
+{
+    // need a usable descriptor that is not already connected
+    // TODO: set error string
+    const bool alreadyConnected = (m_socketState == TcpSocket::ConnectedState);
+    if (!IsValid() || alreadyConnected) return false;
+
+    // hand off to the platform-specific connect
+    // TODO: set error string (on failure)
+    const bool connected = nativeConnect(address, port);
+    if (connected) {
+        // TODO: (later) fetch proxied remote & local host/port here
+        m_remoteAddress = address;
+        m_remotePort = port;
+    }
+    return connected;
+}
+
+std::string TcpSocketEngine::GetErrorString() const
+{
+ return m_errorString;
+}
+
+//HostAddress TcpSocketEngine::GetLocalAddress() const {
+// return m_localAddress;
+//}
+
+//uint16_t TcpSocketEngine::GetLocalPort() const {
+// return m_localPort;
+//}
+
+// Returns a copy of the remote address stored by Connect() (cleared by Close()).
+HostAddress TcpSocketEngine::GetRemoteAddress() const
+{
+    return this->m_remoteAddress;
+}
+
+// Returns the remote port stored by Connect() (0 after Close()).
+uint16_t TcpSocketEngine::GetRemotePort() const
+{
+    return this->m_remotePort;
+}
+
+// Returns the raw socket descriptor (-1 when no socket is held).
+int TcpSocketEngine::GetSocketDescriptor() const
+{
+    return this->m_socketDescriptor;
+}
+
+TcpSocket::SocketError TcpSocketEngine::GetSocketError()
+{
+ return m_socketError;
+}
+
+// Returns the current socket state.
+TcpSocket::SocketState TcpSocketEngine::GetSocketState()
+{
+    return this->m_socketState;
+}
+
+// (Re)creates the native socket for @protocol, closing any socket currently held.
+// Returns the result of the native socket creation.
+bool TcpSocketEngine::Initialize(HostAddress::NetworkProtocol protocol)
+{
+    // start from a clean slate: drop the socket we currently hold, if any
+    if (IsValid()) {
+        Close();
+    }
+
+    // attempt to create new socket
+    const bool created = nativeCreateSocket(protocol);
+    return created;
+}
+
+// A descriptor of -1 means "no socket"; anything else counts as valid.
+bool TcpSocketEngine::IsValid() const
+{
+    return !(m_socketDescriptor == -1);
+}
+
+// Returns the native count of bytes ready on the socket, or -1 if the socket
+// descriptor is invalid.
+int64_t TcpSocketEngine::NumBytesAvailable() const
+{
+    // valid descriptor: ask the native layer how much is ready
+    if (IsValid()) return nativeNumBytesAvailable();
+
+    // invalid descriptor: report failure
+    // TODO: set error string
+    return -1;
+}
+
+// Reads up to @max bytes from the socket into @dest via the native layer.
+// Returns the native result, or -1 when the descriptor is invalid or the socket
+// is not in the connected state.
+int64_t TcpSocketEngine::Read(char* dest, std::size_t max)
+{
+    const bool canRead = IsValid() && (m_socketState == TcpSocket::ConnectedState);
+    if (!canRead) return -1;
+
+    return nativeRead(dest, max);
+}
+
+// Waits (using @msec as the native select timeout) for the socket to become readable.
+// Sets *@timedOut to true - and records a timeout error - when the select reports
+// nothing ready. Returns true only if the select reports a positive count.
+bool TcpSocketEngine::WaitForRead(int msec, bool* timedOut)
+{
+    // assume no timeout until the select says otherwise
+    *timedOut = false;
+
+    // 'true' asks the native select to wait for read-readiness
+    const int numReady = nativeSelect(msec, true);
+
+    // a zero count means the wait expired
+    if (numReady == 0) {
+        m_errorString = "socket timed out";
+        m_socketError = TcpSocket::SocketTimeoutError;
+        *timedOut = true;
+    }
+
+    // positive count: socket is available for reading
+    return (numReady > 0);
+}
+
+// Waits (using @msec as the native select timeout) for the socket to become writable.
+// Sets *@timedOut to true - and records a timeout error - when the select reports
+// nothing ready. Returns true only if the select reports a positive count.
+bool TcpSocketEngine::WaitForWrite(int msec, bool* timedOut)
+{
+    // assume no timeout until the select says otherwise
+    *timedOut = false;
+
+    // 'false' asks the native select to wait for write-readiness
+    const int numReady = nativeSelect(msec, false);
+
+    // a zero count means the wait expired
+    if (numReady == 0) {
+        m_errorString = "socket timed out";
+        m_socketError = TcpSocket::SocketTimeoutError;
+        *timedOut = true;
+    }
+
+    // positive count: socket is available for writing
+    return (numReady > 0);
+}
+
+// Writes @length bytes from @data to the socket via the native layer.
+// Returns the native result, or -1 when the descriptor is invalid or the socket
+// is not in the connected state.
+int64_t TcpSocketEngine::Write(const char* data, std::size_t length)
+{
+    const bool canWrite = IsValid() && (m_socketState == TcpSocket::ConnectedState);
+
+    // TODO: set error string (when we can't write)
+    if (!canWrite) return -1;
+
+    return nativeWrite(data, length);
+}
diff --git a/src/api/internal/io/TcpSocketEngine_p.h b/src/api/internal/io/TcpSocketEngine_p.h
new file mode 100644
index 0000000..b3a6495
--- /dev/null
+++ b/src/api/internal/io/TcpSocketEngine_p.h
@@ -0,0 +1,105 @@
+// ***************************************************************************
+// TcpSocketEngine_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 8 December 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides low-level implementation of TCP I/O
+// ***************************************************************************
+
+#ifndef TCPSOCKETENGINE_P_H
+#define TCPSOCKETENGINE_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/internal/io/HostAddress_p.h"
+#include "api/internal/io/TcpSocket_p.h"
+
+#ifdef _WIN32
+#include "api/internal/io/NetWin_p.h"
+#endif
+
+#include <cstddef>
+
+namespace BamTools {
+namespace Internal {
+
+class TcpSocketEngine
+{
+
+ // ctors & dtor (dtor calls Close())
+public:
+ TcpSocketEngine();
+ TcpSocketEngine(const TcpSocketEngine& other); // NOTE(review): copies the raw descriptor, so two engines end up closing the same socket - confirm intended use
+ ~TcpSocketEngine();
+
+ // TcpSocketEngine interface
+public:
+ // connection-related methods
+ void Close(); // closes the native socket (if any) & resets state / remote endpoint
+ bool Connect(const HostAddress& address, const uint16_t port); // false if descriptor invalid, already connected, or native connect fails
+ bool Initialize(HostAddress::NetworkProtocol protocol); // closes any current socket, then creates a new native one
+ bool IsValid() const; // true when the descriptor is not -1
+
+ // IO-related methods (each returns -1 when the engine cannot perform the operation)
+ int64_t NumBytesAvailable() const;
+ int64_t Read(char* dest, std::size_t max);
+ int64_t Write(const char* data, std::size_t length);
+
+ bool WaitForRead(int msec, bool* timedOut); // @timedOut is written unconditionally - must not be null
+ bool WaitForWrite(int msec, bool* timedOut); // @timedOut is written unconditionally - must not be null
+
+ // query connection state
+ // HostAddress GetLocalAddress() const;
+ // uint16_t GetLocalPort() const;
+ HostAddress GetRemoteAddress() const;
+ uint16_t GetRemotePort() const;
+
+ int GetSocketDescriptor() const;
+ TcpSocket::SocketError GetSocketError();
+ TcpSocket::SocketState GetSocketState();
+
+ std::string GetErrorString() const;
+
+ // platform-dependent internal methods
+ // provided in the corresponding TcpSocketEngine_<OS>_p.cpp
+private:
+ void nativeClose();
+ bool nativeConnect(const HostAddress& address, const uint16_t port);
+ bool nativeCreateSocket(HostAddress::NetworkProtocol protocol);
+ void nativeDisconnect();
+ int64_t nativeNumBytesAvailable() const;
+ int64_t nativeRead(char* dest, std::size_t max);
+ int nativeSelect(int msecs, bool isRead) const; // callers treat 0 as "timed out" and > 0 as "ready"
+ int64_t nativeWrite(const char* data, std::size_t length);
+
+ // data members
+private:
+ int m_socketDescriptor; // -1 when no socket is held
+
+ // HostAddress m_localAddress;
+ HostAddress m_remoteAddress;
+ // uint16_t m_localPort;
+ uint16_t m_remotePort;
+
+ TcpSocket::SocketError m_socketError;
+ TcpSocket::SocketState m_socketState;
+ std::string m_errorString; // NOTE(review): <string> is not included directly by this header - presumably provided by the headers above
+
+#ifdef _WIN32
+ WindowsSockInit m_win;
+#endif
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // TCPSOCKETENGINE_P_H
diff --git a/src/api/internal/io/TcpSocketEngine_unix_p.cpp b/src/api/internal/io/TcpSocketEngine_unix_p.cpp
new file mode 100644
index 0000000..35cd307
--- /dev/null
+++ b/src/api/internal/io/TcpSocketEngine_unix_p.cpp
@@ -0,0 +1,220 @@
+// ***************************************************************************
+// TcpSocketEngine_unix_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 15 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides low-level implementation of TCP I/O for all UNIX-like systems
+// ***************************************************************************
+
+#include "api/internal/io/NetUnix_p.h"
+#include "api/internal/io/TcpSocketEngine_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#ifdef SUN_OS
+#include <sys/filio.h>
+#endif
+
+#include <cerrno>
+#include <cstddef>
+#include <ctime>
+#include <iostream>
+
+// ------------------------
+// static utility methods
+// ------------------------
+
+// (currently an empty placeholder - no static helpers are defined in this file)
+namespace BamTools {
+namespace Internal {} // namespace Internal
+} // namespace BamTools
+
+// --------------------------------
+// TcpSocketEngine implementation
+// --------------------------------
+
+// Closes the engine's socket descriptor via close().
+// The result of close() is ignored and m_socketDescriptor is left untouched here.
+void TcpSocketEngine::nativeClose()
+{
+    ::close(m_socketDescriptor);
+}
+
+// Attempts to connect the engine's socket to 'address' on 'port'.
+//
+// returns: true if connect() succeeds or reports that the socket is already
+//          connected (state becomes ConnectedState); false otherwise (state
+//          becomes UnconnectedState and, for the errno values handled below,
+//          the error type & message are set)
+bool TcpSocketEngine::nativeConnect(const HostAddress& address, const uint16_t port)
+{
+    // storage for both address families; only the one matching the protocol is filled in
+    sockaddr_in ipv4Addr;
+    sockaddr_in6 ipv6Addr;
+    sockaddr* addrPtr = 0;
+    BT_SOCKLEN_T addrLength = 0;
+
+    if (address.GetProtocol() == HostAddress::IPv6Protocol) {
+
+        // IPv6: copy the raw address bytes into the sockaddr
+        memset(&ipv6Addr, 0, sizeof(ipv6Addr));
+        ipv6Addr.sin6_family = AF_INET6;
+        ipv6Addr.sin6_port = htons(port);
+
+        IPv6Address rawIPv6 = address.GetIPv6Address();
+        memcpy(&ipv6Addr.sin6_addr.s6_addr, &rawIPv6, sizeof(rawIPv6));
+
+        addrPtr = (sockaddr*)&ipv6Addr;
+        addrLength = sizeof(ipv6Addr);
+
+    } else if (address.GetProtocol() == HostAddress::IPv4Protocol) {
+
+        // IPv4: address & port are converted to network byte order
+        memset(&ipv4Addr, 0, sizeof(ipv4Addr));
+        ipv4Addr.sin_family = AF_INET;
+        ipv4Addr.sin_port = htons(port);
+        ipv4Addr.sin_addr.s_addr = htonl(address.GetIPv4Address());
+
+        addrPtr = (sockaddr*)&ipv4Addr;
+        addrLength = sizeof(ipv4Addr);
+
+    } else {
+        // unknown protocol (should be unreachable)
+        BT_ASSERT_X(false, "TcpSocketEngine::nativeConnect() : unknown network protocol");
+    }
+
+    // attempt connection; anything other than -1 means success
+    if (connect(m_socketDescriptor, addrPtr, addrLength) != -1) {
+        m_socketState = TcpSocket::ConnectedState;
+        return true;
+    }
+
+    // connection failed: assume 'unconnected', then classify by errno
+    // (errno is the only source of failure detail for connect())
+    m_socketState = TcpSocket::UnconnectedState;
+
+    switch (errno) {
+
+        case EISCONN:
+            // socket was already connected - treat as success
+            m_socketState = TcpSocket::ConnectedState;
+            return true;
+
+        case ECONNREFUSED:
+        case EINVAL:
+            m_socketError = TcpSocket::ConnectionRefusedError;
+            m_errorString = "connection refused";
+            break;
+
+        case ETIMEDOUT:
+            m_socketError = TcpSocket::NetworkError;
+            m_errorString = "connection timed out";
+            break;
+
+        case EHOSTUNREACH:
+            m_socketError = TcpSocket::NetworkError;
+            m_errorString = "host unreachable";
+            break;
+
+        case ENETUNREACH:
+            m_socketError = TcpSocket::NetworkError;
+            m_errorString = "network unreachable";
+            break;
+
+        case EADDRINUSE:
+            m_socketError = TcpSocket::SocketResourceError;
+            m_errorString = "address already in use";
+            break;
+
+        case EACCES:
+        case EPERM:
+            m_socketError = TcpSocket::SocketAccessError;
+            m_errorString = "permission denied";
+            break;
+
+        default:
+            // unrecognized errno: error type & message are left as they were
+            break;
+    }
+
+    return false;
+}
+
+// Creates a TCP stream socket for the requested protocol family.
+//
+// protocol: IPv6Protocol selects AF_INET6; any other value selects AF_INET
+// returns:  true on success (descriptor stored in m_socketDescriptor);
+//           false on failure, with the error type & message set for the
+//           errno values handled below
+bool TcpSocketEngine::nativeCreateSocket(HostAddress::NetworkProtocol protocol)
+{
+
+    // get protocol value for requested protocol type
+    const int protocolNum = ((protocol == HostAddress::IPv6Protocol) ? AF_INET6 : AF_INET);
+
+    // attempt to create socket
+    const int socketFd = socket(protocolNum, SOCK_STREAM, IPPROTO_TCP);
+
+    // socket() reports failure with -1 only; descriptor 0 is a legitimate
+    // result (e.g. when stdin has been closed), so it must not be rejected
+    if (socketFd < 0) {
+
+        // see what error we got
+        switch (errno) {
+            case EPROTONOSUPPORT:
+            case EAFNOSUPPORT:
+            case EINVAL:
+                m_socketError = TcpSocket::UnsupportedSocketOperationError;
+                m_errorString = "protocol not supported";
+                break;
+            case ENFILE:
+            case EMFILE:
+            case ENOBUFS:
+            case ENOMEM:
+                m_socketError = TcpSocket::SocketResourceError;
+                m_errorString = "out of resources";
+                break;
+            case EACCES:
+                m_socketError = TcpSocket::SocketAccessError;
+                m_errorString = "permission denied";
+                break;
+            default:
+                // unrecognized errno: error type & message are left as they were
+                break;
+        }
+
+        // return failure
+        return false;
+    }
+
+    // otherwise, store our socket FD & return success
+    m_socketDescriptor = socketFd;
+    return true;
+}
+
+// Queries the socket (FIONREAD) for the number of bytes ready to be read.
+// returns: the byte count reported by ioctl(), or -1 if ioctl() fails
+int64_t TcpSocketEngine::nativeNumBytesAvailable() const
+{
+    int pendingBytes = 0;
+    const int ioctlResult = ioctl(m_socketDescriptor, FIONREAD, (char*)&pendingBytes);
+    return (ioctlResult < 0) ? -1 : static_cast<int64_t>(pendingBytes);
+}
+
+// Reads up to 'max' bytes from the socket into 'dest'.
+// returns: the value returned by read() - number of bytes read, or -1 on error
+int64_t TcpSocketEngine::nativeRead(char* dest, std::size_t max)
+{
+    // read() returns a signed ssize_t. Holding the result in an unsigned
+    // std::size_t would turn the -1 error value into a large positive count
+    // on platforms where std::size_t is narrower than int64_t.
+    const ssize_t bytesRead = read(m_socketDescriptor, dest, max);
+    return static_cast<int64_t>(bytesRead);
+}
+
+// Waits (via select) until the socket is ready for reading (isRead == true)
+// or for writing (isRead == false).
+// A negative 'msecs' passes no timeout to select(), i.e. blocks until ready.
+// returns: the result of select()
+int TcpSocketEngine::nativeSelect(int msecs, bool isRead) const
+{
+    // descriptor set containing only our socket
+    fd_set descriptorSet;
+    FD_ZERO(&descriptorSet);
+    FD_SET(m_socketDescriptor, &descriptorSet);
+
+    // split the timeout into whole seconds + remaining microseconds
+    timeval timeout;
+    timeout.tv_sec = msecs / 1000;
+    timeout.tv_usec = (msecs % 1000) * 1000;
+    timeval* timeoutPtr = (msecs < 0 ? 0 : &timeout);
+
+    // the set is passed in the read slot or the write slot, never both
+    fd_set* readSet = (isRead ? &descriptorSet : 0);
+    fd_set* writeSet = (isRead ? 0 : &descriptorSet);
+    return select(m_socketDescriptor + 1, readSet, writeSet, 0, timeoutPtr);
+}
+
+// Writes up to 'length' bytes from 'data' to the socket.
+// returns: the value returned by write() - number of bytes written, or -1 on error
+int64_t TcpSocketEngine::nativeWrite(const char* data, std::size_t length)
+{
+    // write() returns a signed ssize_t. Holding the result in an unsigned
+    // std::size_t would turn the -1 error value into a large positive count
+    // on platforms where std::size_t is narrower than int64_t.
+    const ssize_t writtenBytes = write(m_socketDescriptor, data, length);
+    return static_cast<int64_t>(writtenBytes);
+}
diff --git a/src/api/internal/io/TcpSocketEngine_win_p.cpp b/src/api/internal/io/TcpSocketEngine_win_p.cpp
new file mode 100644
index 0000000..6cc257a
--- /dev/null
+++ b/src/api/internal/io/TcpSocketEngine_win_p.cpp
@@ -0,0 +1,242 @@
+// ***************************************************************************
+// TcpSocketEngine_win_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 8 December 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides low-level implementation of TCP I/O for all Windows systems
+// ***************************************************************************
+
+#include "api/internal/io/NetWin_p.h"
+#include "api/internal/io/TcpSocketEngine_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstddef>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+
+// --------------------------------
+// TcpSocketEngine implementation
+// --------------------------------
+
+// Closes the engine's socket handle via closesocket().
+// The result of closesocket() is ignored and m_socketDescriptor is left untouched here.
+void TcpSocketEngine::nativeClose()
+{
+    ::closesocket(m_socketDescriptor);
+}
+
+// Attempts to connect the engine's socket to 'address' on 'port'.
+//
+// returns: true if WSAConnect() succeeds or reports that the socket is already
+//          connected (state becomes ConnectedState); false otherwise (state
+//          becomes UnconnectedState and, for the WSA error codes handled
+//          below, the error type & message are set)
+bool TcpSocketEngine::nativeConnect(const HostAddress& address, const uint16_t port)
+{
+    // storage for both address families; only the one matching the protocol is filled in
+    sockaddr_in ipv4Addr;
+    sockaddr_in6 ipv6Addr;
+    sockaddr* addrPtr = 0;
+    BT_SOCKLEN_T addrLength = 0;
+
+    if (address.GetProtocol() == HostAddress::IPv6Protocol) {
+
+        // IPv6: copy the raw address bytes into the sockaddr
+        memset(&ipv6Addr, 0, sizeof(ipv6Addr));
+        ipv6Addr.sin6_family = AF_INET6;
+        ipv6Addr.sin6_port = htons(port);
+
+        IPv6Address rawIPv6 = address.GetIPv6Address();
+        memcpy(&ipv6Addr.sin6_addr.s6_addr, &rawIPv6, sizeof(rawIPv6));
+
+        addrPtr = (sockaddr*)&ipv6Addr;
+        addrLength = sizeof(ipv6Addr);
+
+    } else if (address.GetProtocol() == HostAddress::IPv4Protocol) {
+
+        // IPv4: address & port are converted to network byte order
+        memset(&ipv4Addr, 0, sizeof(ipv4Addr));
+        ipv4Addr.sin_family = AF_INET;
+        ipv4Addr.sin_port = htons(port);
+        ipv4Addr.sin_addr.s_addr = htonl(address.GetIPv4Address());
+
+        addrPtr = (sockaddr*)&ipv4Addr;
+        addrLength = sizeof(ipv4Addr);
+
+    } else {
+        // unknown protocol (should be unreachable)
+        BT_ASSERT_X(false, "TcpSocketEngine::nativeConnect() : unknown network protocol");
+    }
+
+    // attempt connection; anything other than SOCKET_ERROR means success
+    const int connectResult = WSAConnect(m_socketDescriptor, addrPtr, addrLength, 0, 0, 0, 0);
+    if (connectResult != SOCKET_ERROR) {
+        m_socketState = TcpSocket::ConnectedState;
+        return true;
+    }
+
+    // connection failed: assume 'unconnected', then classify by WSA error code
+    m_socketState = TcpSocket::UnconnectedState;
+
+    const int wsaError = WSAGetLastError();
+    switch (wsaError) {
+
+        case WSANOTINITIALISED:
+            m_socketError = TcpSocket::UnknownSocketError;
+            m_errorString = "Windows socket functionality not properly initialized";
+            break;
+
+        case WSAEISCONN:
+            // socket was already connected - treat as success
+            m_socketState = TcpSocket::ConnectedState;
+            return true;
+
+        case WSAECONNREFUSED:
+        case WSAEINVAL:
+            m_socketError = TcpSocket::ConnectionRefusedError;
+            m_errorString = "connection refused";
+            break;
+
+        case WSAETIMEDOUT:
+            m_socketError = TcpSocket::NetworkError;
+            m_errorString = "connection timed out";
+            break;
+
+        case WSAEHOSTUNREACH:
+            m_socketError = TcpSocket::NetworkError;
+            m_errorString = "host unreachable";
+            break;
+
+        case WSAENETUNREACH:
+            m_socketError = TcpSocket::NetworkError;
+            m_errorString = "network unreachable";
+            break;
+
+        case WSAEADDRINUSE:
+            m_socketError = TcpSocket::SocketResourceError;
+            m_errorString = "address already in use";
+            break;
+
+        case WSAEACCES:
+            m_socketError = TcpSocket::SocketAccessError;
+            m_errorString = "permission denied";
+            break;
+
+        default:
+            // unrecognized code: error type & message are left as they were
+            break;
+    }
+
+    return false;
+}
+
+// Creates an overlapped TCP stream socket for the requested protocol family.
+//
+// protocol: IPv6Protocol selects AF_INET6; any other value selects AF_INET
+// returns:  true on success (handle stored, narrowed to int, in
+//           m_socketDescriptor); false on failure, with error type & message set
+bool TcpSocketEngine::nativeCreateSocket(HostAddress::NetworkProtocol protocol)
+{
+    // map requested protocol onto an address family
+    const int addressFamily = ((protocol == HostAddress::IPv6Protocol) ? AF_INET6 : AF_INET);
+
+    // attempt to create socket
+    SOCKET socketHandle =
+        WSASocket(addressFamily, SOCK_STREAM, IPPROTO_TCP, 0, 0, WSA_FLAG_OVERLAPPED);
+
+    // success: remember the handle & we're done
+    if (socketHandle != INVALID_SOCKET) {
+        m_socketDescriptor = static_cast<int>(socketHandle);
+        return true;
+    }
+
+    // failure: set error type/message depending on WSA error code
+    const int wsaError = WSAGetLastError();
+    switch (wsaError) {
+
+        case WSANOTINITIALISED:
+            m_socketError = TcpSocket::UnknownSocketError;
+            m_errorString = "Windows socket functionality not properly initialized";
+            break;
+
+        case WSAEAFNOSUPPORT:
+        case WSAESOCKTNOSUPPORT:
+        case WSAEPROTOTYPE:
+        case WSAEINVAL:
+            m_socketError = TcpSocket::UnsupportedSocketOperationError;
+            m_errorString = "protocol not supported";
+            break;
+
+        case WSAEMFILE:
+        case WSAENOBUFS:
+            m_socketError = TcpSocket::SocketResourceError;
+            m_errorString = "out of resources";
+            break;
+
+        default: {
+            // anything else is reported with its raw error code
+            std::stringstream codeStream;
+            codeStream << "WSA ErrorCode: " << wsaError;
+            m_socketError = TcpSocket::UnknownSocketError;
+            m_errorString = codeStream.str();
+            break;
+        }
+    }
+
+    return false;
+}
+
+// Queries the socket (FIONREAD via WSAIoctl) for the number of bytes ready to be read.
+// returns: the byte count written to the output buffer, or -1 if WSAIoctl() fails
+// NOTE(review): FIONREAD is documented as producing an unsigned long; the 64-bit
+// output buffer here is zero-initialized first - confirm this is intentional.
+int64_t TcpSocketEngine::nativeNumBytesAvailable() const
+{
+    int64_t pendingBytes = 0;
+    int64_t unusedInput = 0;
+    DWORD outputLength = 0;
+
+    const int ioctlResult =
+        WSAIoctl(m_socketDescriptor, FIONREAD, &unusedInput, sizeof(unusedInput), &pendingBytes,
+                 sizeof(pendingBytes), &outputLength, 0, 0);
+    if (ioctlResult == SOCKET_ERROR) return -1;
+    return pendingBytes;
+}
+
+// Reads up to 'max' bytes from the socket into 'dest' using WSARecv.
+// returns: number of bytes read, or -1 if the socket is invalid or WSARecv fails
+int64_t TcpSocketEngine::nativeRead(char* dest, std::size_t max)
+{
+    // nothing to do without a valid socket
+    if (!IsValid()) return -1;
+
+    // describe the destination buffer for WSA
+    WSABUF recvBuffer;
+    recvBuffer.buf = dest;
+    recvBuffer.len = max;
+
+    // single blocking receive into that one buffer
+    DWORD recvFlags = 0;
+    DWORD receivedCount = 0;
+    if (WSARecv(m_socketDescriptor, &recvBuffer, 1, &receivedCount, &recvFlags, 0, 0) ==
+        SOCKET_ERROR)
+        return -1;
+
+    return static_cast<int64_t>(receivedCount);
+}
+
+// Waits (via select) until the socket is ready for reading (isRead == true)
+// or for writing (isRead == false).
+// A negative 'msecs' passes no timeout to select(), i.e. blocks until ready.
+// returns: the result of select()
+int TcpSocketEngine::nativeSelect(int msecs, bool isRead) const
+{
+    // descriptor set containing only our socket
+    fd_set descriptorSet;
+    FD_ZERO(&descriptorSet);
+    FD_SET(m_socketDescriptor, &descriptorSet);
+
+    // split the timeout into whole seconds + remaining microseconds
+    timeval timeout;
+    timeout.tv_sec = msecs / 1000;
+    timeout.tv_usec = (msecs % 1000) * 1000;
+    timeval* timeoutPtr = (msecs < 0 ? 0 : &timeout);
+
+    // the set is passed in the read slot or the write slot, never both
+    fd_set* readSet = (isRead ? &descriptorSet : 0);
+    fd_set* writeSet = (isRead ? 0 : &descriptorSet);
+    return select(0, readSet, writeSet, 0, timeoutPtr);
+}
+
+// Writes up to 'length' bytes from 'data' to the socket using WSASend.
+// returns: number of bytes written, or -1 if WSASend fails
+int64_t TcpSocketEngine::nativeWrite(const char* data, std::size_t length)
+{
+    // describe the source buffer for WSA (WSABUF wants a non-const pointer)
+    WSABUF sendBuffer;
+    sendBuffer.buf = (char*)data;
+    sendBuffer.len = length;
+
+    // single send of that one buffer
+    DWORD sendFlags = 0;
+    DWORD sentCount = 0;
+    if (WSASend(m_socketDescriptor, &sendBuffer, 1, &sentCount, sendFlags, 0, 0) == SOCKET_ERROR)
+        return -1;
+
+    return static_cast<int64_t>(sentCount);
+}
diff --git a/src/api/internal/io/TcpSocket_p.cpp b/src/api/internal/io/TcpSocket_p.cpp
new file mode 100644
index 0000000..fee7823
--- /dev/null
+++ b/src/api/internal/io/TcpSocket_p.cpp
@@ -0,0 +1,446 @@
+// ***************************************************************************
+// TcpSocket_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 5 January 2012 (DB)
+// ---------------------------------------------------------------------------
+// Provides basic TCP I/O interface
+// ***************************************************************************
+
+#include "api/internal/io/TcpSocket_p.h"
+#include "api/internal/io/ByteArray_p.h"
+#include "api/internal/io/TcpSocketEngine_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <algorithm>
+#include <climits>
+#include <cstddef>
+#include <sstream>
+#include <vector>
+
+// Windows is brain-damaged and pollutes the entire global namespace with
+// its min() macro, which in turn causes MSVC to go haywire on std::min.
+#undef min
+
+// ------------------------------------
+// static utility methods & constants
+// ------------------------------------
+
+namespace BamTools {
+namespace Internal {
+
+// size (64 KiB) used to construct the read buffer and as the growth step
+// when reading a line of unbounded length
+static const std::size_t DEFAULT_BUFFER_SIZE = 64 * 1024;
+
+// same value, as a signed 64-bit integer, for comparison against int64_t read results
+static const int64_t DEFAULT_BUFFER_SIZE64 = static_cast<int64_t>(DEFAULT_BUFFER_SIZE);
+
+} // namespace Internal
+} // namespace BamTools
+
+// --------------------------
+// TcpSocket implementation
+// --------------------------
+
+// Constructs an unconnected socket: device mode is NotOpen, no engine is
+// allocated, no descriptor is cached (-1), and the read buffer is constructed
+// with DEFAULT_BUFFER_SIZE.
+TcpSocket::TcpSocket()
+ : m_mode(IBamIODevice::NotOpen)
+ // , m_localPort(0)
+ , m_remotePort(0)
+ , m_engine(0)
+ , m_cachedSocketDescriptor(-1)
+ , m_readBuffer(DEFAULT_BUFFER_SIZE)
+ , m_error(TcpSocket::NoError)
+ , m_state(TcpSocket::UnconnectedState)
+{}
+
+// Disconnects (if connected) and always releases the socket engine.
+TcpSocket::~TcpSocket()
+{
+    if (m_state == TcpSocket::ConnectedState) DisconnectFromHost();
+
+    // A failed connection attempt leaves an engine allocated while the state
+    // stays 'unconnected', so the disconnect above is skipped. Release the
+    // engine here so it (and its descriptor) is not leaked. This is a no-op
+    // when the engine has already been released.
+    ResetSocketEngine();
+}
+
+// Returns the current size of the internal read buffer (m_readBuffer.Size()).
+std::size_t TcpSocket::BufferBytesAvailable() const
+{
+ return m_readBuffer.Size();
+}
+
+// Returns whether the internal read buffer reports that a line can be read
+// from it (forwards to m_readBuffer.CanReadLine()); does not touch the socket.
+bool TcpSocket::CanReadLine() const
+{
+ return m_readBuffer.CanReadLine();
+}
+
+// Clears the internal read buffer (forwards to m_readBuffer.Clear()).
+void TcpSocket::ClearBuffer()
+{
+ m_readBuffer.Clear();
+}
+
+// Resets connection bookkeeping, then tries each address in 'hostInfo' in turn
+// until one accepts a connection on 'port'.
+//
+// returns: true once connected; false if the socket is already connected, if
+//          the host has no addresses, or if no address could be connected to
+//          (error type & message are set in each failure case)
+bool TcpSocket::ConnectImpl(const HostInfo& hostInfo, const std::string& port,
+                            IBamIODevice::OpenMode mode)
+{
+    // refuse to connect twice
+    if (m_state == TcpSocket::ConnectedState) {
+        m_error = TcpSocket::SocketResourceError;
+        m_errorString = "socket already connected";
+        return false;
+    }
+
+    // start from a clean slate
+    m_hostName = hostInfo.HostName();
+    m_mode = mode;
+    m_state = TcpSocket::UnconnectedState;
+    m_error = TcpSocket::NoError;
+    m_remotePort = 0;
+    m_remoteAddress.Clear();
+    m_readBuffer.Clear();
+
+    // candidate addresses for requested host
+    const std::vector<HostAddress> candidates = hostInfo.Addresses();
+    if (candidates.empty()) {
+        m_error = TcpSocket::HostNotFoundError;
+        m_errorString = "no IP addresses found for host";
+        return false;
+    }
+
+    // parse port string into a number (stays 0 if the text is not numeric)
+    uint16_t portNumber = 0;
+    std::stringstream portStream(port);
+    portStream >> portNumber;
+
+    // try each address until one works
+    for (std::size_t i = 0; i < candidates.size(); ++i) {
+        const HostAddress& candidate = candidates[i];
+
+        // engine setup may fail for a given protocol; that's OK,
+        // just move on to the next address
+        if (!InitializeSocketEngine(candidate.GetProtocol())) continue;
+
+        // same for a failed connection attempt
+        if (!m_engine->Connect(candidate, portNumber)) continue;
+
+        // connected: record the details & report success
+        m_mode = mode;
+        m_remoteAddress = m_engine->GetRemoteAddress();
+        m_remotePort = m_engine->GetRemotePort();
+        m_cachedSocketDescriptor = m_engine->GetSocketDescriptor();
+        m_state = TcpSocket::ConnectedState;
+        return true;
+    }
+
+    // ran out of addresses without connecting
+    m_error = TcpSocket::HostNotFoundError;
+    m_errorString = "could not connect to any host addresses";
+    return false;
+}
+
+// Convenience overload: formats the numeric port as text and forwards to the
+// string-port overload.
+bool TcpSocket::ConnectToHost(const std::string& hostName, uint16_t port,
+                              IBamIODevice::OpenMode mode)
+{
+    std::stringstream portStream;
+    portStream << port;
+    const std::string portText = portStream.str();
+    return ConnectToHost(hostName, portText, mode);
+}
+
+// Resolves 'hostName' to one or more addresses and attempts to connect on 'port'.
+// returns: result of the connection attempt (see ConnectImpl)
+bool TcpSocket::ConnectToHost(const std::string& hostName, const std::string& port,
+                              IBamIODevice::OpenMode mode)
+{
+    // let HostAddress decide whether the name is already a literal IP address
+    HostAddress literalAddress;
+    literalAddress.SetAddress(hostName);
+
+    HostInfo resolved;
+    if (!literalAddress.HasIPAddress()) {
+        // 'plain-text' host name ("www.foo.bar"): look up its IP address(es)
+        resolved = HostInfo::Lookup(hostName, port);
+    } else {
+        // host name was an IP address ("x.x.x.x" or IPv6 format): use it directly
+        resolved.SetAddresses(std::vector<HostAddress>(1, literalAddress));
+    }
+
+    // attempt connection on requested port
+    return ConnectImpl(resolved, port, mode);
+}
+
+// Shuts down the socket engine (if any) and clears all connection details
+// and buffered data.
+void TcpSocket::DisconnectFromHost()
+{
+
+    // Close socket engine & delete. This is done regardless of connection
+    // state: a failed connection attempt can leave an engine allocated while
+    // the state is still 'unconnected', and it must be released too.
+    // (no-op when no engine exists)
+    ResetSocketEngine();
+
+    // reset connection state
+    // m_localPort = 0;
+    m_remotePort = 0;
+    // m_localAddress.Clear();
+    m_remoteAddress.Clear();
+    m_hostName.clear();
+    m_cachedSocketDescriptor = -1;
+
+    // for future, make sure there's outgoing data that needs to be flushed
+    m_readBuffer.Clear();
+}
+
+// Returns the socket's currently stored error code (m_error).
+TcpSocket::SocketError TcpSocket::GetError() const
+{
+ return m_error;
+}
+
+// Returns a copy of the socket's currently stored error message (m_errorString).
+std::string TcpSocket::GetErrorString() const
+{
+ return m_errorString;
+}
+
+// Returns a copy of the stored host name (m_hostName).
+std::string TcpSocket::GetHostName() const
+{
+ return m_hostName;
+}
+
+//HostAddress TcpSocket::GetLocalAddress() const {
+// return m_localAddress;
+//}
+
+//uint16_t TcpSocket::GetLocalPort() const {
+// return m_localPort;
+//}
+
+// Returns a copy of the stored remote address (m_remoteAddress).
+HostAddress TcpSocket::GetRemoteAddress() const
+{
+ return m_remoteAddress;
+}
+
+// Returns the stored remote port (m_remotePort).
+uint16_t TcpSocket::GetRemotePort() const
+{
+ return m_remotePort;
+}
+
+// Returns the socket's stored connection state (m_state).
+TcpSocket::SocketState TcpSocket::GetState() const
+{
+ return m_state;
+}
+
+// Discards any existing engine, allocates a fresh one and initializes it for 'protocol'.
+// returns: result of the engine's Initialize(); the new engine stays allocated
+//          (owned by m_engine) even when initialization fails
+bool TcpSocket::InitializeSocketEngine(HostAddress::NetworkProtocol protocol)
+{
+    ResetSocketEngine();
+
+    TcpSocketEngine* freshEngine = new TcpSocketEngine;
+    m_engine = freshEngine;
+    return freshEngine->Initialize(protocol);
+}
+
+// Returns true only if an engine exists, the engine reports itself valid,
+// and our state is ConnectedState.
+bool TcpSocket::IsConnected() const
+{
+    const bool hasEngine = (m_engine != 0);
+    return hasEngine && m_engine->IsValid() && (m_state == TcpSocket::ConnectedState);
+}
+
+// Copies up to 'numBytes' buffered bytes into 'data', fetching from the socket
+// first if the buffer is empty.
+// Intended to be called in a loop until the desired amount of data has been read.
+// returns: number of bytes read (0 signals EOF), or -1 on error
+int64_t TcpSocket::Read(char* data, const unsigned int numBytes)
+{
+    // only hit the socket when there is nothing buffered
+    if (m_readBuffer.IsEmpty()) {
+
+        // need a socket engine to fetch anything
+        // TODO: set error string/state?
+        if (m_engine == 0) return -1;
+
+        // pull whatever is available into the buffer
+        // TODO: set error string/state ?
+        if (ReadFromSocket() < 0) return -1;
+    }
+
+    // hand back whatever the buffer can provide (possibly nothing)
+    const std::size_t copiedBytes = m_readBuffer.Read(data, numBytes);
+    return static_cast<int64_t>(copiedBytes);
+}
+
+// Waits (up to 5000 ms) for the socket to become readable, then reads all
+// currently available bytes into the read buffer.
+// Dereferences m_engine without a null check.
+// returns: number of bytes read from the socket, or -1 on error (error string is set)
+int64_t TcpSocket::ReadFromSocket()
+{
+    // a dead engine means the connection is gone
+    if (!m_engine->IsValid()) {
+        m_errorString = "TcpSocket::ReadFromSocket - socket disconnected";
+        ResetSocketEngine();
+        return -1;
+    }
+
+    // wait for ready read
+    // TODO: get add'l error info from engine ?
+    bool timedOut;
+    if (!m_engine->WaitForRead(5000, &timedOut)) {
+        m_errorString =
+            timedOut ? "TcpSocket::ReadFromSocket - timed out waiting for ready read"
+                     : "TcpSocket::ReadFromSocket - encountered error while waiting for ready read";
+        return -1;
+    }
+
+    // ask the engine how much data is waiting
+    // TODO: get add'l error info from engine ?
+    const int64_t pendingBytes = m_engine->NumBytesAvailable();
+    if (pendingBytes < 0) {
+        m_errorString =
+            "TcpSocket::ReadFromSocket - encountered error while determining numBytesAvailable";
+        return -1;
+    }
+
+    // reserve that much room in the buffer & read straight into it
+    // NOTE(review): the reserved space is not released here when the read
+    // fails or returns fewer bytes - confirm how the buffer accounts for that
+    char* writePos = m_readBuffer.Reserve(pendingBytes);
+    const int64_t receivedBytes = m_engine->Read(writePos, pendingBytes);
+    if (receivedBytes == -1) {
+        // TODO: get add'l error info from engine ?
+        m_errorString = "TcpSocket::ReadFromSocket - encountered error while reading bytes";
+    }
+
+    // number of bytes actually read (or -1)
+    return receivedBytes;
+}
+
+// Reads one line from the socket and returns it as a string.
+//
+// max: upper bound on the working buffer size (clamped to UINT_MAX);
+//      0 means 'no explicit limit' - the buffer then grows in
+//      DEFAULT_BUFFER_SIZE steps
+// returns: the line contents, or an empty string if nothing was read or an
+//          error occurred
+// NOTE(review): a negative 'max' is cast directly to std::size_t - presumably
+// callers never pass one; confirm.
+std::string TcpSocket::ReadLine(int64_t max)
+{
+
+ // prep result byte buffer
+ ByteArray result;
+ std::size_t bufferMax =
+ ((max > static_cast<int64_t>(UINT_MAX)) ? UINT_MAX : static_cast<std::size_t>(max));
+ result.Resize(bufferMax);
+
+ // read data
+ int64_t readBytes(0);
+ if (result.Size() == 0) {
+
+ // no size requested: allow growth up to UINT_MAX
+ if (bufferMax == 0) bufferMax = UINT_MAX;
+
+ result.Resize(1);
+
+ // grow the buffer by DEFAULT_BUFFER_SIZE and read the next piece directly
+ // after the bytes read so far; keep going only while a piece comes back
+ // exactly DEFAULT_BUFFER_SIZE64 long and does not end in a newline
+ int64_t readResult;
+ do {
+ result.Resize(
+ static_cast<std::size_t>(std::min(bufferMax, result.Size() + DEFAULT_BUFFER_SIZE)));
+ readResult = ReadLine(result.Data() + readBytes, result.Size() - readBytes);
+ // accumulate successful reads; a failure on the very first read is
+ // also recorded, so that readBytes ends up <= 0
+ if (readResult > 0 || readBytes == 0) readBytes += readResult;
+ } while (readResult == DEFAULT_BUFFER_SIZE64 &&
+ result[static_cast<std::size_t>(readBytes - 1)] != '\n');
+
+ } else
+ // explicit size requested: single read into the pre-sized buffer
+ readBytes = ReadLine(result.Data(), result.Size());
+
+ // clean up byte buffer
+ if (readBytes <= 0)
+ result.Clear();
+ else
+ result.Resize(static_cast<std::size_t>(readBytes));
+
+ // return byte buffer as string
+ return std::string(result.ConstData(), result.Size());
+}
+
+// Reads one line from the read buffer into 'dest' (null-terminated), waiting
+// on the socket until a full line is buffered.
+// A trailing "\r\n" is collapsed to "\n".
+//
+// dest: destination buffer of at least 'max' bytes
+// max:  size of 'dest'; must be at least 2 (one byte is kept for the terminator)
+// returns: number of bytes stored (excluding the terminator), or -1 on error
+int64_t TcpSocket::ReadLine(char* dest, std::size_t max)
+{
+    // block until the buffer holds a complete line
+    if (!WaitForReadLine()) {
+        m_errorString = "TcpSocket::ReadLine - error waiting for read line";
+        return -1;
+    }
+
+    // need room for at least one character plus the null terminator
+    if (max < 2) return -1;
+    const std::size_t usableLength = max - 1;
+
+    // pull the line out of the buffer
+    int64_t lineLength = m_readBuffer.ReadLine(dest, usableLength);
+
+    // collapse windows-style '\r\n' into a single '\n'
+    const bool endsWithNewline = (lineLength != 0 && dest[lineLength - 1] == '\n');
+    if (endsWithNewline && lineLength > 1 && dest[lineLength - 2] == '\r') {
+        --lineLength;
+        dest[lineLength - 1] = '\n';
+    }
+
+    // null terminate & return number of bytes read
+    dest[lineLength] = '\0';
+    return lineLength;
+}
+
+// Closes & deletes the socket engine (if one exists), then marks the socket
+// as unconnected with no cached descriptor.
+void TcpSocket::ResetSocketEngine()
+{
+    // shut down socket engine
+    if (m_engine != 0) {
+        TcpSocketEngine* oldEngine = m_engine;
+        oldEngine->Close();
+        delete oldEngine;
+        m_engine = 0;
+    }
+
+    // reset cached socket handle & our state
+    m_cachedSocketDescriptor = -1;
+    m_state = TcpSocket::UnconnectedState;
+}
+
+// Keeps reading from the socket until the read buffer contains a full line.
+// returns: true once a line can be read (immediately, if one is already
+//          buffered); false if there is no socket engine, the socket reports
+//          an error, or no more data arrives
+bool TcpSocket::WaitForReadLine()
+{
+
+    // wait until we can read a line (will return immediately if already capable)
+    while (!CanReadLine()) {
+
+        // ReadFromSocket() dereferences the engine, and a previous failed
+        // read may have released it
+        if (m_engine == 0) return false;
+
+        // ReadFromSocket() returns -1 on error and 0 when nothing was read.
+        // Both must stop the loop: treating -1 as success would spin forever
+        // (or touch a released engine on the next pass).
+        if (ReadFromSocket() <= 0) return false;
+    }
+
+    // if we get here, success
+    return true;
+}
+
+// Single-shot, unbuffered write of 'numBytes' bytes from 'data' to the socket.
+// Exists purely to send 'small' HTTP requests/FTP commands from client to server.
+//
+// Waits up to 3000 ms for the socket to become writable.
+// returns: number of bytes written as reported by the engine, or -1 if there
+//          is no engine or the socket never became writable (error string is set)
+int64_t TcpSocket::Write(const char* data, const unsigned int numBytes)
+{
+
+    // cannot write without a socket engine (never connected, or already reset)
+    if (m_engine == 0) {
+        m_errorString = "TcpSocket::Write - socket not connected";
+        return -1;
+    }
+
+    // wait for our socket to be write-able
+    // (initialized so the check below is well-defined even if the engine
+    // does not set it)
+    bool timedOut = false;
+    const bool isReadyWrite = m_engine->WaitForWrite(3000, &timedOut);
+
+    // if ready, return number of bytes written
+    if (isReadyWrite) return m_engine->Write(data, numBytes);
+
+    // otherwise, socket not ready for writing
+    // set error string depending on reason & return failure
+    if (timedOut) {
+        // TODO: get add'l error info from engine ??
+        m_errorString = "TcpSocket::Write - timed out waiting for ready-write";
+    } else {
+        // TODO: get add'l error info from engine ??
+        m_errorString = "TcpSocket::Write - error encountered while waiting for ready-write";
+    }
+    return -1;
+}
diff --git a/src/api/internal/io/TcpSocket_p.h b/src/api/internal/io/TcpSocket_p.h
new file mode 100644
index 0000000..3ba33c6
--- /dev/null
+++ b/src/api/internal/io/TcpSocket_p.h
@@ -0,0 +1,132 @@
+// ***************************************************************************
+// TcpSocket_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 7 December 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides basic TCP I/O interface
+// ***************************************************************************
+
+#ifndef TCPSOCKET_P_H
+#define TCPSOCKET_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <cstddef>
+#include <string>
+#include "api/IBamIODevice.h"
+#include "api/internal/io/HostInfo_p.h"
+#include "api/internal/io/RollingBuffer_p.h"
+
+namespace BamTools {
+namespace Internal {
+
+class BamHttp;
+class TcpSocketEngine;
+
+// Basic client-side TCP socket with a buffered read side.
+// Platform-specific work is delegated to a TcpSocketEngine held through m_engine.
+// NOTE(review): the class holds a raw engine pointer and declares no copy
+// constructor/assignment - presumably instances are never copied; confirm.
+class TcpSocket
+{
+
+ // enums
+public:
+ // error categories reported by GetError()
+ enum SocketError
+ {
+ NoError = -2,
+ UnknownSocketError = -1,
+ ConnectionRefusedError = 0,
+ RemoteHostClosedError,
+ HostNotFoundError,
+ SocketAccessError,
+ SocketResourceError,
+ SocketTimeoutError,
+ NetworkError,
+ UnsupportedSocketOperationError
+ };
+
+ // connection states reported by GetState()
+ enum SocketState
+ {
+ UnconnectedState = 0,
+ ConnectedState
+ };
+
+ // ctor & dtor
+public:
+ TcpSocket();
+ ~TcpSocket();
+
+ // TcpSocket interface
+public:
+ // connection methods
+ bool ConnectToHost(const std::string& hostName,
+ const uint16_t port, // Connect("host", 80)
+ IBamIODevice::OpenMode mode = IBamIODevice::ReadOnly);
+ bool ConnectToHost(const std::string& hostName,
+ const std::string& port, // Connect("host", "80")
+ IBamIODevice::OpenMode mode = IBamIODevice::ReadOnly);
+ void DisconnectFromHost();
+ bool IsConnected() const;
+
+ // I/O methods
+ std::size_t BufferBytesAvailable() const;
+ bool CanReadLine() const;
+ void ClearBuffer(); // force buffer to clear (not a 'flush', just a 'discard')
+ int64_t Read(char* data, const unsigned int numBytes);
+ std::string ReadLine(int64_t max = 0);
+ int64_t ReadLine(char* dest, std::size_t max);
+ bool WaitForReadLine();
+ int64_t Write(const char* data, const unsigned int numBytes);
+
+ // connection values
+ std::string GetHostName() const;
+ // HostAddress GetLocalAddress() const;
+ // uint16_t GetLocalPort() const;
+ HostAddress GetRemoteAddress() const;
+ uint16_t GetRemotePort() const;
+
+ // connection status
+ TcpSocket::SocketError GetError() const;
+ TcpSocket::SocketState GetState() const;
+ std::string GetErrorString() const;
+
+ // internal methods
+private:
+ bool ConnectImpl(const HostInfo& hostInfo, const std::string& port,
+ IBamIODevice::OpenMode mode);
+ bool InitializeSocketEngine(HostAddress::NetworkProtocol protocol);
+ int64_t ReadFromSocket();
+ void ResetSocketEngine();
+
+ // data members
+private:
+ IBamIODevice::OpenMode m_mode;
+
+ std::string m_hostName;
+ // uint16_t m_localPort;
+ uint16_t m_remotePort;
+ // HostAddress m_localAddress;
+ HostAddress m_remoteAddress;
+
+ // platform-specific socket implementation
+ TcpSocketEngine* m_engine;
+ int m_cachedSocketDescriptor;
+
+ // holds data that has been read but not yet consumed
+ RollingBuffer m_readBuffer;
+
+ TcpSocket::SocketError m_error;
+ TcpSocket::SocketState m_state;
+ std::string m_errorString;
+
+ // BamHttp is granted access to the private members above
+ friend class BamHttp;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // TCPSOCKET_P_H
diff --git a/src/api/internal/sam/CMakeLists.txt b/src/api/internal/sam/CMakeLists.txt
new file mode 100644
index 0000000..2f303bd
--- /dev/null
+++ b/src/api/internal/sam/CMakeLists.txt
@@ -0,0 +1,17 @@
+# ==========================
+# BamTools CMakeLists.txt
+# (c) 2011 Derek Barnett
+#
+# src/api/internal/sam
+# ==========================
+
+set( InternalSamDir "${InternalDir}/sam" )
+
+set( InternalSamSources
+ ${InternalSamDir}/SamFormatParser_p.cpp
+ ${InternalSamDir}/SamFormatPrinter_p.cpp
+ ${InternalSamDir}/SamHeaderValidator_p.cpp
+
+ PARENT_SCOPE # <-- leave this last
+)
+
diff --git a/src/api/internal/sam/SamFormatParser_p.cpp b/src/api/internal/sam/SamFormatParser_p.cpp
new file mode 100644
index 0000000..2370e26
--- /dev/null
+++ b/src/api/internal/sam/SamFormatParser_p.cpp
@@ -0,0 +1,263 @@
+// ***************************************************************************
+// SamFormatParser.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 8 December 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for parsing SAM header text into SamHeader object
+// ***************************************************************************
+
+#include "api/internal/sam/SamFormatParser_p.h"
+#include "api/SamConstants.h"
+#include "api/SamHeader.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+// Binds the parser to the SamHeader that Parse() will fill in.
+// Only the reference is stored; the header object itself is not copied.
+SamFormatParser::SamFormatParser(SamHeader& header)
+ : m_header(header)
+{}
+
+// Empty destructor body: no explicit cleanup is performed here.
+SamFormatParser::~SamFormatParser() {}
+
+// Replaces the contents of the bound header with the records found in
+// 'headerText'. The header is cleared first, so empty text simply leaves an
+// empty header behind. Each newline-delimited line is handed to ParseSamLine().
+void SamFormatParser::Parse(const std::string& headerText)
+{
+    // always start from a clean header, even when there is nothing to parse
+    m_header.Clear();
+
+    // empty text is acceptable; otherwise feed the parser one line at a time
+    if (!headerText.empty()) {
+        std::istringstream input(headerText);
+        for (std::string samLine; std::getline(input, samLine);)
+            ParseSamLine(samLine);
+    }
+}
+
+// Dispatches one header line to the parser for its record type.
+// The first three characters select the record type and the payload starts at
+// index 4 (index 3 is skipped - presumably the separator following the token).
+// Lines shorter than five characters, and lines whose leading token is not
+// recognized, are ignored.
+void SamFormatParser::ParseSamLine(const std::string& line)
+{
+    // too short to hold a record token plus any data
+    if (line.length() < 5) return;
+
+    const std::string recordType = line.substr(0, 3);
+    const std::string payload = line.substr(4);
+
+    if (recordType == Constants::SAM_HD_BEGIN_TOKEN) {
+        ParseHDLine(payload);
+        return;
+    }
+    if (recordType == Constants::SAM_SQ_BEGIN_TOKEN) {
+        ParseSQLine(payload);
+        return;
+    }
+    if (recordType == Constants::SAM_RG_BEGIN_TOKEN) {
+        ParseRGLine(payload);
+        return;
+    }
+    if (recordType == Constants::SAM_PG_BEGIN_TOKEN) {
+        ParsePGLine(payload);
+        return;
+    }
+    if (recordType == Constants::SAM_CO_BEGIN_TOKEN) {
+        ParseCOLine(payload);
+        return;
+    }
+}
+
+// Parses the fields of an @HD line and stores them in the bound header.
+//
+// 'line' is the record payload (everything after the record token), made of
+// tab-delimited "TG:value" fields. VN, SO and GO go to their dedicated header
+// members; any other tag is appended to the header's custom tags.
+// Fields too short to contain a tag and separator are ignored.
+//
+// Throws BamException if no version (VN) was found.
+void SamFormatParser::ParseHDLine(const std::string& line)
+{
+
+    // split HD line into tokens
+    const std::vector<std::string> tokens = Split(line, Constants::SAM_TAB);
+
+    // iterate over tokens
+    std::vector<std::string>::const_iterator tokenIter = tokens.begin();
+    std::vector<std::string>::const_iterator tokenEnd = tokens.end();
+    for (; tokenIter != tokenEnd; ++tokenIter) {
+        const std::string& token = (*tokenIter);
+
+        // a field shorter than "TG:" carries no value, and substr(3) would throw
+        // std::out_of_range on it (e.g. the empty token left by two consecutive
+        // tabs) - skip it; a missing required tag is still reported below
+        if (token.size() < 3) continue;
+
+        // get tag/value
+        const std::string tokenTag = token.substr(0, 2);
+        const std::string tokenValue = token.substr(3);
+
+        // set header contents
+        if (tokenTag == Constants::SAM_HD_VERSION_TAG)
+            m_header.Version = tokenValue;
+        else if (tokenTag == Constants::SAM_HD_SORTORDER_TAG)
+            m_header.SortOrder = tokenValue;
+        else if (tokenTag == Constants::SAM_HD_GROUPORDER_TAG)
+            m_header.GroupOrder = tokenValue;
+        else {  // custom tag
+            CustomHeaderTag otherTag;
+            otherTag.TagName = tokenTag;
+            otherTag.TagValue = tokenValue;
+            m_header.CustomTags.push_back(otherTag);
+        }
+    }
+
+    // check for required tags
+    if (!m_header.HasVersion())
+        throw BamException("SamFormatParser::ParseHDLine", "@HD line is missing VN tag");
+}
+
+// Parses the fields of an @SQ line into a SamSequence and adds it to the
+// header's sequence dictionary.
+//
+// 'line' is the record payload (everything after the record token), made of
+// tab-delimited "TG:value" fields. Tags without a dedicated SamSequence member
+// are kept as custom tags. Fields too short to contain a tag and separator are
+// ignored.
+//
+// Throws BamException if the name (SN) or the length (LN) is missing.
+void SamFormatParser::ParseSQLine(const std::string& line)
+{
+
+    SamSequence seq;
+
+    // split SQ line into tokens
+    const std::vector<std::string> tokens = Split(line, Constants::SAM_TAB);
+
+    // iterate over tokens
+    std::vector<std::string>::const_iterator tokenIter = tokens.begin();
+    std::vector<std::string>::const_iterator tokenEnd = tokens.end();
+    for (; tokenIter != tokenEnd; ++tokenIter) {
+        const std::string& token = (*tokenIter);
+
+        // a field shorter than "TG:" carries no value, and substr(3) would throw
+        // std::out_of_range on it (e.g. the empty token left by two consecutive
+        // tabs) - skip it; missing required tags are still reported below
+        if (token.size() < 3) continue;
+
+        // get tag/value
+        const std::string tokenTag = token.substr(0, 2);
+        const std::string tokenValue = token.substr(3);
+
+        // set sequence contents
+        if (tokenTag == Constants::SAM_SQ_NAME_TAG)
+            seq.Name = tokenValue;
+        else if (tokenTag == Constants::SAM_SQ_LENGTH_TAG)
+            seq.Length = tokenValue;
+        else if (tokenTag == Constants::SAM_SQ_ASSEMBLYID_TAG)
+            seq.AssemblyID = tokenValue;
+        else if (tokenTag == Constants::SAM_SQ_CHECKSUM_TAG)
+            seq.Checksum = tokenValue;
+        else if (tokenTag == Constants::SAM_SQ_SPECIES_TAG)
+            seq.Species = tokenValue;
+        else if (tokenTag == Constants::SAM_SQ_URI_TAG)
+            seq.URI = tokenValue;
+        else {  // custom tag
+            CustomHeaderTag otherTag;
+            otherTag.TagName = tokenTag;
+            otherTag.TagValue = tokenValue;
+            seq.CustomTags.push_back(otherTag);
+        }
+    }
+
+    // check for required tags
+    if (!seq.HasName())
+        throw BamException("SamFormatParser::ParseSQLine", "@SQ line is missing SN tag");
+    if (!seq.HasLength())
+        throw BamException("SamFormatParser::ParseSQLine", "@SQ line is missing LN tag");
+
+    // store SAM sequence entry
+    m_header.Sequences.Add(seq);
+}
+
+// Parses the fields of an @RG line into a SamReadGroup and adds it to the
+// header's read group dictionary.
+//
+// 'line' is the record payload (everything after the record token), made of
+// tab-delimited "TG:value" fields. Tags without a dedicated SamReadGroup member
+// are kept as custom tags. Fields too short to contain a tag and separator are
+// ignored.
+//
+// Throws BamException if the read group ID is missing.
+void SamFormatParser::ParseRGLine(const std::string& line)
+{
+
+    SamReadGroup rg;
+
+    // split string into tokens
+    const std::vector<std::string> tokens = Split(line, Constants::SAM_TAB);
+
+    // iterate over tokens
+    std::vector<std::string>::const_iterator tokenIter = tokens.begin();
+    std::vector<std::string>::const_iterator tokenEnd = tokens.end();
+    for (; tokenIter != tokenEnd; ++tokenIter) {
+        const std::string& token = (*tokenIter);
+
+        // a field shorter than "TG:" carries no value, and substr(3) would throw
+        // std::out_of_range on it (e.g. the empty token left by two consecutive
+        // tabs) - skip it; a missing required tag is still reported below
+        if (token.size() < 3) continue;
+
+        // get token tag/value
+        const std::string tokenTag = token.substr(0, 2);
+        const std::string tokenValue = token.substr(3);
+
+        // set read group contents
+        if (tokenTag == Constants::SAM_RG_ID_TAG)
+            rg.ID = tokenValue;
+        else if (tokenTag == Constants::SAM_RG_DESCRIPTION_TAG)
+            rg.Description = tokenValue;
+        else if (tokenTag == Constants::SAM_RG_FLOWORDER_TAG)
+            rg.FlowOrder = tokenValue;
+        else if (tokenTag == Constants::SAM_RG_KEYSEQUENCE_TAG)
+            rg.KeySequence = tokenValue;
+        else if (tokenTag == Constants::SAM_RG_LIBRARY_TAG)
+            rg.Library = tokenValue;
+        else if (tokenTag == Constants::SAM_RG_PLATFORMUNIT_TAG)
+            rg.PlatformUnit = tokenValue;
+        else if (tokenTag == Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG)
+            rg.PredictedInsertSize = tokenValue;
+        else if (tokenTag == Constants::SAM_RG_PRODUCTIONDATE_TAG)
+            rg.ProductionDate = tokenValue;
+        else if (tokenTag == Constants::SAM_RG_PROGRAM_TAG)
+            rg.Program = tokenValue;
+        else if (tokenTag == Constants::SAM_RG_SAMPLE_TAG)
+            rg.Sample = tokenValue;
+        else if (tokenTag == Constants::SAM_RG_SEQCENTER_TAG)
+            rg.SequencingCenter = tokenValue;
+        else if (tokenTag == Constants::SAM_RG_SEQTECHNOLOGY_TAG)
+            rg.SequencingTechnology = tokenValue;
+        else {  // custom tag
+            CustomHeaderTag otherTag;
+            otherTag.TagName = tokenTag;
+            otherTag.TagValue = tokenValue;
+            rg.CustomTags.push_back(otherTag);
+        }
+    }
+
+    // check for required tags
+    if (!rg.HasID())
+        throw BamException("SamFormatParser::ParseRGLine", "@RG line is missing ID tag");
+
+    // store SAM read group entry
+    m_header.ReadGroups.Add(rg);
+}
+
+// Parses the fields of an @PG line into a SamProgram and adds it to the
+// header's program records.
+//
+// 'line' is the record payload (everything after the record token), made of
+// tab-delimited "TG:value" fields. Tags without a dedicated SamProgram member
+// are kept as custom tags. Fields too short to contain a tag and separator are
+// ignored.
+//
+// Throws BamException if the program ID is missing.
+void SamFormatParser::ParsePGLine(const std::string& line)
+{
+
+    SamProgram pg;
+
+    // split string into tokens
+    const std::vector<std::string> tokens = Split(line, Constants::SAM_TAB);
+
+    // iterate over tokens
+    std::vector<std::string>::const_iterator tokenIter = tokens.begin();
+    std::vector<std::string>::const_iterator tokenEnd = tokens.end();
+    for (; tokenIter != tokenEnd; ++tokenIter) {
+        const std::string& token = (*tokenIter);
+
+        // a field shorter than "TG:" carries no value, and substr(3) would throw
+        // std::out_of_range on it (e.g. the empty token left by two consecutive
+        // tabs) - skip it; a missing required tag is still reported below
+        if (token.size() < 3) continue;
+
+        // get token tag/value
+        const std::string tokenTag = token.substr(0, 2);
+        const std::string tokenValue = token.substr(3);
+
+        // set program record contents
+        if (tokenTag == Constants::SAM_PG_ID_TAG)
+            pg.ID = tokenValue;
+        else if (tokenTag == Constants::SAM_PG_NAME_TAG)
+            pg.Name = tokenValue;
+        else if (tokenTag == Constants::SAM_PG_COMMANDLINE_TAG)
+            pg.CommandLine = tokenValue;
+        else if (tokenTag == Constants::SAM_PG_PREVIOUSPROGRAM_TAG)
+            pg.PreviousProgramID = tokenValue;
+        else if (tokenTag == Constants::SAM_PG_VERSION_TAG)
+            pg.Version = tokenValue;
+        else {  // custom tag
+            CustomHeaderTag otherTag;
+            otherTag.TagName = tokenTag;
+            otherTag.TagValue = tokenValue;
+            pg.CustomTags.push_back(otherTag);
+        }
+    }
+
+    // check for required tags
+    if (!pg.HasID())
+        throw BamException("SamFormatParser::ParsePGLine", "@PG line is missing ID tag");
+
+    // store SAM program entry
+    m_header.Programs.Add(pg);
+}
+
+// Handles a comment record: the text passed in is appended unchanged as one
+// entry of the header's comment list; no tag parsing takes place.
+void SamFormatParser::ParseCOLine(const std::string& line)
+{
+ // simply add line to comments list
+ m_header.Comments.push_back(line);
+}
+
+// Breaks 'line' into the fields separated by 'delim', in order of appearance.
+// Adjacent delimiters yield an empty field between them; a delimiter that ends
+// the line does not add a trailing empty field, and an empty line produces no
+// fields at all.
+const std::vector<std::string> SamFormatParser::Split(const std::string& line, const char delim)
+{
+    std::vector<std::string> fields;
+
+    std::string::size_type fieldStart = 0;
+    while (fieldStart < line.size()) {
+        const std::string::size_type fieldStop = line.find(delim, fieldStart);
+
+        // last field: runs to the end of the line
+        if (fieldStop == std::string::npos) {
+            fields.push_back(line.substr(fieldStart));
+            break;
+        }
+
+        fields.push_back(line.substr(fieldStart, fieldStop - fieldStart));
+        fieldStart = fieldStop + 1;
+    }
+
+    return fields;
+}
diff --git a/src/api/internal/sam/SamFormatParser_p.h b/src/api/internal/sam/SamFormatParser_p.h
new file mode 100644
index 0000000..39bd44a
--- /dev/null
+++ b/src/api/internal/sam/SamFormatParser_p.h
@@ -0,0 +1,62 @@
+// ***************************************************************************
+// SamFormatParser.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for parsing SAM header text into SamHeader object
+// ***************************************************************************
+
+#ifndef SAM_FORMAT_PARSER_H
+#define SAM_FORMAT_PARSER_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+struct SamHeader;
+
+namespace Internal {
+
+// Fills a SamHeader from SAM-formatted header text.
+// The header is held by reference, not copied.
+class SamFormatParser
+{
+public:
+    // ctor & dtor
+    SamFormatParser(BamTools::SamHeader& header);
+    ~SamFormatParser();
+
+    // parse text & populate header data
+    void Parse(const std::string& headerText);
+
+private:
+    // line dispatcher and one parser per record type
+    void ParseSamLine(const std::string& line);
+    void ParseHDLine(const std::string& line);
+    void ParseSQLine(const std::string& line);
+    void ParseRGLine(const std::string& line);
+    void ParsePGLine(const std::string& line);
+    void ParseCOLine(const std::string& line);
+
+    // splits a line into its delimiter-separated fields
+    const std::vector<std::string> Split(const std::string& line, const char delim);
+
+    // header being populated
+    SamHeader& m_header;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // SAM_FORMAT_PARSER_H
diff --git a/src/api/internal/sam/SamFormatPrinter_p.cpp b/src/api/internal/sam/SamFormatPrinter_p.cpp
new file mode 100644
index 0000000..2b93a04
--- /dev/null
+++ b/src/api/internal/sam/SamFormatPrinter_p.cpp
@@ -0,0 +1,240 @@
+// ***************************************************************************
+// SamFormatPrinter.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for printing formatted SAM header to string
+// ***************************************************************************
+
+#include "api/internal/sam/SamFormatPrinter_p.h"
+#include "api/SamConstants.h"
+#include "api/SamHeader.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstddef>
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+// ------------------------
+// static utility methods
+// ------------------------
+
+// Builds one header field: the tab separator, the tag, a colon, then the value.
+static inline const std::string FormatTag(const std::string& tag, const std::string& value)
+{
+    std::string field;
+    field += Constants::SAM_TAB;
+    field += tag;
+    field += Constants::SAM_COLON;
+    field += value;
+    return field;
+}
+
+// ---------------------------------
+// SamFormatPrinter implementation
+// ---------------------------------
+
+// Binds the printer to the SamHeader it will render.
+// Only a const reference is stored; the header object itself is not copied.
+SamFormatPrinter::SamFormatPrinter(const SamHeader& header)
+ : m_header(header)
+{}
+
+// Empty destructor body: no explicit cleanup is performed here.
+SamFormatPrinter::~SamFormatPrinter() {}
+
+// Returns the bound header as SAM-formatted text.
+// Records are emitted in a fixed order: @HD, @SQ, @RG, @PG, then @CO.
+const std::string SamFormatPrinter::ToString() const
+{
+    // fresh buffer that collects every record
+    std::stringstream text;
+
+    PrintHD(text);
+    PrintSQ(text);
+    PrintRG(text);
+    PrintPG(text);
+    PrintCO(text);
+
+    return text.str();
+}
+
+// Writes the @HD record to 'out'. Nothing is written when the header has no
+// version. Field order: VN, then SO and GO when present, then custom tags.
+void SamFormatPrinter::PrintHD(std::stringstream& out) const
+{
+    // no version, no @HD line
+    if (!m_header.HasVersion()) return;
+
+    // @HD VN:<Version>
+    out << Constants::SAM_HD_BEGIN_TOKEN;
+    out << FormatTag(Constants::SAM_HD_VERSION_TAG, m_header.Version);
+
+    // SO:<SortOrder>
+    if (m_header.HasSortOrder()) {
+        out << FormatTag(Constants::SAM_HD_SORTORDER_TAG, m_header.SortOrder);
+    }
+
+    // GO:<GroupOrder>
+    if (m_header.HasGroupOrder()) {
+        out << FormatTag(Constants::SAM_HD_GROUPORDER_TAG, m_header.GroupOrder);
+    }
+
+    // custom tags, in stored order
+    const std::size_t customCount = m_header.CustomTags.size();
+    for (std::size_t tagIndex = 0; tagIndex < customCount; ++tagIndex) {
+        const CustomHeaderTag& extra = m_header.CustomTags[tagIndex];
+        out << FormatTag(extra.TagName, extra.TagValue);
+    }
+
+    // end of record
+    out << std::endl;
+}
+
+// Writes one @SQ record per sequence in the header to 'out'.
+// Field order: SN and LN (always), then AS, M5, SP and UR when present,
+// then custom tags.
+void SamFormatPrinter::PrintSQ(std::stringstream& out) const
+{
+    SamSequenceConstIterator current = m_header.Sequences.ConstBegin();
+    SamSequenceConstIterator last = m_header.Sequences.ConstEnd();
+    while (current != last) {
+        const SamSequence& entry = (*current);
+
+        // @SQ SN:<Name> LN:<Length>
+        out << Constants::SAM_SQ_BEGIN_TOKEN;
+        out << FormatTag(Constants::SAM_SQ_NAME_TAG, entry.Name);
+        out << FormatTag(Constants::SAM_SQ_LENGTH_TAG, entry.Length);
+
+        // AS:<AssemblyID>
+        if (entry.HasAssemblyID()) {
+            out << FormatTag(Constants::SAM_SQ_ASSEMBLYID_TAG, entry.AssemblyID);
+        }
+
+        // M5:<Checksum>
+        if (entry.HasChecksum()) {
+            out << FormatTag(Constants::SAM_SQ_CHECKSUM_TAG, entry.Checksum);
+        }
+
+        // SP:<Species>
+        if (entry.HasSpecies()) {
+            out << FormatTag(Constants::SAM_SQ_SPECIES_TAG, entry.Species);
+        }
+
+        // UR:<URI>
+        if (entry.HasURI()) {
+            out << FormatTag(Constants::SAM_SQ_URI_TAG, entry.URI);
+        }
+
+        // custom tags, in stored order
+        const std::size_t customCount = entry.CustomTags.size();
+        for (std::size_t tagIndex = 0; tagIndex < customCount; ++tagIndex) {
+            const CustomHeaderTag& extra = entry.CustomTags[tagIndex];
+            out << FormatTag(extra.TagName, extra.TagValue);
+        }
+
+        // end of record
+        out << std::endl;
+
+        ++current;
+    }
+}
+
+// Writes one @RG record per read group in the header to 'out'.
+// Field order: ID (always), then CN, DS, DT, FO, KS, LB, PG, PI, PL, PU and SM
+// when present, then custom tags.
+void SamFormatPrinter::PrintRG(std::stringstream& out) const
+{
+    SamReadGroupConstIterator current = m_header.ReadGroups.ConstBegin();
+    SamReadGroupConstIterator last = m_header.ReadGroups.ConstEnd();
+    while (current != last) {
+        const SamReadGroup& group = (*current);
+
+        // @RG ID:<ID>
+        out << Constants::SAM_RG_BEGIN_TOKEN;
+        out << FormatTag(Constants::SAM_RG_ID_TAG, group.ID);
+
+        // CN:<SequencingCenter>
+        if (group.HasSequencingCenter()) {
+            out << FormatTag(Constants::SAM_RG_SEQCENTER_TAG, group.SequencingCenter);
+        }
+
+        // DS:<Description>
+        if (group.HasDescription()) {
+            out << FormatTag(Constants::SAM_RG_DESCRIPTION_TAG, group.Description);
+        }
+
+        // DT:<ProductionDate>
+        if (group.HasProductionDate()) {
+            out << FormatTag(Constants::SAM_RG_PRODUCTIONDATE_TAG, group.ProductionDate);
+        }
+
+        // FO:<FlowOrder>
+        if (group.HasFlowOrder()) {
+            out << FormatTag(Constants::SAM_RG_FLOWORDER_TAG, group.FlowOrder);
+        }
+
+        // KS:<KeySequence>
+        if (group.HasKeySequence()) {
+            out << FormatTag(Constants::SAM_RG_KEYSEQUENCE_TAG, group.KeySequence);
+        }
+
+        // LB:<Library>
+        if (group.HasLibrary()) {
+            out << FormatTag(Constants::SAM_RG_LIBRARY_TAG, group.Library);
+        }
+
+        // PG:<Program>
+        if (group.HasProgram()) {
+            out << FormatTag(Constants::SAM_RG_PROGRAM_TAG, group.Program);
+        }
+
+        // PI:<PredictedInsertSize>
+        if (group.HasPredictedInsertSize()) {
+            out << FormatTag(Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG, group.PredictedInsertSize);
+        }
+
+        // PL:<SequencingTechnology>
+        if (group.HasSequencingTechnology()) {
+            out << FormatTag(Constants::SAM_RG_SEQTECHNOLOGY_TAG, group.SequencingTechnology);
+        }
+
+        // PU:<PlatformUnit>
+        if (group.HasPlatformUnit()) {
+            out << FormatTag(Constants::SAM_RG_PLATFORMUNIT_TAG, group.PlatformUnit);
+        }
+
+        // SM:<Sample>
+        if (group.HasSample()) {
+            out << FormatTag(Constants::SAM_RG_SAMPLE_TAG, group.Sample);
+        }
+
+        // custom tags, in stored order
+        const std::size_t customCount = group.CustomTags.size();
+        for (std::size_t tagIndex = 0; tagIndex < customCount; ++tagIndex) {
+            const CustomHeaderTag& extra = group.CustomTags[tagIndex];
+            out << FormatTag(extra.TagName, extra.TagValue);
+        }
+
+        // end of record
+        out << std::endl;
+
+        ++current;
+    }
+}
+
+// Writes one @PG record per program entry in the header to 'out'.
+// Field order: ID (always), then PN, CL, PP and VN when present,
+// then custom tags.
+void SamFormatPrinter::PrintPG(std::stringstream& out) const
+{
+    SamProgramConstIterator current = m_header.Programs.ConstBegin();
+    SamProgramConstIterator last = m_header.Programs.ConstEnd();
+    while (current != last) {
+        const SamProgram& program = (*current);
+
+        // @PG ID:<ID>
+        out << Constants::SAM_PG_BEGIN_TOKEN;
+        out << FormatTag(Constants::SAM_PG_ID_TAG, program.ID);
+
+        // PN:<Name>
+        if (program.HasName()) {
+            out << FormatTag(Constants::SAM_PG_NAME_TAG, program.Name);
+        }
+
+        // CL:<CommandLine>
+        if (program.HasCommandLine()) {
+            out << FormatTag(Constants::SAM_PG_COMMANDLINE_TAG, program.CommandLine);
+        }
+
+        // PP:<PreviousProgramID>
+        if (program.HasPreviousProgramID()) {
+            out << FormatTag(Constants::SAM_PG_PREVIOUSPROGRAM_TAG, program.PreviousProgramID);
+        }
+
+        // VN:<Version>
+        if (program.HasVersion()) {
+            out << FormatTag(Constants::SAM_PG_VERSION_TAG, program.Version);
+        }
+
+        // custom tags, in stored order
+        const std::size_t customCount = program.CustomTags.size();
+        for (std::size_t tagIndex = 0; tagIndex < customCount; ++tagIndex) {
+            const CustomHeaderTag& extra = program.CustomTags[tagIndex];
+            out << FormatTag(extra.TagName, extra.TagValue);
+        }
+
+        // end of record
+        out << std::endl;
+
+        ++current;
+    }
+}
+
+// Writes one @CO record per stored comment to 'out':
+// the record token, a tab, the comment text, then a line break.
+void SamFormatPrinter::PrintCO(std::stringstream& out) const
+{
+    const std::vector<std::string>& comments = m_header.Comments;
+    for (std::size_t index = 0; index < comments.size(); ++index) {
+
+        // @CO <Comment>
+        out << Constants::SAM_CO_BEGIN_TOKEN << Constants::SAM_TAB << comments[index]
+            << std::endl;
+    }
+}
diff --git a/src/api/internal/sam/SamFormatPrinter_p.h b/src/api/internal/sam/SamFormatPrinter_p.h
new file mode 100644
index 0000000..b43e5a2
--- /dev/null
+++ b/src/api/internal/sam/SamFormatPrinter_p.h
@@ -0,0 +1,60 @@
+// ***************************************************************************
+// SamFormatPrinter.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 6 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for printing formatted SAM header to string
+// ***************************************************************************
+
+#ifndef SAM_FORMAT_PRINTER_H
+#define SAM_FORMAT_PRINTER_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <sstream>
+#include <string>
+
+namespace BamTools {
+
+struct SamHeader;
+
+namespace Internal {
+
+// Renders the contents of a SamHeader as SAM-formatted header text.
+// The header is held by const reference, not copied.
+class SamFormatPrinter
+{
+public:
+    // ctor & dtor
+    SamFormatPrinter(const BamTools::SamHeader& header);
+    ~SamFormatPrinter();
+
+    // generates SAM-formatted string from header data
+    const std::string ToString() const;
+
+private:
+    // one writer per record type; each appends its lines to 'out'
+    void PrintHD(std::stringstream& out) const;
+    void PrintSQ(std::stringstream& out) const;
+    void PrintRG(std::stringstream& out) const;
+    void PrintPG(std::stringstream& out) const;
+    void PrintCO(std::stringstream& out) const;
+
+    // header being printed
+    const SamHeader& m_header;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // SAM_FORMAT_PRINTER_H
diff --git a/src/api/internal/sam/SamHeaderValidator_p.cpp b/src/api/internal/sam/SamHeaderValidator_p.cpp
new file mode 100644
index 0000000..10320b1
--- /dev/null
+++ b/src/api/internal/sam/SamHeaderValidator_p.cpp
@@ -0,0 +1,536 @@
+// ***************************************************************************
+// SamHeaderValidator.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for validating SamHeader data
+// ***************************************************************************
+
+#include "api/internal/sam/SamHeaderValidator_p.h"
+#include "api/SamConstants.h"
+#include "api/SamHeader.h"
+#include "api/internal/sam/SamHeaderVersion_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cctype>
+#include <cstddef>
+#include <set>
+#include <sstream>
+
+// ------------------------
+// static utility methods
+// -------------------------
+
+// Returns true when 'lhs' and 'rhs' have the same length and every pair of
+// corresponding characters is equal after upper-casing with std::toupper.
+// Two empty strings compare equal.
+static bool caseInsensitiveCompare(const std::string& lhs, const std::string& rhs)
+{
+
+    // can omit checking chars if lengths not equal
+    const std::size_t length = lhs.length();
+    if (length != rhs.length()) return false;
+
+    // do *basic* toupper checks on each string char's
+    for (std::size_t i = 0; i < length; ++i) {
+
+        // std::toupper is only defined for values representable as unsigned char
+        // (or EOF); go through unsigned char so that chars with the high bit set
+        // are not passed as negative values
+        const int lhsUpper = std::toupper(static_cast<unsigned char>(lhs[i]));
+        const int rhsUpper = std::toupper(static_cast<unsigned char>(rhs[i]));
+        if (lhsUpper != rhsUpper) return false;
+    }
+
+    // otherwise OK
+    return true;
+}
+
+// ------------------------------------------------------------------------
+// Allow validation rules to vary, as needed, between SAM header versions
+//
+// Use the SAM_VERSION_X_Y constants below to tag important changes.
+//
+// Together, they will allow for comparisons like:
+//
+//     if ( m_version < SAM_VERSION_2_0 ) {
+//         // use some older rule
+//     } else {
+//         // use rule introduced with version 2.0
+//     }
+
+static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1, 0);
+static const SamHeaderVersion SAM_VERSION_1_1 = SamHeaderVersion(1, 1);
+static const SamHeaderVersion SAM_VERSION_1_2 = SamHeaderVersion(1, 2);
+static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1, 3);
+static const SamHeaderVersion SAM_VERSION_1_4 = SamHeaderVersion(1, 4);
+
+// TODO: This functionality is currently unused.
+// Make validation "version-aware."
+//
+// ------------------------------------------------------------------------
+
+// Text pieces used by AddError() / AddWarning() when building a stored message:
+// a severity prefix in front, a single line break at the end.
+const std::string SamHeaderValidator::ERROR_PREFIX = "ERROR: ";
+const std::string SamHeaderValidator::WARN_PREFIX = "WARNING: ";
+const std::string SamHeaderValidator::NEWLINE(1, '\n');
+
+// Binds the validator to the SamHeader it will inspect.
+// Only a const reference is stored; the header object itself is not copied.
+SamHeaderValidator::SamHeaderValidator(const SamHeader& header)
+ : m_header(header)
+{}
+
+// Empty destructor body: no explicit cleanup is performed here.
+SamHeaderValidator::~SamHeaderValidator() {}
+
+// Stores 'message' in the error list, wrapped as ERROR_PREFIX + message + NEWLINE.
+void SamHeaderValidator::AddError(const std::string& message)
+{
+    std::string entry(ERROR_PREFIX);
+    entry += message;
+    entry += NEWLINE;
+    m_errorMessages.push_back(entry);
+}
+
+// Stores 'message' in the warning list, wrapped as WARN_PREFIX + message + NEWLINE.
+void SamHeaderValidator::AddWarning(const std::string& message)
+{
+    std::string entry(WARN_PREFIX);
+    entry += message;
+    entry += NEWLINE;
+    m_warningMessages.push_back(entry);
+}
+
+// Writes a summary line followed by every stored error message to 'stream'.
+// Writes nothing when no errors have been recorded.
+void SamHeaderValidator::PrintErrorMessages(std::ostream& stream)
+{
+    const std::size_t errorCount = m_errorMessages.size();
+    if (errorCount != 0) {
+
+        // summary line
+        stream << "* SAM header has " << errorCount << " errors:" << std::endl;
+
+        // messages already end with a line break, so none is added here
+        for (std::size_t index = 0; index < errorCount; ++index)
+            stream << m_errorMessages[index];
+    }
+}
+
+// Writes all stored messages to 'stream': errors first, then warnings.
+void SamHeaderValidator::PrintMessages(std::ostream& stream)
+{
+ PrintErrorMessages(stream);
+ PrintWarningMessages(stream);
+}
+
+// Writes a summary line followed by every stored warning message to 'stream'.
+// Writes nothing when no warnings have been recorded.
+void SamHeaderValidator::PrintWarningMessages(std::ostream& stream)
+{
+    const std::size_t warningCount = m_warningMessages.size();
+    if (warningCount != 0) {
+
+        // summary line
+        stream << "* SAM header has " << warningCount << " warnings:" << std::endl;
+
+        // messages already end with a line break, so none is added here
+        for (std::size_t index = 0; index < warningCount; ++index)
+            stream << m_warningMessages[index];
+    }
+}
+
+// Entry point for validation.
+// Runs every group of checks - none is skipped after an earlier failure, so all
+// messages get collected - and returns true only if all of them passed.
+bool SamHeaderValidator::Validate()
+{
+    const bool metadataOk = ValidateMetadata();
+    const bool sequencesOk = ValidateSequenceDictionary();
+    const bool readGroupsOk = ValidateReadGroupDictionary();
+    const bool programsOk = ValidateProgramChain();
+    return metadataOk && sequencesOk && readGroupsOk && programsOk;
+}
+
+// Checks the header 'metadata': version, sort order and group order.
+// All three checks always run; the result is true only if every one passed.
+bool SamHeaderValidator::ValidateMetadata()
+{
+    const bool versionOk = ValidateVersion();
+    const bool sortOrderOk = ValidateSortOrder();
+    const bool groupOrderOk = ValidateGroupOrder();
+    return versionOk && sortOrderOk && groupOrderOk;
+}
+
+// Checks the header version (VN).
+// A missing version only produces a warning and counts as valid. Otherwise the
+// text must be "<major>.<minor>", split at the first period, with both parts
+// non-empty and made of digits only; anything else records an error.
+bool SamHeaderValidator::ValidateVersion()
+{
+    const std::string& version = m_header.Version;
+
+    // warn if version not present
+    if (version.empty()) {
+        AddWarning("Version (VN) missing. Not required, but strongly recommended");
+        return true;
+    }
+
+    // a period must separate the major part from the minor part
+    const std::size_t periodPosition = version.find(Constants::SAM_PERIOD);
+    bool wellFormed = (periodPosition != std::string::npos);
+
+    // major and minor parts must each be non-empty and purely numeric
+    if (wellFormed) {
+        const std::string majorVersion = version.substr(0, periodPosition);
+        const std::string minorVersion = version.substr(periodPosition + 1);
+        wellFormed = !majorVersion.empty() && ContainsOnlyDigits(majorVersion) &&
+                     !minorVersion.empty() && ContainsOnlyDigits(minorVersion);
+    }
+
+    if (!wellFormed) {
+        AddError("Invalid version (VN) format: " + version);
+        return false;
+    }
+
+    // TODO: check if version is not just syntactically OK,
+    // but is also a valid SAM version ( 1.0 .. CURRENT )
+
+    // all checked out this far, then version is OK
+    return true;
+}
+
+// Returns true when no character of 's' falls outside Constants::SAM_DIGITS.
+// Assumes non-empty input: an empty string also yields true.
+bool SamHeaderValidator::ContainsOnlyDigits(const std::string& s)
+{
+    return s.find_first_not_of(Constants::SAM_DIGITS) == std::string::npos;
+}
+
+// Checks the header sort order (SO).
+// A missing value only produces a warning and counts as valid; a present value
+// must equal one of the coordinate / queryname / unsorted keywords.
+bool SamHeaderValidator::ValidateSortOrder()
+{
+    const std::string& sortOrder = m_header.SortOrder;
+
+    // warn if sort order not present
+    if (sortOrder.empty()) {
+        AddWarning("Sort order (SO) missing. Not required, but strongly recommended");
+        return true;
+    }
+
+    // accepted keywords
+    const bool recognized = (sortOrder == Constants::SAM_HD_SORTORDER_COORDINATE) ||
+                            (sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME) ||
+                            (sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED);
+
+    if (!recognized) AddError("Invalid sort order (SO): " + sortOrder);
+    return recognized;
+}
+
+// Checks the header group order (GO).
+// A missing value is accepted silently; a present value must equal one of the
+// none / query / reference keywords.
+bool SamHeaderValidator::ValidateGroupOrder()
+{
+    const std::string& groupOrder = m_header.GroupOrder;
+
+    // nothing to check when the tag is absent
+    if (groupOrder.empty()) return true;
+
+    // accepted keywords
+    const bool recognized = (groupOrder == Constants::SAM_HD_GROUPORDER_NONE) ||
+                            (groupOrder == Constants::SAM_HD_GROUPORDER_QUERY) ||
+                            (groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE);
+
+    if (!recognized) AddError("Invalid group order (GO): " + groupOrder);
+    return recognized;
+}
+
+// Validates the sequence dictionary: names must be unique and every entry must
+// pass ValidateSequence(). Every entry is checked even after a failure.
+bool SamHeaderValidator::ValidateSequenceDictionary()
+{
+    // uniqueness across the whole dictionary
+    bool allValid = ContainsUniqueSequenceNames();
+
+    // per-entry checks
+    const SamSequenceDictionary& sequences = m_header.Sequences;
+    SamSequenceConstIterator current = sequences.ConstBegin();
+    SamSequenceConstIterator last = sequences.ConstEnd();
+    while (current != last) {
+        if (!ValidateSequence(*current)) allValid = false;
+        ++current;
+    }
+
+    return allValid;
+}
+
+// Makes sure all SQ names are unique.
+// Records one error for each entry whose name was already seen on an earlier
+// entry; returns false if any duplicate was found.
+bool SamHeaderValidator::ContainsUniqueSequenceNames()
+{
+    bool allUnique = true;
+    std::set<std::string> seenNames;
+
+    const SamSequenceDictionary& sequences = m_header.Sequences;
+    SamSequenceConstIterator current = sequences.ConstBegin();
+    SamSequenceConstIterator last = sequences.ConstEnd();
+    while (current != last) {
+        const std::string& name = (*current).Name;
+
+        // insert() reports through '.second' whether the name was new
+        const bool firstOccurrence = seenNames.insert(name).second;
+        if (!firstOccurrence) {
+            AddError("Sequence name (SN): " + name + " is not unique");
+            allUnique = false;
+        }
+
+        ++current;
+    }
+
+    return allUnique;
+}
+
+// Validates one sequence entry: name format and length range.
+// Both checks always run; the result is true only if both passed.
+bool SamHeaderValidator::ValidateSequence(const SamSequence& seq)
+{
+    const bool nameOk = CheckNameFormat(seq.Name);
+    const bool lengthOk = CheckLengthInRange(seq.Length);
+    return nameOk && lengthOk;
+}
+
+// Checks that a sequence name is present and does not start with one of the
+// reserved characters (Constants::SAM_EQUAL, Constants::SAM_STAR).
+// Records an error and returns false otherwise.
+bool SamHeaderValidator::CheckNameFormat(const std::string& name)
+{
+    // an empty name means the SN tag was never set
+    if (name.empty()) {
+        AddError("Sequence entry (@SQ) is missing SN tag");
+        return false;
+    }
+
+    // the leading character must not be reserved
+    const char leading = name[0];
+    const bool reservedStart =
+        (leading == Constants::SAM_EQUAL) || (leading == Constants::SAM_STAR);
+    if (reservedStart) {
+        AddError("Invalid sequence name (SN): " + name);
+        return false;
+    }
+
+    return true;
+}
+
+// Checks that a sequence length is a number within the accepted range.
+//
+// Returns true when 'length' consists of digits only and its value lies between
+// Constants::SAM_SQ_LENGTH_MIN and Constants::SAM_SQ_LENGTH_MAX (inclusive).
+// Otherwise records an error message and returns false.
+bool SamHeaderValidator::CheckLengthInRange(const std::string& length)
+{
+
+    // invalid if empty
+    if (length.empty()) {
+        AddError("Sequence entry (@SQ) is missing LN tag");
+        return false;
+    }
+
+    // invalid if not purely numeric: stream extraction on its own would accept
+    // a numeric prefix ("12abc") and gives no usable number for text that does
+    // not start with a digit
+    if (!ContainsOnlyDigits(length)) {
+        AddError("Invalid sequence length (LN): " + length);
+        return false;
+    }
+
+    // convert string length to numeric; the extraction fails when the value
+    // does not fit in an unsigned int, which is treated as out of range
+    std::stringstream lengthStream(length);
+    unsigned int sequenceLength = 0;
+    const bool converted = !(lengthStream >> sequenceLength).fail();
+
+    // invalid if length outside accepted range
+    if (!converted || sequenceLength < Constants::SAM_SQ_LENGTH_MIN ||
+        sequenceLength > Constants::SAM_SQ_LENGTH_MAX) {
+        AddError("Sequence length (LN): " + length + " out of range");
+        return false;
+    }
+
+    // otherwise OK
+    return true;
+}
+
+// Validates the read group dictionary: IDs and platform units must be unique
+// and every entry must pass ValidateReadGroup(). Every entry is checked even
+// after a failure.
+bool SamHeaderValidator::ValidateReadGroupDictionary()
+{
+    // uniqueness across the whole dictionary
+    bool allValid = ContainsUniqueIDsAndPlatformUnits();
+
+    // per-entry checks
+    const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
+    SamReadGroupConstIterator current = readGroups.ConstBegin();
+    SamReadGroupConstIterator last = readGroups.ConstEnd();
+    while (current != last) {
+        if (!ValidateReadGroup(*current)) allValid = false;
+        ++current;
+    }
+
+    return allValid;
+}
+
+// Makes sure RG IDs and platform units are unique.
+//
+// Records one error for each read group whose ID, or whose platform unit, was
+// already seen on an earlier read group. A read group without a platform unit
+// (empty PlatformUnit) takes no part in the platform unit check, so several
+// read groups may omit the tag without being flagged as duplicates.
+// Returns false if any duplicate was found.
+bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits()
+{
+
+    bool isValid = true;
+    std::set<std::string> readGroupIds;
+    std::set<std::string> platformUnits;
+
+    // iterate over read groups
+    const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
+    SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
+    SamReadGroupConstIterator rgEnd = readGroups.ConstEnd();
+    for (; rgIter != rgEnd; ++rgIter) {
+        const SamReadGroup& rg = (*rgIter);
+
+        // --------------------------------
+        // check for unique ID
+
+        // insert() reports through '.second' whether the ID was new
+        const std::string& id = rg.ID;
+        if (!readGroupIds.insert(id).second) {
+            AddError("Read group ID (ID): " + id + " is not unique");
+            isValid = false;
+        }
+
+        // --------------------------------
+        // check for unique platform unit
+
+        // only a platform unit that is actually present can clash with another
+        const std::string& pu = rg.PlatformUnit;
+        if (!pu.empty() && !platformUnits.insert(pu).second) {
+            AddError("Platform unit (PU): " + pu + " is not unique");
+            isValid = false;
+        }
+    }
+
+    // return validation state
+    return isValid;
+}
+
+// Validates one read group entry: ID presence and sequencing technology keyword.
+// Both checks always run; the result is true only if both passed.
+bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg)
+{
+    const bool idOk = CheckReadGroupID(rg.ID);
+    const bool technologyOk = CheckSequencingTechnology(rg.SequencingTechnology);
+    return idOk && technologyOk;
+}
+
+// Makes sure the read group ID exists: an empty ID records an error.
+bool SamHeaderValidator::CheckReadGroupID(const std::string& id)
+{
+    const bool present = !id.empty();
+    if (!present) AddError("Read group entry (@RG) is missing ID tag");
+    return present;
+}
+
+// Makes sure the read group sequencing technology is one of the accepted
+// keywords, compared without regard to letter case. An empty value is accepted;
+// any other unrecognized value records an error.
+bool SamHeaderValidator::CheckSequencingTechnology(const std::string& technology)
+{
+    // tag not provided: nothing to check
+    if (technology.empty()) return true;
+
+    // accepted keywords
+    const std::string accepted[] = {
+        Constants::SAM_RG_SEQTECHNOLOGY_CAPILLARY, Constants::SAM_RG_SEQTECHNOLOGY_HELICOS,
+        Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA,  Constants::SAM_RG_SEQTECHNOLOGY_IONTORRENT,
+        Constants::SAM_RG_SEQTECHNOLOGY_LS454,     Constants::SAM_RG_SEQTECHNOLOGY_PACBIO,
+        Constants::SAM_RG_SEQTECHNOLOGY_SOLID};
+    const std::size_t acceptedCount = sizeof(accepted) / sizeof(accepted[0]);
+
+    for (std::size_t index = 0; index < acceptedCount; ++index) {
+        if (caseInsensitiveCompare(technology, accepted[index])) return true;
+    }
+
+    // no keyword matched
+    AddError("Invalid read group sequencing platform (PL): " + technology);
+    return false;
+}
+
+// Validates the header "program chain": program IDs must be unique and every
+// previous-program reference must resolve. Both checks always run.
+bool SamHeaderValidator::ValidateProgramChain()
+{
+    const bool idsUnique = ContainsUniqueProgramIds();
+    const bool linksResolved = ValidatePreviousProgramIds();
+    return idsUnique && linksResolved;
+}
+
+// Makes sure all PG IDs are unique.
+// Records one error for each program record whose ID was already seen on an
+// earlier record; returns false if any duplicate was found.
+bool SamHeaderValidator::ContainsUniqueProgramIds()
+{
+    bool allUnique = true;
+    std::set<std::string> seenIds;
+
+    const SamProgramChain& programs = m_header.Programs;
+    SamProgramConstIterator current = programs.ConstBegin();
+    SamProgramConstIterator last = programs.ConstEnd();
+    while (current != last) {
+        const std::string& pgId = (*current).ID;
+
+        // insert() reports through '.second' whether the ID was new
+        const bool firstOccurrence = seenIds.insert(pgId).second;
+        if (!firstOccurrence) {
+            AddError("Program ID (ID): " + pgId + " is not unique");
+            allUnique = false;
+        }
+
+        ++current;
+    }
+
+    return allUnique;
+}
+
+// Makes sure that any PP tags present point to existing @PG IDs.
+// Records with an empty PreviousProgramID are not checked; each unresolved
+// reference records an error. Returns false if any reference is unresolved.
+bool SamHeaderValidator::ValidatePreviousProgramIds()
+{
+    bool allResolved = true;
+
+    const SamProgramChain& programs = m_header.Programs;
+    SamProgramConstIterator current = programs.ConstBegin();
+    SamProgramConstIterator last = programs.ConstEnd();
+    while (current != last) {
+        const std::string& ppId = (*current).PreviousProgramID;
+
+        // only records that name a previous program need a lookup
+        if (!ppId.empty() && !programs.Contains(ppId)) {
+            AddError("PreviousProgramID (PP): " + ppId + " is not a known ID");
+            allResolved = false;
+        }
+
+        ++current;
+    }
+
+    return allResolved;
+}
diff --git a/src/api/internal/sam/SamHeaderValidator_p.h b/src/api/internal/sam/SamHeaderValidator_p.h
new file mode 100644
index 0000000..579726e
--- /dev/null
+++ b/src/api/internal/sam/SamHeaderValidator_p.h
@@ -0,0 +1,103 @@
+// ***************************************************************************
+// SamHeaderValidator.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 6 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for validating SamHeader data
+// ***************************************************************************
+
+#ifndef SAM_HEADER_VALIDATOR_P_H
+#define SAM_HEADER_VALIDATOR_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+struct SamHeader;
+struct SamReadGroup;
+struct SamSequence;
+
+namespace Internal {
+
+class SamHeaderValidator
+{
+ // Validates SamHeader data; error & warning messages are held in the vectors declared below.
+ // ctor & dtor
+public:
+ SamHeaderValidator(const SamHeader& header);
+ ~SamHeaderValidator();
+
+ // SamHeaderValidator interface
+public:
+ // prints error & warning messages to 'stream'
+ void PrintMessages(std::ostream& stream);
+
+ // validates SamHeader data, returns true/false accordingly
+ bool Validate();
+
+ // internal methods
+private:
+ // validate header metadata
+ bool ValidateMetadata();
+ bool ValidateVersion();
+ bool ContainsOnlyDigits(const std::string& s);
+ bool ValidateSortOrder();
+ bool ValidateGroupOrder();
+
+ // validate sequence dictionary
+ bool ValidateSequenceDictionary();
+ bool ContainsUniqueSequenceNames();
+ bool CheckNameFormat(const std::string& name);
+ bool ValidateSequence(const SamSequence& seq);
+ bool CheckLengthInRange(const std::string& length);
+
+ // validate read group dictionary
+ bool ValidateReadGroupDictionary();
+ bool ContainsUniqueIDsAndPlatformUnits();
+ bool ValidateReadGroup(const SamReadGroup& rg);
+ bool CheckReadGroupID(const std::string& id);
+ bool CheckSequencingTechnology(const std::string& technology);
+
+ // validate program data (@PG records): unique IDs and resolvable PreviousProgramID values
+ bool ValidateProgramChain();
+ bool ContainsUniqueProgramIds();
+ bool ValidatePreviousProgramIds();
+
+ // error reporting
+ void AddError(const std::string& message);
+ void AddWarning(const std::string& message);
+ void PrintErrorMessages(std::ostream& stream);
+ void PrintWarningMessages(std::ostream& stream);
+
+ // data members
+private:
+ // SamHeader being validated (held by reference, so it must outlive this validator)
+ const SamHeader& m_header;
+
+ // error reporting helpers (static strings; only declared in this header)
+ static const std::string ERROR_PREFIX;
+ static const std::string WARN_PREFIX;
+ static const std::string NEWLINE;
+
+ // error reporting messages
+ std::vector<std::string> m_errorMessages;
+ std::vector<std::string> m_warningMessages;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // SAM_HEADER_VALIDATOR_P_H
diff --git a/src/api/internal/sam/SamHeaderVersion_p.h b/src/api/internal/sam/SamHeaderVersion_p.h
new file mode 100644
index 0000000..530aa46
--- /dev/null
+++ b/src/api/internal/sam/SamHeaderVersion_p.h
@@ -0,0 +1,154 @@
+// ***************************************************************************
+// SamHeaderVersion.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for comparing SAM header versions
+// *************************************************************************
+
+#ifndef SAM_HEADERVERSION_P_H
+#define SAM_HEADERVERSION_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <cstddef>
+#include <sstream>
+#include <string>
+#include "api/SamConstants.h"
+
+namespace BamTools {
+namespace Internal {
+
+class SamHeaderVersion
+{
+ // Holds a SAM header version as separate major & minor numbers (both start at 0 unless given).
+ // ctors & dtor
+public:
+ SamHeaderVersion()
+ : m_majorVersion(0)
+ , m_minorVersion(0)
+ {}
+
+ explicit SamHeaderVersion(const std::string& version)
+ : m_majorVersion(0)
+ , m_minorVersion(0)
+ {
+ SetVersion(version);
+ }
+
+ SamHeaderVersion(const unsigned int& major, const unsigned int& minor)
+ : m_majorVersion(major)
+ , m_minorVersion(minor)
+ {}
+
+ ~SamHeaderVersion()
+ {
+ m_majorVersion = 0;
+ m_minorVersion = 0;
+ }
+
+ // access data
+public:
+ unsigned int MajorVersion() const
+ {
+ return m_majorVersion;
+ }
+ unsigned int MinorVersion() const
+ {
+ return m_minorVersion;
+ }
+
+ void SetVersion(const std::string& version);
+ std::string ToString() const;
+
+ // data members
+private:
+ unsigned int m_majorVersion;
+ unsigned int m_minorVersion;
+};
+
+inline void SamHeaderVersion::SetVersion(const std::string& version)
+{
+    // Parses a version string of the form "<major><SAM_PERIOD><minor>".
+    // A component is stored only when it is non-empty and made up solely of
+    // SAM_DIGITS characters; otherwise the corresponding member keeps its
+    // current value. An empty string or one without a period changes nothing.
+
+    // do nothing if period not found (this also covers an empty string)
+    const std::size_t periodFound = version.find(Constants::SAM_PERIOD);
+    if (periodFound == std::string::npos) return;
+
+    // Each component is read through its own stream. Extracting a number that
+    // runs to the end of the buffer sets eofbit, and str() does not reset the
+    // stream state, so a stream reused for the minor version would refuse to
+    // extract and leave m_minorVersion untouched.
+
+    // store major version if non-empty and contains only digits
+    const std::string majorVersion = version.substr(0, periodFound);
+    if (!majorVersion.empty() &&
+        majorVersion.find_first_not_of(Constants::SAM_DIGITS) == std::string::npos) {
+        std::stringstream majorStream(majorVersion);
+        majorStream >> m_majorVersion;
+    }
+
+    // store minor version if non-empty and contains only digits
+    const std::string minorVersion = version.substr(periodFound + 1);
+    if (!minorVersion.empty() &&
+        minorVersion.find_first_not_of(Constants::SAM_DIGITS) == std::string::npos) {
+        std::stringstream minorStream(minorVersion);
+        minorStream >> m_minorVersion;
+    }
+}
+
+// -----------------------------------------------------
+// printing
+
+inline std::string SamHeaderVersion::ToString() const
+{
+    std::ostringstream text;  // rendered as <major><SAM_PERIOD><minor>
+    text << MajorVersion() << Constants::SAM_PERIOD << MinorVersion();
+    return text.str();
+}
+
+// -----------------------------------------------------
+// comparison operators
+
+inline bool operator==(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs)
+{
+    if (lhs.MajorVersion() != rhs.MajorVersion()) return false;
+    return lhs.MinorVersion() == rhs.MinorVersion();
+}
+
+inline bool operator<(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs)
+{
+    // the major component decides; the minor component only breaks ties
+    if (lhs.MajorVersion() != rhs.MajorVersion()) return lhs.MajorVersion() < rhs.MajorVersion();
+    return lhs.MinorVersion() < rhs.MinorVersion();
+}
+// the remaining operators are all derived from operator<
+inline bool operator>(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs)
+{
+    return (rhs < lhs);
+}
+inline bool operator<=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs)
+{
+    return !(rhs < lhs);
+}
+inline bool operator>=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs)
+{
+    return !(lhs < rhs);
+}
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // SAM_HEADERVERSION_P_H
diff --git a/src/api/internal/utils/BamException_p.cpp b/src/api/internal/utils/BamException_p.cpp
new file mode 100644
index 0000000..3b38779
--- /dev/null
+++ b/src/api/internal/utils/BamException_p.cpp
@@ -0,0 +1,14 @@
+// ***************************************************************************
+// BamException_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides a basic exception class for BamTools internals
+// ***************************************************************************
+
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+const std::string BamException::SEPARATOR(": ");
diff --git a/src/api/internal/utils/BamException_p.h b/src/api/internal/utils/BamException_p.h
new file mode 100644
index 0000000..3a0a175
--- /dev/null
+++ b/src/api/internal/utils/BamException_p.h
@@ -0,0 +1,53 @@
+// ***************************************************************************
+// BamException_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 6 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides a basic exception class for BamTools internals
+// ***************************************************************************
+
+#ifndef BAMEXCEPTION_P_H
+#define BAMEXCEPTION_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <exception>
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+class BamException : public std::exception
+{
+    // Basic exception for BamTools internals. The text returned by what() is
+    // "<where><SEPARATOR><message>", assembled once at construction.
+public:
+    BamException(const std::string& where, const std::string& message)
+        : m_errorString(where)
+    {
+        m_errorString += SEPARATOR;
+        m_errorString += message;
+    }
+
+    ~BamException() throw() {}
+
+    const char* what() const throw() { return m_errorString.c_str(); }
+
+private:
+    std::string m_errorString;           // complete text handed out by what()
+    static const std::string SEPARATOR;  // placed between 'where' and 'message'
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMEXCEPTION_P_H
diff --git a/src/api/internal/utils/CMakeLists.txt b/src/api/internal/utils/CMakeLists.txt
new file mode 100644
index 0000000..4b1e2c2
--- /dev/null
+++ b/src/api/internal/utils/CMakeLists.txt
@@ -0,0 +1,15 @@
+# ==========================
+# BamTools CMakeLists.txt
+# (c) 2011 Derek Barnett
+#
+# src/api/internal/utils
+# ==========================
+
+set( InternalUtilsDir "${InternalDir}/utils" )
+
+set( InternalUtilsSources
+ ${InternalUtilsDir}/BamException_p.cpp
+
+ PARENT_SCOPE # <-- leave this last
+)
+
diff --git a/src/bamtools.pc.in b/src/bamtools.pc.in
new file mode 100644
index 0000000..59c3017
--- /dev/null
+++ b/src/bamtools.pc.in
@@ -0,0 +1,10 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+
+Name: BamTools
+Description: BamTools is a C++ library for reading and manipulating BAM files
+Version: @BamTools_VERSION@
+
+Requires.private: @BAMTOOLS_PRIVATE_DEPS@
+Libs: -L${libdir} -lbamtools
+Cflags: -I${includedir}
diff --git a/src/shared/bamtools_global.h b/src/shared/bamtools_global.h
new file mode 100644
index 0000000..7b128be
--- /dev/null
+++ b/src/shared/bamtools_global.h
@@ -0,0 +1,89 @@
+// ***************************************************************************
+// bamtools_global.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic definitions for exporting & importing library symbols.
+// Also provides some platform-specific rules for definitions.
+// ***************************************************************************
+
+#ifndef BAMTOOLS_GLOBAL_H
+#define BAMTOOLS_GLOBAL_H
+
+/*! \brief Library export macro
+ \internal
+*/
+#ifndef BAMTOOLS_LIBRARY_EXPORT
+#if defined(WIN32)
+#define BAMTOOLS_LIBRARY_EXPORT __declspec(dllexport)
+#else
+#define BAMTOOLS_LIBRARY_EXPORT __attribute__((visibility("default")))
+#endif
+#endif // BAMTOOLS_LIBRARY_EXPORT
+
+/*! \brief Library import macro
+ \internal
+*/
+#ifndef BAMTOOLS_LIBRARY_IMPORT
+#if defined(WIN32)
+#define BAMTOOLS_LIBRARY_IMPORT __declspec(dllimport)
+#else
+#define BAMTOOLS_LIBRARY_IMPORT
+#endif
+#endif // BAMTOOLS_LIBRARY_IMPORT
+
+/*! \brief Platform-specific type definitions
+ \internal
+*/
+#ifndef BAMTOOLS_LFS
+#define BAMTOOLS_LFS
+#ifdef WIN32
+#define ftell64(a) _ftelli64(a)
+#define fseek64(a, b, c) _fseeki64(a, b, c)
+#else
+#define ftell64(a) ftello(a)
+#define fseek64(a, b, c) fseeko(a, b, c)
+#endif
+#endif // BAMTOOLS_LFS
+
+/*! \def ftell64(a)
+ \brief Platform-independent tell() operation.
+ \internal
+*/
+/*! \def fseek64(a,b,c)
+ \brief Platform-independent seek() operation.
+ \internal
+*/
+
+/*! \brief Platform-specific type definitions
+ \internal
+*/
+#ifndef BAMTOOLS_TYPES
+#define BAMTOOLS_TYPES
+#include <stdint.h>
+#endif // BAMTOOLS_TYPES
+
+//! \internal
+inline void bamtools_noop() {}
+
+/*! \brief Assert definitions
+ \internal
+*/
+#ifndef BAMTOOLS_ASSERTS
+#define BAMTOOLS_ASSERTS
+#ifdef NDEBUG
+#define BT_ASSERT_UNREACHABLE bamtools_noop()
+#define BT_ASSERT_X(condition, message) bamtools_noop()
+#else
+#include <cassert>
+#include <stdexcept>
+#define BT_ASSERT_UNREACHABLE assert(false)
+#define BT_ASSERT_X(condition, message) \
+ if (!(condition)) { \
+ throw std::runtime_error(message); \
+ }
+#endif
+#endif // BAMTOOLS_ASSERTS
+
+#endif // BAMTOOLS_GLOBAL_H
diff --git a/src/toolkit/CMakeLists.txt b/src/toolkit/CMakeLists.txt
new file mode 100644
index 0000000..7a1f676
--- /dev/null
+++ b/src/toolkit/CMakeLists.txt
@@ -0,0 +1,47 @@
+# ==========================
+# BamTools CMakeLists.txt
+# (c) 2010 Derek Barnett
+#
+# src/toolkit
+# ==========================
+
+# set include path
+include_directories( ${BamTools_SOURCE_DIR}/src/api
+ ${BamTools_SOURCE_DIR}/src/utils
+ ${BamTools_SOURCE_DIR}/src/third_party
+ )
+
+# compile main bamtools application
+add_executable( bamtools_cmd
+ bamtools_convert.cpp
+ bamtools_count.cpp
+ bamtools_coverage.cpp
+ bamtools_filter.cpp
+ bamtools_header.cpp
+ bamtools_index.cpp
+ bamtools_merge.cpp
+ bamtools_random.cpp
+ bamtools_resolve.cpp
+ bamtools_revert.cpp
+ bamtools_sort.cpp
+ bamtools_split.cpp
+ bamtools_stats.cpp
+ bamtools.cpp
+ )
+
+# set BamTools application properties
+set_target_properties( bamtools_cmd PROPERTIES
+ OUTPUT_NAME "bamtools"
+ )
+# make version info available in application
+configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/bamtools_version.h.in ${CMAKE_CURRENT_BINARY_DIR}/bamtools_version.h )
+include_directories( ${CMAKE_CURRENT_BINARY_DIR} )
+
+# set include paths for system JsonCpp
+target_include_directories( bamtools_cmd PRIVATE ${JSONCPP_INCLUDE_DIRS} )
+
+# define libraries to link
+target_link_libraries( bamtools_cmd BamTools BamTools-utils ${JSONCPP_LDFLAGS} )
+
+# set application install destinations
+install( TARGETS bamtools_cmd DESTINATION "${CMAKE_INSTALL_BINDIR}" )
diff --git a/src/toolkit/bamtools.cpp b/src/toolkit/bamtools.cpp
new file mode 100644
index 0000000..34a99ae
--- /dev/null
+++ b/src/toolkit/bamtools.cpp
@@ -0,0 +1,174 @@
+// ***************************************************************************
+// bamtools.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 12 October 2012 (DB)
+// ---------------------------------------------------------------------------
+// Integrates a number of BamTools functionalities into a single executable.
+// ***************************************************************************
+
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include "bamtools_convert.h"
+#include "bamtools_count.h"
+#include "bamtools_coverage.h"
+#include "bamtools_filter.h"
+#include "bamtools_header.h"
+#include "bamtools_index.h"
+#include "bamtools_merge.h"
+#include "bamtools_random.h"
+#include "bamtools_resolve.h"
+#include "bamtools_revert.h"
+#include "bamtools_sort.h"
+#include "bamtools_split.h"
+#include "bamtools_stats.h"
+#include "bamtools_version.h"
+using namespace BamTools;
+
+// bamtools subtool names
+static const std::string CONVERT = "convert";
+static const std::string COUNT = "count";
+static const std::string COVERAGE = "coverage";
+static const std::string FILTER = "filter";
+static const std::string HEADER = "header";
+static const std::string INDEX = "index";
+static const std::string MERGE = "merge";
+static const std::string RANDOM = "random";
+static const std::string RESOLVE = "resolve";
+static const std::string REVERT = "revert";
+static const std::string SORT = "sort";
+static const std::string SPLIT = "split";
+static const std::string STATS = "stats";
+
+// bamtools help/version constants
+static const std::string HELP = "help";
+static const std::string LONG_HELP = "--help";
+static const std::string SHORT_HELP = "-h";
+static const std::string VERSION = "version";
+static const std::string LONG_VERSION = "--version";
+static const std::string SHORT_VERSION = "-v";
+
+// true if 'str' equals any of the help spellings (HELP, LONG_HELP, SHORT_HELP)
+static bool IsHelp(char* str)
+{
+    return (HELP == str) || (LONG_HELP == str) || (SHORT_HELP == str);
+}
+
+// true if 'str' equals any of the version spellings (VERSION, LONG_VERSION, SHORT_VERSION)
+static bool IsVersion(char* str)
+{
+    return (VERSION == str) || (LONG_VERSION == str) || (SHORT_VERSION == str);
+}
+
+// subtool factory method
+AbstractTool* CreateTool(const std::string& arg)
+{
+    // Maps a sub-command name onto a newly allocated tool object, which is
+    // handed to the caller; an unrecognized name yields a null pointer.
+    AbstractTool* tool = 0;
+
+    if (arg == CONVERT) tool = new ConvertTool;
+    else if (arg == COUNT) tool = new CountTool;
+    else if (arg == COVERAGE) tool = new CoverageTool;
+    else if (arg == FILTER) tool = new FilterTool;
+    else if (arg == HEADER) tool = new HeaderTool;
+    else if (arg == INDEX) tool = new IndexTool;
+    else if (arg == MERGE) tool = new MergeTool;
+    else if (arg == RANDOM) tool = new RandomTool;
+    else if (arg == RESOLVE) tool = new ResolveTool;
+    else if (arg == REVERT) tool = new RevertTool;
+    else if (arg == SORT) tool = new SortTool;
+    else if (arg == SPLIT) tool = new SplitTool;
+    else if (arg == STATS) tool = new StatsTool;
+    return tool;
+}
+
+// print help info
+int Help(int argc, char* argv[])
+{
+    // Prints help text and returns an exit status.
+    //
+    // 'bamtools help COMMAND': when COMMAND names a known sub-tool, show that
+    // tool's own help screen and return its result; in every other case fall
+    // through to the general help text below.
+    if (argc > 2) {
+        AbstractTool* requested = CreateTool(argv[2]);
+        if (requested != 0) return requested->Help();
+    }
+
+    // general BamTools help message (written to stderr): usage line
+    std::cerr << std::endl
+              << "usage: bamtools [--help] COMMAND [ARGS]" << std::endl
+              << std::endl;
+
+    // one entry per sub-tool
+    std::cerr << "Available bamtools commands:" << std::endl
+              << "\tconvert Converts between BAM and a number of other formats" << std::endl
+              << "\tcount Prints number of alignments in BAM file(s)" << std::endl
+              << "\tcoverage Prints coverage statistics from the input BAM file" << std::endl
+              << "\tfilter Filters BAM file(s) by user-specified criteria" << std::endl
+              << "\theader Prints BAM header information" << std::endl
+              << "\tindex Generates index for BAM file" << std::endl
+              << "\tmerge Merge multiple BAM files into single file" << std::endl
+              << "\trandom Select random alignments from existing BAM file(s), intended "
+                 "more as a testing tool."
+              << std::endl
+              << "\tresolve Resolves paired-end reads (marking the IsProperPair flag as needed)"
+              << std::endl
+              << "\trevert Removes duplicate marks and restores original base qualities"
+              << std::endl
+              << "\tsort Sorts the BAM file according to some criteria" << std::endl
+              << "\tsplit Splits a BAM file on user-specified property, creating a new "
+                 "BAM output file for each value found"
+              << std::endl
+              << "\tstats Prints some basic statistics from input BAM file(s)" << std::endl;
+
+    // pointer to the per-command help, framed by empty lines
+    std::cerr << std::endl
+              << "See 'bamtools help COMMAND' for more information on a specific command."
+              << std::endl
+              << std::endl;
+
+    return EXIT_SUCCESS;
+}
+
+// print version info
+int Version()
+{
+    // assemble "<major>.<minor>.<patch>" from the BAMTOOLS_VERSION_* values
+    std::ostringstream versionText;
+    versionText << BAMTOOLS_VERSION_MAJOR << '.' << BAMTOOLS_VERSION_MINOR << '.'
+                << BAMTOOLS_VERSION_PATCH;
+    // banner goes to stdout, framed by an empty line above and below
+    std::cout << std::endl
+              << "bamtools " << versionText.str() << std::endl
+              << "Part of BamTools API and toolkit" << std::endl
+              << "Primary authors: Derek Barnett, Erik Garrison, Michael Stromberg" << std::endl
+              << "(c) 2009-2012 Marth Lab, Biology Dept., Boston College" << std::endl
+              << std::endl;
+    return EXIT_SUCCESS;
+}
+
+// toolkit entry point
+int main(int argc, char* argv[])
+{
+    // Dispatches on the first command-line argument:
+    //   (none), help/--help/-h, or an unknown word -> Help()
+    //   version/--version/-v                       -> Version()
+    //   a known sub-tool name                      -> that tool's Run()
+    const bool wantsHelp = (argc == 1) || IsHelp(argv[1]);
+    if (wantsHelp) return Help(argc, argv);
+
+    if (IsVersion(argv[1])) return Version();
+
+    // hand the full argument list to the selected sub-tool
+    AbstractTool* selected = CreateTool(argv[1]);
+    if (selected == 0) {
+        // no tool matched, show help
+        return Help(argc, argv);
+    }
+    return selected->Run(argc, argv);
+}
diff --git a/src/toolkit/bamtools_convert.cpp b/src/toolkit/bamtools_convert.cpp
new file mode 100644
index 0000000..bd32218
--- /dev/null
+++ b/src/toolkit/bamtools_convert.cpp
@@ -0,0 +1,967 @@
+// ***************************************************************************
+// bamtools_convert.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 December 2012
+// ---------------------------------------------------------------------------
+// Converts between BAM and a number of other formats
+// ***************************************************************************
+
+#include "bamtools_convert.h"
+
+#include <api/BamConstants.h>
+#include <api/BamMultiReader.h>
+#include <utils/bamtools_fasta.h>
+#include <utils/bamtools_options.h>
+#include <utils/bamtools_pileup_engine.h>
+#include <utils/bamtools_utilities.h>
+using namespace BamTools;
+
+#include <cstddef>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+// ---------------------------------------------
+// ConvertTool constants
+
+// supported conversion format command-line names
+static const std::string FORMAT_BED = "bed";
+static const std::string FORMAT_FASTA = "fasta";
+static const std::string FORMAT_FASTQ = "fastq";
+static const std::string FORMAT_JSON = "json";
+static const std::string FORMAT_SAM = "sam";
+static const std::string FORMAT_PILEUP = "pileup";
+static const std::string FORMAT_YAML = "yaml";
+
+// other constants
+static const unsigned int FASTA_LINE_MAX = 50;
+
+// ---------------------------------------------
+// ConvertPileupFormatVisitor declaration
+
+class ConvertPileupFormatVisitor : public PileupVisitor
+{
+ // PileupVisitor subclass for the 'pileup' conversion format (declaration only).
+ // ctor & dtor
+public:
+ ConvertPileupFormatVisitor(const RefVector& references, const std::string& fastaFilename,
+ const bool isPrintingMapQualities, std::ostream* out);
+ ~ConvertPileupFormatVisitor();
+
+ // PileupVisitor interface implementation
+public:
+ void Visit(const PileupPosition& pileupData);
+
+ // data members
+private:
+ Fasta m_fasta;
+ bool m_hasFasta;
+ bool m_isPrintingMapQualities;
+ std::ostream* m_out;
+ RefVector m_references;
+};
+
+} // namespace BamTools
+
+// ---------------------------------------------
+// ConvertSettings implementation
+
+struct ConvertTool::ConvertSettings
+{
+ // Option storage for ConvertTool: boolean flags plus the associated string values.
+ // flags
+ bool HasInput;
+ bool HasInputFilelist;
+ bool HasOutput;
+ bool HasFormat;
+ bool HasRegion;
+
+ // pileup flags
+ bool HasFastaFilename;
+ bool IsOmittingSamHeader;
+ bool IsPrintingPileupMapQualities;
+
+ // options
+ std::vector<std::string> InputFiles;
+ std::string InputFilelist;
+ std::string OutputFilename;
+ std::string Format;
+ std::string Region;
+
+ // pileup options
+ std::string FastaFilename;
+
+ // constructor: every flag starts false; OutputFilename starts as Options::StandardOut()
+ ConvertSettings()
+ : HasInput(false)
+ , HasInputFilelist(false)
+ , HasOutput(false)
+ , HasFormat(false)
+ , HasRegion(false)
+ , HasFastaFilename(false)
+ , IsOmittingSamHeader(false)
+ , IsPrintingPileupMapQualities(false)
+ , OutputFilename(Options::StandardOut())
+ {}
+};
+
+// ---------------------------------------------
+// ConvertToolPrivate implementation
+
+struct ConvertTool::ConvertToolPrivate
+{
+ // Implementation object of ConvertTool: Run() performs the conversion described by the settings.
+ // ctor & dtor
+public:
+ ConvertToolPrivate(ConvertTool::ConvertSettings* settings)
+ : m_settings(settings)
+ , m_out(std::cout.rdbuf())
+ {}
+
+ ~ConvertToolPrivate() {}
+
+ // interface
+public:
+ bool Run();
+
+ // internal methods
+private:
+ void PrintBed(const BamAlignment& a);
+ void PrintFasta(const BamAlignment& a);
+ void PrintFastq(const BamAlignment& a);
+ void PrintJson(const BamAlignment& a);
+ void PrintSam(const BamAlignment& a);
+ void PrintYaml(const BamAlignment& a);
+
+ // special case - uses the PileupEngine
+ bool RunPileupConversion(BamMultiReader* reader);
+
+ // data members
+private:
+ ConvertTool::ConvertSettings* m_settings; // stored as given; not deleted by the (empty) destructor
+ RefVector m_references; // filled by Run() from the reader's reference data
+ std::ostream m_out; // starts on std::cout's buffer; Run() may switch it to a file's buffer
+};
+
+bool ConvertTool::ConvertToolPrivate::Run()
+{
+ // Opens the input(s), applies the optional region, selects the output stream, then converts; returns false on failure.
+ // ------------------------------------
+ // initialize conversion input/output
+
+ // set to default input if none provided
+ if (!m_settings->HasInput && !m_settings->HasInputFilelist)
+ m_settings->InputFiles.push_back(Options::StandardIn());
+
+ // add files in the filelist to the input file list
+ if (m_settings->HasInputFilelist) {
+
+ std::ifstream filelist(m_settings->InputFilelist.c_str(), std::ios::in);
+ if (!filelist.is_open()) {
+ std::cerr << "bamtools convert ERROR: could not open input BAM file list... Aborting."
+ << std::endl;
+ return false;
+ }
+
+ std::string line;
+ while (std::getline(filelist, line))
+ m_settings->InputFiles.push_back(line);
+ }
+
+ // open input files
+ BamMultiReader reader;
+ if (!reader.Open(m_settings->InputFiles)) {
+ std::cerr << "bamtools convert ERROR: could not open input BAM file(s)... Aborting."
+ << std::endl;
+ return false;
+ }
+ // if input is not stdin & a region is provided, look for index files
+ // NOTE(review): only HasInput is tested here; inputs named via the file list are not - confirm intended
+ if (m_settings->HasInput && m_settings->HasRegion) {
+ if (!reader.LocateIndexes()) {
+ std::cerr << "bamtools convert ERROR: could not locate index file(s)... Aborting."
+ << std::endl;
+ return false;
+ }
+ }
+
+ // retrieve reference data
+ m_references = reader.GetReferenceData();
+
+ // set region if specified
+ BamRegion region;
+ if (m_settings->HasRegion) {
+ if (Utilities::ParseRegionString(m_settings->Region, reader, region)) {
+ // NOTE(review): when the reader has no indexes the parsed region is not applied - confirm intended
+ if (reader.HasIndexes()) {
+ if (!reader.SetRegion(region)) {
+ std::cerr << "bamtools convert ERROR: set region failed. Check that REGION "
+ "describes a valid range"
+ << std::endl;
+ reader.Close();
+ return false;
+ }
+ }
+
+ } else {
+ std::cerr << "bamtools convert ERROR: could not parse REGION: " << m_settings->Region
+ << std::endl;
+ std::cerr << "Check that REGION is in valid format (see documentation) and that the "
+ "coordinates are valid"
+ << std::endl;
+ reader.Close();
+ return false;
+ }
+ }
+
+ // if output file given
+ std::ofstream outFile;
+ if (m_settings->HasOutput) {
+
+ // open output file stream
+ outFile.open(m_settings->OutputFilename.c_str());
+ if (!outFile) {
+ std::cerr << "bamtools convert ERROR: could not open " << m_settings->OutputFilename
+ << " for output" << std::endl;
+ return false;
+ }
+
+ // set m_out to file's streambuf (outFile is local to Run(), so this buffer ends with the call)
+ m_out.rdbuf(outFile.rdbuf());
+ }
+
+ // -------------------------------------
+ // do conversion based on format
+
+ bool convertedOk = true;
+
+ // pileup is special case
+ // conversion not done per alignment, like the other formats
+ if (m_settings->Format == FORMAT_PILEUP) convertedOk = RunPileupConversion(&reader);
+
+ // all other formats
+ else {
+
+ bool formatError = false;
+
+ // set function pointer to proper conversion method
+ void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0;
+ if (m_settings->Format == FORMAT_BED)
+ pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed;
+ else if (m_settings->Format == FORMAT_FASTA)
+ pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta;
+ else if (m_settings->Format == FORMAT_FASTQ)
+ pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq;
+ else if (m_settings->Format == FORMAT_JSON)
+ pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson;
+ else if (m_settings->Format == FORMAT_SAM)
+ pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam;
+ else if (m_settings->Format == FORMAT_YAML)
+ pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintYaml;
+ else {
+ std::cerr << "bamtools convert ERROR: unrecognized format: " << m_settings->Format
+ << std::endl;
+ std::cerr << "Please see documentation for list of supported formats " << std::endl;
+ formatError = true;
+ convertedOk = false;
+ }
+
+ // if format selected ok
+ if (!formatError) {
+
+ // if SAM format & not omitting header, print SAM header first
+ if ((m_settings->Format == FORMAT_SAM) && !m_settings->IsOmittingSamHeader)
+ m_out << reader.GetHeaderText();
+
+ // iterate through file, doing conversion
+ BamAlignment a;
+ while (reader.GetNextAlignment(a))
+ (this->*pFunction)(a);
+
+ // set flag for successful conversion
+ convertedOk = true;
+ }
+ }
+
+ // ------------------------
+ // clean up & exit
+ reader.Close();
+ if (m_settings->HasOutput) outFile.close();
+ return convertedOk;
+}
+
+// ----------------------------------------------------------
+// Conversion/output methods
+// ----------------------------------------------------------
+
+void ConvertTool::ConvertToolPrivate::PrintBed(const BamAlignment& a)
+{
+    // tab-delimited, 0-based half-open: <chromName> <chromStart> <chromEnd> <readName> <score> <strand>
+    // (e.g. a 50-base read aligned to pos 10 could have BED coordinates (10, 60) instead of BAM coordinates (10, 59) )
+    // Alignments whose RefID does not index m_references (e.g. a negative RefID) have no
+    // reference name to print, so they are skipped instead of letting the lookup throw.
+    if ((a.RefID < 0) || (a.RefID >= (int)m_references.size())) return;
+    m_out << m_references[a.RefID].RefName << '\t' << a.Position << '\t' << a.GetEndPosition()
+          << '\t' << a.Name << '\t' << a.MapQuality << '\t' << (a.IsReverseStrand() ? '-' : '+')
+          << std::endl;
+}
+
+// print BamAlignment in FASTA format
+// N.B. - uses QueryBases NOT AlignedBases
+void ConvertTool::ConvertToolPrivate::PrintFasta(const BamAlignment& a)
+{
+    // Output layout:
+    //
+    //   >BamAlignment.Name
+    //   BamAlignment.QueryBases (up to FASTA_LINE_MAX bases per line)
+    //   ...
+    //
+    // N.B. - QueryBases are reverse-complemented if aligned to reverse strand
+
+    // print header
+    m_out << '>' << a.Name << std::endl;
+
+    // handle reverse strand alignment - bases
+    // (done on a local copy, the alignment itself is left untouched)
+    std::string bases = a.QueryBases;
+    if (a.IsReverseStrand()) {
+        Utilities::ReverseComplement(bases);
+    }
+
+    // Write the bases in chunks of at most FASTA_LINE_MAX characters, one
+    // chunk per output line.
+    //
+    // The loop body always runs once before the length is tested, so:
+    //   - an empty sequence still produces a single (empty) line
+    //   - a sequence of up to FASTA_LINE_MAX bases is written on one line
+    //   - a longer sequence ends with a final line holding whatever remains
+    //     (1 to FASTA_LINE_MAX bases)
+    const std::size_t totalBases = bases.length();
+    std::size_t offset = 0;
+
+    do {
+        m_out << bases.substr(offset, FASTA_LINE_MAX) << std::endl;
+        offset += FASTA_LINE_MAX;
+    } while (offset < totalBases);
+}
+
+// print BamAlignment in FASTQ format
+// N.B. - uses QueryBases NOT AlignedBases
+void ConvertTool::ConvertToolPrivate::PrintFastq(const BamAlignment& a)
+{
+    // Output layout:
+    //   @BamAlignment.Name
+    //   BamAlignment.QueryBases
+    //   +
+    //   BamAlignment.Qualities
+    //
+    // N.B. - QueryBases are reverse-complemented (& Qualities reversed) if aligned to reverse strand.
+    //        Name is appended "/1" or "/2" if paired-end, to reflect which mate this entry is.
+
+    // read name, tagged with the mate number for paired-end alignments
+    std::string readName = a.Name;
+    if (a.IsPaired()) readName += (a.IsFirstMate() ? "/1" : "/2");
+
+    // local copies of bases & qualities, flipped for reverse-strand alignments
+    std::string bases = a.QueryBases;
+    std::string quals = a.Qualities;
+    if (a.IsReverseStrand()) {
+        Utilities::ReverseComplement(bases);
+        Utilities::Reverse(quals);
+    }
+
+    // write the four-line record
+    m_out << '@' << readName << std::endl;
+    m_out << bases << std::endl;
+    m_out << '+' << std::endl;
+    m_out << quals << std::endl;
+}
+
+// print BamAlignment in JSON format
+//
+// Emits one JSON object per alignment, on a single line. Keys, in order:
+//   name, alignmentFlag, reference (only when RefID indexes into m_references),
+//   position (Position + 1), mapQuality, cigar (only when non-empty),
+//   mate (only for paired reads whose MateRefID indexes into m_references),
+//   queryBases (only when non-empty), qualities (ASCII value - 33; skipped when
+//   empty or when the first byte is 0xFF), filename, tags (only when tag data
+//   is present).
+void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a)
+{
+
+    // write name & alignment flag
+    m_out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\"" << a.AlignmentFlag << "\",";
+
+    // write reference name
+    if ((a.RefID >= 0) && (a.RefID < (int)m_references.size()))
+        m_out << "\"reference\":\"" << m_references[a.RefID].RefName << "\",";
+
+    // write position & map quality
+    m_out << "\"position\":" << a.Position + 1 << ",\"mapQuality\":" << a.MapQuality << ',';
+
+    // write CIGAR
+    const std::vector<CigarOp>& cigarData = a.CigarData;
+    if (!cigarData.empty()) {
+        m_out << "\"cigar\":[";
+        std::vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
+        std::vector<CigarOp>::const_iterator cigarIter = cigarBegin;
+        std::vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
+        for (; cigarIter != cigarEnd; ++cigarIter) {
+            const CigarOp& op = (*cigarIter);
+            if (cigarIter != cigarBegin) m_out << ',';
+            m_out << '"' << op.Length << op.Type << '"';
+        }
+        m_out << "],";
+    }
+
+    // write mate reference name, mate position, & insert size
+    if (a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size())) {
+        m_out << "\"mate\":{"
+              << "\"reference\":\"" << m_references[a.MateRefID].RefName << "\","
+              << "\"position\":" << a.MatePosition + 1 << ",\"insertSize\":" << a.InsertSize
+              << "},";
+    }
+
+    // write sequence
+    if (!a.QueryBases.empty()) m_out << "\"queryBases\":\"" << a.QueryBases << "\",";
+
+    // write qualities
+    if (!a.Qualities.empty() && a.Qualities.at(0) != (char)0xFF) {
+        std::string::const_iterator s = a.Qualities.begin();
+        m_out << "\"qualities\":[" << static_cast<short>(*s) - 33;
+        ++s;
+        for (; s != a.Qualities.end(); ++s)
+            m_out << ',' << static_cast<short>(*s) - 33;
+        m_out << "],";
+    }
+
+    // write alignment's source BAM file
+    // (no trailing comma: "filename" is the last key when there is no tag data,
+    //  and a comma directly before the closing brace is not valid JSON)
+    m_out << "\"filename\":\"" << a.Filename << '"';
+
+    // write tag data
+    const char* tagData = a.TagData.c_str();
+    const std::size_t tagDataLength = a.TagData.length();
+    std::size_t index = 0;
+    if (index < tagDataLength) {
+
+        m_out << ",\"tags\":{";
+
+        // cleared when a tag type of unknown size is met, so the remaining
+        // bytes are not misread as further tags
+        bool keepParsing = true;
+        while (keepParsing && (index < tagDataLength)) {
+
+            if (index > 0) m_out << ',';
+
+            // write tag name
+            m_out << '"' << a.TagData.substr(index, 2) << "\":";
+            index += 2;
+
+            // get data type
+            char type = a.TagData.at(index);
+            ++index;
+            switch (type) {
+                case (Constants::BAM_TAG_TYPE_ASCII):
+                    m_out << '"' << tagData[index] << '"';
+                    ++index;
+                    break;
+
+                case (Constants::BAM_TAG_TYPE_INT8):
+                    // force value into integer-type (instead of char value);
+                    // go through signed char so the result does not depend on
+                    // whether plain char is signed on this platform
+                    m_out << static_cast<int16_t>(static_cast<signed char>(tagData[index]));
+                    ++index;
+                    break;
+
+                case (Constants::BAM_TAG_TYPE_UINT8):
+                    // force value into integer-type (instead of char value);
+                    // go through unsigned char so bytes >= 0x80 are not sign-extended
+                    m_out << static_cast<uint16_t>(static_cast<unsigned char>(tagData[index]));
+                    ++index;
+                    break;
+
+                case (Constants::BAM_TAG_TYPE_INT16):
+                    m_out << BamTools::UnpackSignedShort(&tagData[index]);
+                    index += sizeof(int16_t);
+                    break;
+
+                case (Constants::BAM_TAG_TYPE_UINT16):
+                    m_out << BamTools::UnpackUnsignedShort(&tagData[index]);
+                    index += sizeof(uint16_t);
+                    break;
+
+                case (Constants::BAM_TAG_TYPE_INT32):
+                    m_out << BamTools::UnpackSignedInt(&tagData[index]);
+                    index += sizeof(int32_t);
+                    break;
+
+                case (Constants::BAM_TAG_TYPE_UINT32):
+                    m_out << BamTools::UnpackUnsignedInt(&tagData[index]);
+                    index += sizeof(uint32_t);
+                    break;
+
+                case (Constants::BAM_TAG_TYPE_FLOAT):
+                    m_out << BamTools::UnpackFloat(&tagData[index]);
+                    index += sizeof(float);
+                    break;
+
+                case (Constants::BAM_TAG_TYPE_HEX):
+                case (Constants::BAM_TAG_TYPE_STRING):
+                    m_out << '"';
+                    while (tagData[index]) {
+                        // escape for json: both '"' and '\' need a leading backslash
+                        if ((tagData[index] == '\"') || (tagData[index] == '\\')) m_out << '\\';
+                        m_out << tagData[index];
+                        ++index;
+                    }
+                    m_out << '"';
+                    ++index;
+                    break;
+
+                default:
+                    // tag type not handled here, so the size of its value is
+                    // unknown: give the key a placeholder value and stop
+                    m_out << "null";
+                    keepParsing = false;
+                    break;
+            }
+
+            if (tagData[index] == '\0') break;
+        }
+
+        m_out << '}';
+    }
+
+    m_out << '}' << std::endl;
+}
+
+// print BamAlignment in SAM format
+//
+// Writes one tab-delimited line per alignment. Fields without data are written
+// as "*" (reference name, CIGAR, sequence, qualities) and the mate block as
+// "*\t0\t0" when the read is not paired or MateRefID does not index into
+// m_references. Position and MatePosition are written with 1 added.
+void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a)
+{
+
+    // tab-delimited
+    // <QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> [ <TAG>:<VTYPE>:<VALUE> [...] ]
+
+    // write name & alignment flag
+    m_out << a.Name << '\t' << a.AlignmentFlag << '\t';
+
+    // write reference name
+    if ((a.RefID >= 0) && (a.RefID < (int)m_references.size()))
+        m_out << m_references[a.RefID].RefName << '\t';
+    else
+        m_out << "*\t";
+
+    // write position & map quality
+    m_out << a.Position + 1 << '\t' << a.MapQuality << '\t';
+
+    // write CIGAR
+    const std::vector<CigarOp>& cigarData = a.CigarData;
+    if (cigarData.empty())
+        m_out << "*\t";
+    else {
+        std::vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
+        std::vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
+        for (; cigarIter != cigarEnd; ++cigarIter) {
+            const CigarOp& op = (*cigarIter);
+            m_out << op.Length << op.Type;
+        }
+        m_out << '\t';
+    }
+
+    // write mate reference name, mate position, & insert size
+    if (a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size())) {
+        if (a.MateRefID == a.RefID)
+            m_out << "=\t";
+        else
+            m_out << m_references[a.MateRefID].RefName << '\t';
+        m_out << a.MatePosition + 1 << '\t' << a.InsertSize << '\t';
+    } else
+        m_out << "*\t0\t0\t";
+
+    // write sequence
+    if (a.QueryBases.empty())
+        m_out << "*\t";
+    else
+        m_out << a.QueryBases << '\t';
+
+    // write qualities
+    if (a.Qualities.empty() || (a.Qualities.at(0) == (char)0xFF))
+        m_out << '*';
+    else
+        m_out << a.Qualities;
+
+    // write tag data
+    const char* tagData = a.TagData.c_str();
+    const std::size_t tagDataLength = a.TagData.length();
+
+    // cleared when a tag type of unknown size is met, so the remaining bytes
+    // are not misread as further tags
+    bool keepParsing = true;
+
+    std::size_t index = 0;
+    while (keepParsing && (index < tagDataLength)) {
+
+        // write tag name
+        std::string tagName = a.TagData.substr(index, 2);
+        m_out << '\t' << tagName << ':';
+        index += 2;
+
+        // get data type
+        char type = a.TagData.at(index);
+        ++index;
+        switch (type) {
+            case (Constants::BAM_TAG_TYPE_ASCII):
+                m_out << "A:" << tagData[index];
+                ++index;
+                break;
+
+            case (Constants::BAM_TAG_TYPE_INT8):
+                // force value into integer-type (instead of char value);
+                // go through signed char so the result does not depend on
+                // whether plain char is signed on this platform
+                m_out << "i:" << static_cast<int16_t>(static_cast<signed char>(tagData[index]));
+                ++index;
+                break;
+
+            case (Constants::BAM_TAG_TYPE_UINT8):
+                // force value into integer-type (instead of char value);
+                // go through unsigned char so bytes >= 0x80 are not sign-extended
+                m_out << "i:" << static_cast<uint16_t>(static_cast<unsigned char>(tagData[index]));
+                ++index;
+                break;
+
+            case (Constants::BAM_TAG_TYPE_INT16):
+                m_out << "i:" << BamTools::UnpackSignedShort(&tagData[index]);
+                index += sizeof(int16_t);
+                break;
+
+            case (Constants::BAM_TAG_TYPE_UINT16):
+                m_out << "i:" << BamTools::UnpackUnsignedShort(&tagData[index]);
+                index += sizeof(uint16_t);
+                break;
+
+            case (Constants::BAM_TAG_TYPE_INT32):
+                m_out << "i:" << BamTools::UnpackSignedInt(&tagData[index]);
+                index += sizeof(int32_t);
+                break;
+
+            case (Constants::BAM_TAG_TYPE_UINT32):
+                m_out << "i:" << BamTools::UnpackUnsignedInt(&tagData[index]);
+                index += sizeof(uint32_t);
+                break;
+
+            case (Constants::BAM_TAG_TYPE_FLOAT):
+                m_out << "f:" << BamTools::UnpackFloat(&tagData[index]);
+                index += sizeof(float);
+                break;
+
+            case (Constants::BAM_TAG_TYPE_HEX):  // fall-through
+            case (Constants::BAM_TAG_TYPE_STRING):
+                // value is a NUL-terminated string; the type letter is echoed as-is
+                m_out << type << ':';
+                while (tagData[index]) {
+                    m_out << tagData[index];
+                    ++index;
+                }
+                ++index;
+                break;
+
+            default:
+                // tag type not handled here, so the size of its value is
+                // unknown: stop rather than misparse what follows
+                keepParsing = false;
+                break;
+        }
+
+        if (tagData[index] == '\0') break;
+    }
+
+    m_out << std::endl;
+}
+
+// Print BamAlignment in YAML format
+//
+// Writes a "---" separator, the read name as the top-level key, then one
+// "<field>: <value>" line per alignment field, and finally the CIGAR string
+// when CigarData is non-empty.
+void ConvertTool::ConvertToolPrivate::PrintYaml(const BamAlignment& a)
+{
+
+    // write alignment name
+    m_out << "---" << std::endl;
+    m_out << a.Name << ':' << std::endl;
+
+    // write alignment data
+    m_out << " "
+          << "AlndBases: " << a.AlignedBases << std::endl;
+    m_out << " "
+          << "Qualities: " << a.Qualities << std::endl;
+    m_out << " "
+          << "Name: " << a.Name << std::endl;
+    m_out << " "
+          << "Length: " << a.Length << std::endl;
+    m_out << " "
+          << "TagData: " << a.TagData << std::endl;
+    m_out << " "
+          << "RefID: " << a.RefID << std::endl;
+
+    // RefID may not index into m_references (the JSON and SAM printers guard
+    // against the same case), so only look the name up when it is in range
+    // and leave the value empty otherwise
+    m_out << " "
+          << "RefName: ";
+    if ((a.RefID >= 0) && (a.RefID < (int)m_references.size()))
+        m_out << m_references[a.RefID].RefName;
+    m_out << std::endl;
+
+    m_out << " "
+          << "Position: " << a.Position << std::endl;
+    m_out << " "
+          << "Bin: " << a.Bin << std::endl;
+    m_out << " "
+          << "MapQuality: " << a.MapQuality << std::endl;
+    m_out << " "
+          << "AlignmentFlag: " << a.AlignmentFlag << std::endl;
+    m_out << " "
+          << "MateRefID: " << a.MateRefID << std::endl;
+    m_out << " "
+          << "MatePosition: " << a.MatePosition << std::endl;
+    m_out << " "
+          << "InsertSize: " << a.InsertSize << std::endl;
+    m_out << " "
+          << "Filename: " << a.Filename << std::endl;
+
+    // write Cigar data
+    const std::vector<CigarOp>& cigarData = a.CigarData;
+    if (!cigarData.empty()) {
+        m_out << " "
+              << "Cigar: ";
+        std::vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
+        std::vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
+        for (; cigarIter != cigarEnd; ++cigarIter) {
+            const CigarOp& op = (*cigarIter);
+            m_out << op.Length << op.Type;
+        }
+        m_out << std::endl;
+    }
+}
+
+// Feeds every alignment from 'reader' through a PileupEngine whose single
+// visitor (a ConvertPileupFormatVisitor) writes to m_out.
+// Returns false when 'reader' is null, true otherwise.
+bool ConvertTool::ConvertToolPrivate::RunPileupConversion(BamMultiReader* reader)
+{
+    // nothing to do without a reader
+    if (!reader) return false;
+
+    // visitor that formats each pileup position handed to it
+    ConvertPileupFormatVisitor* visitor = new ConvertPileupFormatVisitor(
+        m_references, m_settings->FastaFilename, m_settings->IsPrintingPileupMapQualities, &m_out);
+
+    // engine that drives the visitor
+    PileupEngine engine;
+    engine.AddVisitor(visitor);
+
+    // push all alignments through, then flush
+    BamAlignment alignment;
+    while (reader->GetNextAlignment(alignment))
+        engine.AddAlignment(alignment);
+    engine.Flush();
+
+    // release the visitor allocated above
+    delete visitor;
+    visitor = 0;
+
+    return true;
+}
+
+// ---------------------------------------------
+// ConvertTool implementation
+
+// Allocates the settings object and registers the program description and all
+// command line options with Options; each option is bound to a flag/value pair
+// inside m_settings. m_impl stays null until Run() is called.
+ConvertTool::ConvertTool()
+    : AbstractTool()
+    , m_settings(new ConvertSettings)
+    , m_impl(0)
+{
+    // set program details
+    Options::SetProgramInfo("bamtools convert", "converts BAM to a number of other formats",
+                            "-format <FORMAT> [-in <filename> -in <filename> ... | -list "
+                            "<filelist>] [-out <filename>] [-region <REGION>] [format-specific "
+                            "options]");
+
+    // set up options
+    OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
+    Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "",
+                            m_settings->HasInput, m_settings->InputFiles, IO_Opts,
+                            Options::StandardIn());
+    Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "",
+                            m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts);
+    // the converted output is in the format chosen with -format, not BAM, so
+    // the help text must not describe it as a BAM file
+    Options::AddValueOption("-out", "filename", "the output file", "",
+                            m_settings->HasOutput, m_settings->OutputFilename, IO_Opts,
+                            Options::StandardOut());
+    Options::AddValueOption("-format", "FORMAT",
+                            "the output file format - see README for recognized formats", "",
+                            m_settings->HasFormat, m_settings->Format, IO_Opts);
+    Options::AddValueOption("-region", "REGION",
+                            "genomic region. Index file is recommended for better performance, and "
+                            "is used automatically if it exists. See \'bamtools help index\' for "
+                            "more details on creating one",
+                            "", m_settings->HasRegion, m_settings->Region, IO_Opts);
+
+    // options that only apply to pileup output
+    OptionGroup* PileupOpts = Options::CreateOptionGroup("Pileup Options");
+    Options::AddValueOption("-fasta", "FASTA filename", "FASTA reference file", "",
+                            m_settings->HasFastaFilename, m_settings->FastaFilename, PileupOpts);
+    Options::AddOption("-mapqual", "print the mapping qualities",
+                       m_settings->IsPrintingPileupMapQualities, PileupOpts);
+
+    // options that only apply to SAM output
+    OptionGroup* SamOpts = Options::CreateOptionGroup("SAM Options");
+    Options::AddOption("-noheader", "omit the SAM header from output",
+                       m_settings->IsOmittingSamHeader, SamOpts);
+}
+
+// Releases the settings object and the private implementation (if any).
+ConvertTool::~ConvertTool()
+{
+    // settings were allocated in the constructor
+    delete m_settings;
+    m_settings = 0;
+
+    // m_impl is only allocated by Run(); deleting a null pointer is a no-op
+    delete m_impl;
+    m_impl = 0;
+}
+
+// Shows the help text via Options::DisplayHelp(); always returns 0.
+int ConvertTool::Help()
+{
+    Options::DisplayHelp();
+    return 0;
+}
+
+// Parses the command line, builds the private implementation around the
+// settings and runs it. Returns 0 when the implementation reports success,
+// 1 otherwise.
+int ConvertTool::Run(int argc, char* argv[])
+{
+    // parse command line arguments
+    Options::Parse(argc, argv, 1);
+
+    // hand the settings to a fresh implementation object
+    m_impl = new ConvertToolPrivate(m_settings);
+
+    // map the implementation's result onto an exit code
+    const bool succeeded = m_impl->Run();
+    return succeeded ? 0 : 1;
+}
+
+// ---------------------------------------------
+// ConvertPileupFormatVisitor implementation
+
+// Stores the output stream pointer, the map-quality switch and the reference
+// list. When 'fastaFilename' is non-empty, tries to open that FASTA file (with
+// its ".fai" index if one exists) and records success in m_hasFasta.
+ConvertPileupFormatVisitor::ConvertPileupFormatVisitor(const RefVector& references,
+                                                       const std::string& fastaFilename,
+                                                       const bool isPrintingMapQualities,
+                                                       std::ostream* out)
+    : PileupVisitor()
+    , m_hasFasta(false)
+    , m_isPrintingMapQualities(isPrintingMapQualities)
+    , m_out(out)
+    , m_references(references)
+{
+    // no FASTA file given: m_hasFasta stays false
+    if (fastaFilename.empty()) return;
+
+    // use the companion ".fai" index when it exists, otherwise pass an empty name
+    const std::string candidateIndex = fastaFilename + ".fai";
+    std::string indexFilename;
+    if (Utilities::FileExists(candidateIndex)) indexFilename = candidateIndex;
+
+    // open FASTA file
+    if (m_fasta.Open(fastaFilename, indexFilename)) m_hasFasta = true;
+}
+
+// Closes the FASTA reader if the constructor managed to open it.
+ConvertPileupFormatVisitor::~ConvertPileupFormatVisitor()
+{
+    // nothing was opened, nothing to close
+    if (!m_hasFasta) return;
+
+    m_fasta.Close();
+    m_hasFasta = false;
+}
+
+// Formats one pileup position and writes it to *m_out as a tab-delimited line:
+//   <refName> <position + 1> <refBase> <numberAlleles> <bases> <qualities> <mapQuals>
+//
+// Nothing is written when no alignments overlap the position, or when a
+// reference base cannot be read from the FASTA file (an error is printed to
+// std::cerr in that case). Without a FASTA file the reference base is 'N'.
+// The <mapQuals> column is left empty unless m_isPrintingMapQualities is set.
+void ConvertPileupFormatVisitor::Visit(const PileupPosition& pileupData)
+{
+
+    // skip if no alignments at this position
+    if (pileupData.PileupAlignments.empty()) return;
+
+    // retrieve reference name
+    const std::string& referenceName = m_references[pileupData.RefId].RefName;
+    const int& position = pileupData.Position;
+
+    // retrieve reference base from FASTA file, if one provided; otherwise default to 'N'
+    char referenceBase('N');
+    if (m_hasFasta && (pileupData.Position < m_references[pileupData.RefId].RefLength)) {
+        if (!m_fasta.GetBase(pileupData.RefId, pileupData.Position, referenceBase)) {
+            std::cerr << "bamtools convert ERROR: pileup conversion - could not read reference "
+                         "base from FASTA file"
+                      << std::endl;
+            return;
+        }
+    }
+
+    // get count of alleles at this position
+    const int numberAlleles = pileupData.PileupAlignments.size();
+
+    // -----------------------------------------------------------
+    // build strings based on alleles at this positionInAlignment
+
+    std::stringstream bases;
+    std::stringstream baseQualities;
+    std::stringstream mapQualities;
+
+    // iterate over alignments at this pileup position
+    std::vector<PileupAlignment>::const_iterator pileupIter = pileupData.PileupAlignments.begin();
+    std::vector<PileupAlignment>::const_iterator pileupEnd = pileupData.PileupAlignments.end();
+    for (; pileupIter != pileupEnd; ++pileupIter) {
+        // bind by reference: copying the entry would also copy the whole
+        // BamAlignment it holds, once per read at every position
+        const PileupAlignment& pa = (*pileupIter);
+        const BamAlignment& ba = pa.Alignment;
+
+        // if beginning of read segment: '^' followed by the map quality as a
+        // character (MapQuality + 33, capped at character 126)
+        if (pa.IsSegmentBegin)
+            bases << '^'
+                  << (((int)ba.MapQuality > 93) ? (char)126 : (char)((int)ba.MapQuality + 33));
+
+        // if current base is not a DELETION
+        if (!pa.IsCurrentDeletion) {
+
+            // get base at current position
+            char base = ba.QueryBases.at(pa.PositionInAlignment);
+
+            // if base matches reference: '.' on forward strand, ',' on reverse strand
+            if (base == '=' || toupper(base) == toupper(referenceBase) ||
+                tolower(base) == tolower(referenceBase)) {
+                base = (ba.IsReverseStrand() ? ',' : '.');
+            }
+
+            // mismatches reference: upper case on forward strand, lower case on reverse strand
+            else
+                base = (ba.IsReverseStrand() ? tolower(base) : toupper(base));
+
+            // store base
+            bases << base;
+
+            // if next position contains insertion: "+<length><inserted bases>"
+            if (pa.IsNextInsertion) {
+                bases << '+' << pa.InsertionLength;
+                for (int i = 1; i <= pa.InsertionLength; ++i) {
+                    char insertedBase = (char)ba.QueryBases.at(pa.PositionInAlignment + i);
+                    bases << (ba.IsReverseStrand() ? (char)tolower(insertedBase)
+                                                   : (char)toupper(insertedBase));
+                }
+            }
+
+            // if next position contains DELETION: "-<length><deleted reference bases>"
+            else if (pa.IsNextDeletion) {
+                bases << '-' << pa.DeletionLength;
+                for (int i = 1; i <= pa.DeletionLength; ++i) {
+                    // deleted bases come from the reference; 'N' when unavailable
+                    char deletedBase('N');
+                    if (m_hasFasta &&
+                        (pileupData.Position + i < m_references[pileupData.RefId].RefLength)) {
+                        if (!m_fasta.GetBase(pileupData.RefId, pileupData.Position + i,
+                                             deletedBase)) {
+                            std::cerr << "bamtools convert ERROR: pileup conversion - could not "
+                                         "read reference base from FASTA file"
+                                      << std::endl;
+                            return;
+                        }
+                    }
+                    bases << (ba.IsReverseStrand() ? (char)tolower(deletedBase)
+                                                   : (char)toupper(deletedBase));
+                }
+            }
+        }
+
+        // otherwise, DELETION
+        else
+            bases << '*';
+
+        // if end of read segment
+        if (pa.IsSegmentEnd) bases << '$';
+
+        // store current base quality
+        baseQualities << ba.Qualities.at(pa.PositionInAlignment);
+
+        // save alignment map quality if desired
+        if (m_isPrintingMapQualities)
+            mapQualities << (((int)ba.MapQuality > 93) ? (char)126
+                                                       : (char)((int)ba.MapQuality + 33));
+    }
+
+    // ----------------------
+    // print results
+
+    // tab-delimited
+    // <refName> <1-based pos> <refBase> <numberAlleles> <bases> <qualities> [mapQuals]
+
+    const std::string TAB(1, '\t');
+    *m_out << referenceName << TAB << position + 1 << TAB << referenceBase << TAB << numberAlleles
+           << TAB << bases.str() << TAB << baseQualities.str() << TAB << mapQualities.str()
+           << std::endl;
+}
diff --git a/src/toolkit/bamtools_convert.h b/src/toolkit/bamtools_convert.h
new file mode 100644
index 0000000..d981963
--- /dev/null
+++ b/src/toolkit/bamtools_convert.h
@@ -0,0 +1,38 @@
+// ***************************************************************************
+// bamtools_convert.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 9 July 2010
+// ---------------------------------------------------------------------------
+// Converts between BAM and a number of other formats
+// ***************************************************************************
+
+#ifndef BAMTOOLS_CONVERT_H
+#define BAMTOOLS_CONVERT_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// Command line tool "bamtools convert". Option values live in ConvertSettings
+// and the actual work is done by ConvertToolPrivate; both are only declared
+// here and defined in the .cpp file.
+class ConvertTool : public AbstractTool
+{
+public:
+    ConvertTool();
+    ~ConvertTool();
+
+    // tool entry points
+    int Help();
+    int Run(int argc, char* argv[]);
+
+private:
+    struct ConvertSettings;
+    struct ConvertToolPrivate;
+
+    ConvertSettings* m_settings;
+    ConvertToolPrivate* m_impl;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_CONVERT_H
diff --git a/src/toolkit/bamtools_count.cpp b/src/toolkit/bamtools_count.cpp
new file mode 100644
index 0000000..95c5edc
--- /dev/null
+++ b/src/toolkit/bamtools_count.cpp
@@ -0,0 +1,228 @@
+// ***************************************************************************
+// bamtools_count.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 December 2012
+// ---------------------------------------------------------------------------
+// Prints alignment count for BAM file(s)
+// ***************************************************************************
+
+#include "bamtools_count.h"
+
+#include <api/BamAlgorithms.h>
+#include <api/BamMultiReader.h>
+#include <utils/bamtools_options.h>
+#include <utils/bamtools_utilities.h>
+using namespace BamTools;
+
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// ---------------------------------------------
+// CountSettings implementation
+
+// Option values for "bamtools count"; each flag/value pair is bound to a
+// command line option in the CountTool constructor.
+struct CountTool::CountSettings
+{
+    // "was this option given?" flags, all false until set
+    bool HasInput;
+    bool HasInputFilelist;
+    bool HasRegion;
+
+    // option values
+    std::vector<std::string> InputFiles;
+    std::string InputFilelist;
+    std::string Region;
+
+    // start with every flag cleared; strings and the file list start empty
+    CountSettings()
+        : HasInput(false)
+        , HasInputFilelist(false)
+        , HasRegion(false)
+    {}
+};
+
+// ---------------------------------------------
+// CountToolPrivate implementation
+
+// Implementation object for "bamtools count": holds a pointer to the settings
+// it was constructed with (it never deletes them) and does the work in Run().
+struct CountTool::CountToolPrivate
+{
+    // ctor & dtor
+public:
+    CountToolPrivate(CountTool::CountSettings* settings)
+        : m_settings(settings)
+    {}
+
+    ~CountToolPrivate() {}
+
+    // interface
+public:
+    bool Run();
+
+    // data members
+private:
+    CountTool::CountSettings* m_settings;
+};
+
+// Counts alignments in the configured input BAM file(s) and prints the total
+// to std::cout.
+//
+// Without -region every alignment is counted. With -region, the reader's
+// SetRegion() is used when index data is available for all inputs; otherwise
+// all alignments are scanned and filtered against the region bounds.
+// Returns false (after printing a message to std::cerr) when the file list or
+// the BAM inputs cannot be opened, the region cannot be parsed, or SetRegion
+// fails; true otherwise.
+bool CountTool::CountToolPrivate::Run()
+{
+
+    // set to default input if none provided
+    if (!m_settings->HasInput && !m_settings->HasInputFilelist)
+        m_settings->InputFiles.push_back(Options::StandardIn());
+
+    // add files in the filelist to the input file list
+    if (m_settings->HasInputFilelist) {
+
+        std::ifstream filelist(m_settings->InputFilelist.c_str(), std::ios::in);
+        if (!filelist.is_open()) {
+            std::cerr << "bamtools count ERROR: could not open input BAM file list... Aborting."
+                      << std::endl;
+            return false;
+        }
+
+        std::string line;
+        while (std::getline(filelist, line))
+            m_settings->InputFiles.push_back(line);
+    }
+
+    // open reader without index
+    BamMultiReader reader;
+    if (!reader.Open(m_settings->InputFiles)) {
+        std::cerr << "bamtools count ERROR: could not open input BAM file(s)... Aborting."
+                  << std::endl;
+        return false;
+    }
+
+    // alignment counter
+    // (unsigned 64-bit: a plain int overflows once the inputs hold more than
+    //  2^31 - 1 alignments)
+    BamAlignment al;
+    unsigned long long alignmentCount(0);
+
+    // if no region specified, count entire file
+    if (!m_settings->HasRegion) {
+        while (reader.GetNextAlignmentCore(al))
+            ++alignmentCount;
+    }
+
+    // otherwise attempt to use region as constraint
+    else {
+
+        // if region string parses OK
+        BamRegion region;
+        if (Utilities::ParseRegionString(m_settings->Region, reader, region)) {
+
+            // attempt to find index files
+            reader.LocateIndexes();
+
+            // if index data available for all BAM files, we can use SetRegion
+            if (reader.HasIndexes()) {
+
+                // attempt to set region on reader
+                if (!reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID,
+                                      region.RightPosition)) {
+                    std::cerr << "bamtools count ERROR: set region failed. Check that REGION "
+                                 "describes a valid range"
+                              << std::endl;
+                    reader.Close();
+                    return false;
+                }
+
+                // everything checks out, just iterate through specified region, counting alignments
+                while (reader.GetNextAlignmentCore(al))
+                    ++alignmentCount;
+            }
+
+            // no index data available, we have to iterate through until we
+            // find overlapping alignments
+            else {
+                while (reader.GetNextAlignmentCore(al)) {
+                    if ((al.RefID >= region.LeftRefID) &&
+                        ((al.Position + al.Length) >= region.LeftPosition) &&
+                        (al.RefID <= region.RightRefID) && (al.Position <= region.RightPosition)) {
+                        ++alignmentCount;
+                    }
+                }
+            }
+        }
+
+        // error parsing REGION string
+        else {
+            std::cerr << "bamtools count ERROR: could not parse REGION - " << m_settings->Region
+                      << std::endl;
+            std::cerr << "Check that REGION is in valid format (see documentation) and that the "
+                         "coordinates are valid"
+                      << std::endl;
+            reader.Close();
+            return false;
+        }
+    }
+
+    // print results
+    std::cout << alignmentCount << std::endl;
+
+    // clean up & exit
+    reader.Close();
+    return true;
+}
+
+// ---------------------------------------------
+// CountTool implementation
+
+// Allocates the settings object and registers the program description and the
+// command line options with Options; each option is bound to a flag/value pair
+// inside m_settings. m_impl stays null until Run() is called.
+CountTool::CountTool()
+    : AbstractTool()
+    , m_settings(new CountSettings)
+    , m_impl(0)
+{
+    // program name, one-line description and usage string
+    Options::SetProgramInfo(
+        "bamtools count", "prints number of alignments in BAM file(s)",
+        "[-in <filename> -in <filename> ... | -list <filelist>] [-region <REGION>]");
+
+    // all options of this tool belong to a single group
+    OptionGroup* ioOptions = Options::CreateOptionGroup("Input & Output");
+
+    // -in
+    Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "",
+                            m_settings->HasInput, m_settings->InputFiles, ioOptions,
+                            Options::StandardIn());
+
+    // -list
+    Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "",
+                            m_settings->HasInputFilelist, m_settings->InputFilelist, ioOptions);
+
+    // -region
+    Options::AddValueOption("-region", "REGION",
+                            "genomic region. Index file is recommended for better performance, and "
+                            "is used automatically if it exists. See \'bamtools help index\' for "
+                            "more details on creating one",
+                            "", m_settings->HasRegion, m_settings->Region, ioOptions);
+}
+
+// Releases the settings object and the private implementation (if any).
+CountTool::~CountTool()
+{
+    // settings were allocated in the constructor
+    delete m_settings;
+    m_settings = 0;
+
+    // m_impl is only allocated by Run(); deleting a null pointer is a no-op
+    delete m_impl;
+    m_impl = 0;
+}
+
+// Shows the help text via Options::DisplayHelp(); always returns 0.
+int CountTool::Help()
+{
+    Options::DisplayHelp();
+    return 0;
+}
+
+// Parses the command line, builds the private implementation around the
+// settings and runs it. Returns 0 when the implementation reports success,
+// 1 otherwise.
+int CountTool::Run(int argc, char* argv[])
+{
+    // parse command line arguments
+    Options::Parse(argc, argv, 1);
+
+    // hand the settings to a fresh implementation object
+    m_impl = new CountToolPrivate(m_settings);
+
+    // map the implementation's result onto an exit code
+    const bool succeeded = m_impl->Run();
+    return succeeded ? 0 : 1;
+}
diff --git a/src/toolkit/bamtools_count.h b/src/toolkit/bamtools_count.h
new file mode 100644
index 0000000..57de0f9
--- /dev/null
+++ b/src/toolkit/bamtools_count.h
@@ -0,0 +1,38 @@
+// ***************************************************************************
+// bamtools_count.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 7 April 2011
+// ---------------------------------------------------------------------------
+// Prints alignment count for BAM file(s)
+// ***************************************************************************
+
+#ifndef BAMTOOLS_COUNT_H
+#define BAMTOOLS_COUNT_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// Command line tool "bamtools count". Option values live in CountSettings and
+// the actual work is done by CountToolPrivate; both are only declared here and
+// defined in the .cpp file.
+class CountTool : public AbstractTool
+{
+public:
+    CountTool();
+    ~CountTool();
+
+    // tool entry points
+    int Help();
+    int Run(int argc, char* argv[]);
+
+private:
+    struct CountSettings;
+    struct CountToolPrivate;
+
+    CountSettings* m_settings;
+    CountToolPrivate* m_impl;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_COUNT_H
diff --git a/src/toolkit/bamtools_coverage.cpp b/src/toolkit/bamtools_coverage.cpp
new file mode 100644
index 0000000..aaf1de4
--- /dev/null
+++ b/src/toolkit/bamtools_coverage.cpp
@@ -0,0 +1,207 @@
+// ***************************************************************************
+// bamtools_coverage.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 24 July 2013
+// ---------------------------------------------------------------------------
+// Prints coverage data for a single BAM file
+// ***************************************************************************
+
+#include "bamtools_coverage.h"
+
+#include <api/BamReader.h>
+#include <utils/bamtools_options.h>
+#include <utils/bamtools_pileup_engine.h>
+using namespace BamTools;
+
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+// ---------------------------------------------
+// CoverageVisitor implementation
+
+// Pileup visitor that prints, for every position it is handed, the reference
+// name, the position and the number of alignments in the pileup.
+class CoverageVisitor : public PileupVisitor
+{
+public:
+    // Keeps its own copy of the reference list and a pointer to the output
+    // stream (the stream is not deleted by this class).
+    CoverageVisitor(const RefVector& references, std::ostream* out)
+        : PileupVisitor()
+        , m_references(references)
+        , m_out(out)
+    {}
+    ~CoverageVisitor() {}
+
+    // PileupVisitor interface implementation
+public:
+    // prints coverage results ( tab-delimited ):
+    //   <refName> <position> <number of pileup alignments>
+    void Visit(const PileupPosition& pileupData)
+    {
+        *m_out << m_references[pileupData.RefId].RefName << '\t';
+        *m_out << pileupData.Position << '\t';
+        *m_out << pileupData.PileupAlignments.size() << std::endl;
+    }
+
+private:
+    RefVector m_references;
+    std::ostream* m_out;
+};
+
+} // namespace BamTools
+
+// ---------------------------------------------
+// CoverageSettings implementation
+
+// Option values for "bamtools coverage"; each flag/value pair is bound to a
+// command line option in the CoverageTool constructor.
+struct CoverageTool::CoverageSettings
+{
+    // "was this option given?" flags
+    bool HasInputFile;
+    bool HasOutputFile;
+
+    // option values
+    std::string InputBamFilename;
+    std::string OutputFilename;
+
+    // flags start cleared; the filenames start out as the Options defaults
+    // for standard input and standard output
+    CoverageSettings()
+        : HasInputFile(false)
+        , HasOutputFile(false)
+        , InputBamFilename(Options::StandardIn())
+        , OutputFilename(Options::StandardOut())
+    {}
+};
+
+// ---------------------------------------------
+// CoverageToolPrivate implementation
+
+// Implementation object for "bamtools coverage": holds a pointer to the
+// settings it was constructed with (it never deletes them), an output stream
+// that initially shares std::cout's buffer, and the reference list.
+struct CoverageTool::CoverageToolPrivate
+{
+    // ctor & dtor
+public:
+    CoverageToolPrivate(CoverageTool::CoverageSettings* settings)
+        : m_settings(settings)
+        , m_out(std::cout.rdbuf())
+    {}
+
+    ~CoverageToolPrivate() {}
+
+    // interface
+public:
+    bool Run();
+
+    // data members
+private:
+    CoverageTool::CoverageSettings* m_settings;
+    std::ostream m_out;
+    RefVector m_references;
+};
+
+// Reads every alignment of the input BAM file through a PileupEngine whose
+// CoverageVisitor writes to m_out (a file when -out was given, otherwise
+// whatever buffer m_out was constructed with).
+// Returns false, after printing a message to std::cerr, when the output file
+// or the input BAM file cannot be opened; true otherwise.
+bool CoverageTool::CoverageToolPrivate::Run()
+{
+    // redirect m_out into a file when an output filename was given
+    std::ofstream outputFile;
+    if (m_settings->HasOutputFile) {
+
+        outputFile.open(m_settings->OutputFilename.c_str());
+        if (!outputFile) {
+            std::cerr << "bamtools coverage ERROR: could not open " << m_settings->OutputFilename
+                      << " for output" << std::endl;
+            return false;
+        }
+
+        // from here on m_out writes through the file's streambuf
+        m_out.rdbuf(outputFile.rdbuf());
+    }
+
+    // open the input BAM file
+    BamReader bamReader;
+    if (!bamReader.Open(m_settings->InputBamFilename)) {
+        std::cerr << "bamtools coverage ERROR: could not open input BAM file: "
+                  << m_settings->InputBamFilename << std::endl;
+        return false;
+    }
+
+    // reference data of the opened file
+    m_references = bamReader.GetReferenceData();
+
+    // pileup engine with the coverage visitor attached
+    CoverageVisitor* visitor = new CoverageVisitor(m_references, &m_out);
+    PileupEngine engine;
+    engine.AddVisitor(visitor);
+
+    // push all alignments through, then flush
+    BamAlignment alignment;
+    while (bamReader.GetNextAlignment(alignment))
+        engine.AddAlignment(alignment);
+    engine.Flush();
+
+    // clean up
+    bamReader.Close();
+    if (m_settings->HasOutputFile) outputFile.close();
+    delete visitor;
+    visitor = 0;
+
+    return true;
+}
+
+// ---------------------------------------------
+// CoverageTool implementation
+
+// Allocates the settings object and registers the program description and the
+// command line options with Options; each option is bound to a flag/value pair
+// inside m_settings. m_impl stays null until Run() is called.
+CoverageTool::CoverageTool()
+    : AbstractTool()
+    , m_settings(new CoverageSettings)
+    , m_impl(0)
+{
+    // program name, one-line description and usage string
+    Options::SetProgramInfo("bamtools coverage", "prints coverage data for a single BAM file",
+                            "[-in <filename>] [-out <filename>]");
+
+    // both options belong to a single group
+    OptionGroup* ioOptions = Options::CreateOptionGroup("Input & Output");
+
+    // -in
+    Options::AddValueOption("-in", "BAM filename", "the input BAM file", "",
+                            m_settings->HasInputFile, m_settings->InputBamFilename, ioOptions,
+                            Options::StandardIn());
+
+    // -out
+    Options::AddValueOption("-out", "filename", "the output file", "", m_settings->HasOutputFile,
+                            m_settings->OutputFilename, ioOptions, Options::StandardOut());
+}
+
+// Releases the settings object and the private implementation (if any).
+CoverageTool::~CoverageTool()
+{
+    // settings were allocated in the constructor
+    delete m_settings;
+    m_settings = 0;
+
+    // m_impl is only allocated by Run(); deleting a null pointer is a no-op
+    delete m_impl;
+    m_impl = 0;
+}
+
+// Shows the help text via Options::DisplayHelp(); always returns 0.
+int CoverageTool::Help()
+{
+    Options::DisplayHelp();
+    return 0;
+}
+
+// Parses the command line, builds the private implementation around the
+// settings and runs it. Returns 0 when the implementation reports success,
+// 1 otherwise.
+int CoverageTool::Run(int argc, char* argv[])
+{
+    // parse command line arguments
+    Options::Parse(argc, argv, 1);
+
+    // hand the settings to a fresh implementation object
+    m_impl = new CoverageToolPrivate(m_settings);
+
+    // map the implementation's result onto an exit code
+    const bool succeeded = m_impl->Run();
+    return succeeded ? 0 : 1;
+}
diff --git a/src/toolkit/bamtools_coverage.h b/src/toolkit/bamtools_coverage.h
new file mode 100644
index 0000000..df5cadc
--- /dev/null
+++ b/src/toolkit/bamtools_coverage.h
@@ -0,0 +1,38 @@
+// ***************************************************************************
+// bamtools_coverage.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 1 August 2010
+// ---------------------------------------------------------------------------
+// Prints coverage data for a single BAM file
+// ***************************************************************************
+
+#ifndef BAMTOOLS_COVERAGE_H
+#define BAMTOOLS_COVERAGE_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// Command line tool "bamtools coverage". Option values live in
+// CoverageSettings and the actual work is done by CoverageToolPrivate; both
+// are only declared here and defined in the .cpp file.
+class CoverageTool : public AbstractTool
+{
+public:
+    CoverageTool();
+    ~CoverageTool();
+
+    // tool entry points
+    int Help();
+    int Run(int argc, char* argv[]);
+
+private:
+    struct CoverageSettings;
+    struct CoverageToolPrivate;
+
+    CoverageSettings* m_settings;
+    CoverageToolPrivate* m_impl;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_COVERAGE_H
diff --git a/src/toolkit/bamtools_filter.cpp b/src/toolkit/bamtools_filter.cpp
new file mode 100644
index 0000000..46b6402
--- /dev/null
+++ b/src/toolkit/bamtools_filter.cpp
@@ -0,0 +1,1048 @@
+// ***************************************************************************
+// bamtools_filter.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 3 May 2013
+// ---------------------------------------------------------------------------
+// Filters BAM file(s) according to some user-specified criteria
+// ***************************************************************************
+
+#include "bamtools_filter.h"
+
+#include <api/BamMultiReader.h>
+#include <api/BamWriter.h>
+#include <utils/bamtools_filter_engine.h>
+#include <utils/bamtools_options.h>
+#include <utils/bamtools_utilities.h>
+using namespace BamTools;
+
+#include <json/json.h>
+using namespace Json;
+
+#include <cstdio>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+// -------------------------------
+// string literal constants
+
+// property names
+// (registered with the filter engine in InitProperties() and used as the keys that are
+// looked up in filter-script objects and dispatched on in BamAlignmentChecker::check())
+const std::string ALIGNMENTFLAG_PROPERTY = "alignmentFlag";
+const std::string CIGAR_PROPERTY = "cigar";
+const std::string INSERTSIZE_PROPERTY = "insertSize";
+const std::string ISDUPLICATE_PROPERTY = "isDuplicate";
+const std::string ISFAILEDQC_PROPERTY = "isFailedQC";
+const std::string ISFIRSTMATE_PROPERTY = "isFirstMate";
+const std::string ISMAPPED_PROPERTY = "isMapped";
+const std::string ISMATEMAPPED_PROPERTY = "isMateMapped";
+const std::string ISMATEREVERSESTRAND_PROPERTY = "isMateReverseStrand";
+const std::string ISPAIRED_PROPERTY = "isPaired";
+const std::string ISPRIMARYALIGNMENT_PROPERTY = "isPrimaryAlignment";
+const std::string ISPROPERPAIR_PROPERTY = "isProperPair";
+const std::string ISREVERSESTRAND_PROPERTY = "isReverseStrand";
+const std::string ISSECONDMATE_PROPERTY = "isSecondMate";
+const std::string ISSINGLETON_PROPERTY = "isSingleton";
+const std::string LENGTH_PROPERTY = "length";
+const std::string MAPQUALITY_PROPERTY = "mapQuality";
+const std::string MATEPOSITION_PROPERTY = "matePosition";
+const std::string MATEREFERENCE_PROPERTY = "mateReference";
+const std::string NAME_PROPERTY = "name";
+const std::string POSITION_PROPERTY = "position";
+const std::string QUERYBASES_PROPERTY = "queryBases";
+const std::string REFERENCE_PROPERTY = "reference";
+const std::string TAG_PROPERTY = "tag";
+
+// textual spellings of boolean values
+// (TRUE_STR is the default token of every alignment-flag filter in FilterSettings)
+const std::string TRUE_STR = "true";
+const std::string FALSE_STR = "false";
+
+// reference data of the opened input; assigned in FilterToolPrivate::Run() and read by
+// BamAlignmentChecker to translate RefID / MateRefID into reference names
+RefVector filterToolReferences;
+
+// Checker type plugged into FilterEngine<BamAlignmentChecker>: decides whether a single
+// BamAlignment satisfies all properties of one PropertyFilter.
+struct BamAlignmentChecker
+{
+    // Tests 'al' against every property stored in 'filter'.
+    //
+    // Returns true only if the alignment passes all properties; evaluation stops at the
+    // first property that fails. The "reference" and "mateReference" properties resolve
+    // names through the global filterToolReferences.
+    bool check(const PropertyFilter& filter, const BamAlignment& al)
+    {
+
+        bool keepAlignment = true;
+        const PropertyMap& properties = filter.Properties;
+        PropertyMap::const_iterator propertyIter = properties.begin();
+        PropertyMap::const_iterator propertyEnd = properties.end();
+        for (; propertyIter != propertyEnd; ++propertyIter) {
+
+            // check alignment data field depending on propertyName
+            const std::string& propertyName = (*propertyIter).first;
+            const PropertyFilterValue& valueFilter = (*propertyIter).second;
+
+            if (propertyName == ALIGNMENTFLAG_PROPERTY)
+                keepAlignment &= valueFilter.check(al.AlignmentFlag);
+            else if (propertyName == CIGAR_PROPERTY) {
+                // rebuild the CIGAR text (<length><type>...) and compare it as a string;
+                // an alignment without CIGAR data is not tested against this property
+                std::stringstream cigarSs;
+                const std::vector<CigarOp>& cigarData = al.CigarData;
+                if (!cigarData.empty()) {
+                    std::vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
+                    std::vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
+                    for (; cigarIter != cigarEnd; ++cigarIter) {
+                        const CigarOp& op = (*cigarIter);
+                        cigarSs << op.Length << op.Type;
+                    }
+                    keepAlignment &= valueFilter.check(cigarSs.str());
+                }
+            } else if (propertyName == INSERTSIZE_PROPERTY)
+                keepAlignment &= valueFilter.check(al.InsertSize);
+            else if (propertyName == ISDUPLICATE_PROPERTY)
+                keepAlignment &= valueFilter.check(al.IsDuplicate());
+            else if (propertyName == ISFAILEDQC_PROPERTY)
+                keepAlignment &= valueFilter.check(al.IsFailedQC());
+            else if (propertyName == ISFIRSTMATE_PROPERTY)
+                keepAlignment &= valueFilter.check(al.IsFirstMate());
+            else if (propertyName == ISMAPPED_PROPERTY)
+                keepAlignment &= valueFilter.check(al.IsMapped());
+            else if (propertyName == ISMATEMAPPED_PROPERTY)
+                keepAlignment &= valueFilter.check(al.IsMateMapped());
+            else if (propertyName == ISMATEREVERSESTRAND_PROPERTY)
+                keepAlignment &= valueFilter.check(al.IsMateReverseStrand());
+            else if (propertyName == ISPAIRED_PROPERTY)
+                keepAlignment &= valueFilter.check(al.IsPaired());
+            else if (propertyName == ISPRIMARYALIGNMENT_PROPERTY)
+                keepAlignment &= valueFilter.check(al.IsPrimaryAlignment());
+            else if (propertyName == ISPROPERPAIR_PROPERTY)
+                keepAlignment &= valueFilter.check(al.IsProperPair());
+            else if (propertyName == ISREVERSESTRAND_PROPERTY)
+                keepAlignment &= valueFilter.check(al.IsReverseStrand());
+            else if (propertyName == ISSECONDMATE_PROPERTY)
+                keepAlignment &= valueFilter.check(al.IsSecondMate());
+            else if (propertyName == ISSINGLETON_PROPERTY) {
+                // singleton: paired read that is mapped while its mate is not
+                const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped();
+                keepAlignment &= valueFilter.check(isSingleton);
+            } else if (propertyName == LENGTH_PROPERTY)
+                keepAlignment &= valueFilter.check(al.Length);
+            else if (propertyName == MAPQUALITY_PROPERTY)
+                keepAlignment &= valueFilter.check(al.MapQuality);
+            else if (propertyName == MATEPOSITION_PROPERTY)
+                // compare the mate's position (not its reference ID); only meaningful
+                // for paired reads whose mate is mapped
+                keepAlignment &=
+                    (al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MatePosition));
+            else if (propertyName == MATEREFERENCE_PROPERTY) {
+                // no mate reference to compare against => alignment fails the filter
+                if (!al.IsPaired() || !al.IsMateMapped()) return false;
+                BAMTOOLS_ASSERT_MESSAGE(
+                    (al.MateRefID >= 0 && (al.MateRefID < (int)filterToolReferences.size())),
+                    "Invalid MateRefID");
+                const std::string& refName = filterToolReferences.at(al.MateRefID).RefName;
+                keepAlignment &= valueFilter.check(refName);
+            } else if (propertyName == NAME_PROPERTY)
+                keepAlignment &= valueFilter.check(al.Name);
+            else if (propertyName == POSITION_PROPERTY)
+                keepAlignment &= valueFilter.check(al.Position);
+            else if (propertyName == QUERYBASES_PROPERTY)
+                keepAlignment &= valueFilter.check(al.QueryBases);
+            else if (propertyName == REFERENCE_PROPERTY) {
+                BAMTOOLS_ASSERT_MESSAGE(
+                    (al.RefID >= 0 && (al.RefID < (int)filterToolReferences.size())),
+                    "Invalid RefID");
+                const std::string& refName = filterToolReferences.at(al.RefID).RefName;
+                keepAlignment &= valueFilter.check(refName);
+            } else if (propertyName == TAG_PROPERTY)
+                keepAlignment &= checkAlignmentTag(valueFilter, al);
+            else
+                BAMTOOLS_ASSERT_UNREACHABLE;
+
+            // if alignment fails at ANY point, just quit and return false
+            if (!keepAlignment) return false;
+        }
+
+        BAMTOOLS_ASSERT_MESSAGE(
+            keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here");
+        return keepAlignment;
+    }
+
+    // Tests one tag of 'al' against a "XX:<token>" filter stored in valueFilter.Value.
+    //
+    // The first two characters name the tag; everything after the third character is a
+    // token (optionally carrying compare operators) parsed according to the tag's type
+    // as stored in the alignment.
+    //
+    // Returns false if the filter is not a string, is shorter than 4 characters, the tag
+    // is absent, the tag type is unsupported, or the token cannot be parsed; otherwise
+    // returns the result of the comparison.
+    bool checkAlignmentTag(const PropertyFilterValue& valueFilter, const BamAlignment& al)
+    {
+
+        // ensure filter contains string data
+        Variant entireTagFilter = valueFilter.Value;
+        if (!entireTagFilter.is_type<std::string>()) return false;
+
+        // localize string from variant
+        const std::string& entireTagFilterString = entireTagFilter.get<std::string>();
+
+        // ensure we have at least "XX:x"
+        if (entireTagFilterString.length() < 4) return false;
+
+        // get tagName & lookup in alignment
+        // if found, set tagType to tag type character
+        // if not found, return false
+        const std::string tagName = entireTagFilterString.substr(0, 2);
+        char tagType = '\0';
+        if (!al.GetTagType(tagName, tagType)) return false;
+
+        // remove tagName & ':' from beginning tagFilter
+        std::string tagFilterString = entireTagFilterString.substr(3);
+
+        // switch on tag type to set tag query value & parse filter token
+        int8_t asciiFilterValue, asciiQueryValue;
+        int32_t intFilterValue, intQueryValue;
+        uint32_t uintFilterValue, uintQueryValue;
+        float realFilterValue, realQueryValue;
+        std::string stringFilterValue, stringQueryValue;
+
+        PropertyFilterValue tagFilter;
+        PropertyFilterValue::ValueCompareType compareType;
+        bool keepAlignment = false;
+        switch (tagType) {
+
+            // ASCII tag type
+            case 'A':
+                if (al.GetTag(tagName, asciiQueryValue)) {
+                    if (FilterEngine<BamAlignmentChecker>::parseToken(
+                            tagFilterString, asciiFilterValue, compareType)) {
+                        tagFilter.Value = asciiFilterValue;
+                        tagFilter.Type = compareType;
+                        keepAlignment = tagFilter.check(asciiQueryValue);
+                    }
+                }
+                break;
+
+            // signed int tag type
+            case 'c':
+            case 's':
+            case 'i':
+                if (al.GetTag(tagName, intQueryValue)) {
+                    if (FilterEngine<BamAlignmentChecker>::parseToken(
+                            tagFilterString, intFilterValue, compareType)) {
+                        tagFilter.Value = intFilterValue;
+                        tagFilter.Type = compareType;
+                        keepAlignment = tagFilter.check(intQueryValue);
+                    }
+                }
+                break;
+
+            // unsigned int tag type
+            case 'C':
+            case 'S':
+            case 'I':
+                if (al.GetTag(tagName, uintQueryValue)) {
+                    if (FilterEngine<BamAlignmentChecker>::parseToken(
+                            tagFilterString, uintFilterValue, compareType)) {
+                        tagFilter.Value = uintFilterValue;
+                        tagFilter.Type = compareType;
+                        keepAlignment = tagFilter.check(uintQueryValue);
+                    }
+                }
+                break;
+
+            // 'real' tag type
+            case 'f':
+                if (al.GetTag(tagName, realQueryValue)) {
+                    if (FilterEngine<BamAlignmentChecker>::parseToken(
+                            tagFilterString, realFilterValue, compareType)) {
+                        tagFilter.Value = realFilterValue;
+                        tagFilter.Type = compareType;
+                        keepAlignment = tagFilter.check(realQueryValue);
+                    }
+                }
+                break;
+
+            // string tag type
+            case 'Z':
+            case 'H':
+                if (al.GetTag(tagName, stringQueryValue)) {
+                    if (FilterEngine<BamAlignmentChecker>::parseToken(
+                            tagFilterString, stringFilterValue, compareType)) {
+                        tagFilter.Value = stringFilterValue;
+                        tagFilter.Type = compareType;
+                        keepAlignment = tagFilter.check(stringQueryValue);
+                    }
+                }
+                break;
+
+            // unknown tag type
+            default:
+                keepAlignment = false;
+        }
+
+        return keepAlignment;
+    }
+};
+
+} // namespace BamTools
+
+// ---------------------------------------------
+// FilterSettings implementation
+
+// Option values of the filter tool. Each Has*/Is* flag is paired with the value member
+// of the same name; the FilterTool constructor hands these members to the Options
+// helpers, and FilterToolPrivate reads them when setting up and running the filters.
+struct FilterTool::FilterSettings
+{
+
+ // ----------------------------------
+ // IO opts
+
+ // flags
+ bool HasInput;
+ bool HasInputFilelist;
+ bool HasOutput;
+ bool HasRegion;
+ bool HasScript;
+ bool IsForceCompression;
+
+ // filenames
+ std::vector<std::string> InputFiles;
+ std::string InputFilelist;
+ std::string OutputFilename;
+ std::string Region;
+ std::string ScriptFilename;
+
+ // -----------------------------------
+ // General filter opts
+
+ // flags
+ bool HasAlignmentFlagFilter;
+ bool HasInsertSizeFilter;
+ bool HasLengthFilter;
+ bool HasMapQualityFilter;
+ bool HasNameFilter;
+ bool HasQueryBasesFilter;
+ bool HasTagFilter; //(s)
+
+ // filters (raw tokens, parsed later in AddPropertyTokensToFilter())
+ std::string AlignmentFlagFilter;
+ std::string InsertSizeFilter;
+ std::string LengthFilter;
+ std::string MapQualityFilter;
+ std::string NameFilter;
+ std::string QueryBasesFilter;
+ std::string TagFilter; // support multiple ?
+
+ // -----------------------------------
+ // AlignmentFlag filter opts
+
+ // flags
+ bool HasIsDuplicateFilter;
+ bool HasIsFailedQCFilter;
+ bool HasIsFirstMateFilter;
+ bool HasIsMappedFilter;
+ bool HasIsMateMappedFilter;
+ bool HasIsMateReverseStrandFilter;
+ bool HasIsPairedFilter;
+ bool HasIsPrimaryAlignmentFilter;
+ bool HasIsProperPairFilter;
+ bool HasIsReverseStrandFilter;
+ bool HasIsSecondMateFilter;
+ bool HasIsSingletonFilter;
+
+ // filters (raw tokens; each defaults to TRUE_STR)
+ std::string IsDuplicateFilter;
+ std::string IsFailedQCFilter;
+ std::string IsFirstMateFilter;
+ std::string IsMappedFilter;
+ std::string IsMateMappedFilter;
+ std::string IsMateReverseStrandFilter;
+ std::string IsPairedFilter;
+ std::string IsPrimaryAlignmentFilter;
+ std::string IsProperPairFilter;
+ std::string IsReverseStrandFilter;
+ std::string IsSecondMateFilter;
+ std::string IsSingletonFilter;
+
+ // ---------------------------------
+ // constructor
+
+ // Defaults: every flag is false, the output filename is Options::StandardOut(),
+ // and every alignment-flag filter token is TRUE_STR; the remaining strings and the
+ // input file list are default-constructed (empty).
+ FilterSettings()
+ : HasInput(false)
+ , HasInputFilelist(false)
+ , HasOutput(false)
+ , HasRegion(false)
+ , HasScript(false)
+ , IsForceCompression(false)
+ , OutputFilename(Options::StandardOut())
+ , HasAlignmentFlagFilter(false)
+ , HasInsertSizeFilter(false)
+ , HasLengthFilter(false)
+ , HasMapQualityFilter(false)
+ , HasNameFilter(false)
+ , HasQueryBasesFilter(false)
+ , HasTagFilter(false)
+ , HasIsDuplicateFilter(false)
+ , HasIsFailedQCFilter(false)
+ , HasIsFirstMateFilter(false)
+ , HasIsMappedFilter(false)
+ , HasIsMateMappedFilter(false)
+ , HasIsMateReverseStrandFilter(false)
+ , HasIsPairedFilter(false)
+ , HasIsPrimaryAlignmentFilter(false)
+ , HasIsProperPairFilter(false)
+ , HasIsReverseStrandFilter(false)
+ , HasIsSecondMateFilter(false)
+ , HasIsSingletonFilter(false)
+ , IsDuplicateFilter(TRUE_STR)
+ , IsFailedQCFilter(TRUE_STR)
+ , IsFirstMateFilter(TRUE_STR)
+ , IsMappedFilter(TRUE_STR)
+ , IsMateMappedFilter(TRUE_STR)
+ , IsMateReverseStrandFilter(TRUE_STR)
+ , IsPairedFilter(TRUE_STR)
+ , IsPrimaryAlignmentFilter(TRUE_STR)
+ , IsProperPairFilter(TRUE_STR)
+ , IsReverseStrandFilter(TRUE_STR)
+ , IsSecondMateFilter(TRUE_STR)
+ , IsSingletonFilter(TRUE_STR)
+ {}
+};
+
+// ---------------------------------------------
+// FilterToolPrivate declaration
+
+// Private implementation of the filter tool: builds the filter engine from the command
+// line or a filter script and streams the input alignments through it.
+class FilterTool::FilterToolPrivate
+{
+
+ // ctor & dtor
+public:
+ FilterToolPrivate(FilterTool::FilterSettings* settings);
+ ~FilterToolPrivate();
+
+ // 'public' interface
+public:
+ // runs the whole filter operation; returns true on success
+ bool Run();
+
+ // internal methods
+private:
+ // parses raw tokens and registers them as properties of the named filter
+ bool AddPropertyTokensToFilter(const std::string& filterName,
+ const std::map<std::string, std::string>& propertyTokens);
+ // returns the filter engine's verdict for one alignment
+ bool CheckAlignment(const BamAlignment& al);
+ // reads the script file named in the settings
+ const std::string GetScriptContents();
+ // registers all supported property names with the filter engine
+ void InitProperties();
+ // builds the "COMMAND_LINE" filter from the command line options
+ bool ParseCommandLine();
+ // builds one named filter from a JSON object
+ bool ParseFilterObject(const std::string& filterName, const Json::Value& filterObject);
+ // builds the filter(s) and rule from the JSON script
+ bool ParseScript();
+ // InitProperties() followed by ParseScript() or ParseCommandLine()
+ bool SetupFilters();
+
+ // data members
+private:
+ std::vector<std::string> m_propertyNames; // supported property names
+ FilterTool::FilterSettings* m_settings; // option values; not deleted by this class
+ FilterEngine<BamAlignmentChecker> m_filterEngine;
+};
+
+// ---------------------------------------------
+// FilterToolPrivate implementation
+
+// constructor: stores the settings pointer (it is only kept, not copied)
+FilterTool::FilterToolPrivate::FilterToolPrivate(FilterTool::FilterSettings* settings)
+ : m_settings(settings)
+{}
+
+// destructor: nothing to release here (m_settings is not deleted by this class)
+FilterTool::FilterToolPrivate::~FilterToolPrivate() {}
+
+// Converts each (property name -> raw token) pair into a typed value plus compare type
+// and registers it on the named filter of the filter engine.
+//
+// filterName     - name of the filter receiving the properties
+// propertyTokens - unparsed tokens keyed by property name
+//
+// Returns false (after printing an error) if a property name is unknown or if a token
+// cannot be parsed for its property's value type; returns true otherwise.
+bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(
+    const std::string& filterName, const std::map<std::string, std::string>& propertyTokens)
+{
+    // scratch values for token parsing; initialized so that nothing indeterminate can
+    // ever reach the filter engine
+    bool boolValue = false;
+    int32_t int32Value = 0;
+    uint16_t uint16Value = 0;
+    uint32_t uint32Value = 0;
+    std::string stringValue;
+    PropertyFilterValue::ValueCompareType type = PropertyFilterValue::EXACT;
+
+    // iterate over property token map
+    std::map<std::string, std::string>::const_iterator mapIter = propertyTokens.begin();
+    std::map<std::string, std::string>::const_iterator mapEnd = propertyTokens.end();
+    for (; mapIter != mapEnd; ++mapIter) {
+
+        const std::string& propertyName = (*mapIter).first;
+        const std::string& token = (*mapIter).second;
+
+        // ------------------------------
+        // convert token to value & compare type
+        // then add to filter engine (only if the token parsed successfully)
+        bool parsed = true;
+
+        // bool conversion
+        if (propertyName == ISDUPLICATE_PROPERTY || propertyName == ISFAILEDQC_PROPERTY ||
+            propertyName == ISFIRSTMATE_PROPERTY || propertyName == ISMAPPED_PROPERTY ||
+            propertyName == ISMATEMAPPED_PROPERTY || propertyName == ISMATEREVERSESTRAND_PROPERTY ||
+            propertyName == ISPAIRED_PROPERTY || propertyName == ISPRIMARYALIGNMENT_PROPERTY ||
+            propertyName == ISPROPERPAIR_PROPERTY || propertyName == ISREVERSESTRAND_PROPERTY ||
+            propertyName == ISSECONDMATE_PROPERTY || propertyName == ISSINGLETON_PROPERTY) {
+            parsed = FilterEngine<BamAlignmentChecker>::parseToken(token, boolValue, type);
+            if (parsed) m_filterEngine.setProperty(filterName, propertyName, boolValue, type);
+        }
+
+        // int32_t conversion
+        else if (propertyName == INSERTSIZE_PROPERTY || propertyName == LENGTH_PROPERTY ||
+                 propertyName == MATEPOSITION_PROPERTY || propertyName == POSITION_PROPERTY) {
+            parsed = FilterEngine<BamAlignmentChecker>::parseToken(token, int32Value, type);
+            if (parsed) m_filterEngine.setProperty(filterName, propertyName, int32Value, type);
+        }
+
+        // uint16_t conversion
+        else if (propertyName == MAPQUALITY_PROPERTY) {
+            parsed = FilterEngine<BamAlignmentChecker>::parseToken(token, uint16Value, type);
+            if (parsed) m_filterEngine.setProperty(filterName, propertyName, uint16Value, type);
+        }
+
+        // uint32_t conversion
+        else if (propertyName == ALIGNMENTFLAG_PROPERTY) {
+            parsed = FilterEngine<BamAlignmentChecker>::parseToken(token, uint32Value, type);
+            if (parsed) m_filterEngine.setProperty(filterName, propertyName, uint32Value, type);
+        }
+
+        // string conversion
+        else if (propertyName == CIGAR_PROPERTY || propertyName == MATEREFERENCE_PROPERTY ||
+                 propertyName == NAME_PROPERTY || propertyName == QUERYBASES_PROPERTY ||
+                 propertyName == REFERENCE_PROPERTY) {
+            parsed = FilterEngine<BamAlignmentChecker>::parseToken(token, stringValue, type);
+            if (parsed) m_filterEngine.setProperty(filterName, propertyName, stringValue, type);
+        }
+
+        else if (propertyName == TAG_PROPERTY) {
+            // this will be stored directly as the TAG:VALUE token
+            // (VALUE may contain compare ops, will be parsed out later)
+            m_filterEngine.setProperty(filterName, propertyName, token, PropertyFilterValue::EXACT);
+        }
+
+        // else unknown property
+        else {
+            std::cerr << "bamtools filter ERROR: unknown property - " << propertyName << std::endl;
+            return false;
+        }
+
+        // a token that could not be parsed must not be silently turned into a filter
+        if (!parsed) {
+            std::cerr << "bamtools filter ERROR: could not parse value '" << token
+                      << "' for property - " << propertyName << std::endl;
+            return false;
+        }
+    }
+    return true;
+}
+
+// Returns the filter engine's verdict for 'al' (true means the alignment is kept by Run()).
+bool FilterTool::FilterToolPrivate::CheckAlignment(const BamAlignment& al)
+{
+ return m_filterEngine.check(al);
+}
+
+// Reads the whole filter script named by m_settings->ScriptFilename.
+//
+// Returns the file contents, or an empty string (after printing an error) if the file
+// cannot be opened or a read error occurs. The file handle is closed on every path
+// after a successful open.
+const std::string FilterTool::FilterToolPrivate::GetScriptContents()
+{
+
+    // open script for reading
+    FILE* inFile = fopen(m_settings->ScriptFilename.c_str(), "rb");
+    if (!inFile) {
+        std::cerr << "bamtools filter ERROR: could not open script: " << m_settings->ScriptFilename
+                  << " for reading" << std::endl;
+        return std::string();
+    }
+
+    // read in entire script contents, one block at a time; fread() signals end-of-file
+    // through a zero count, so no look-ahead character is needed (storing fgetc()'s int
+    // result in a 'char' cannot represent EOF reliably)
+    char buffer[1024];
+    std::string contents;
+    std::size_t bytesRead = 0;
+    while ((bytesRead = fread(buffer, 1, sizeof(buffer), inFile)) > 0)
+        contents.append(buffer, bytesRead);
+
+    // note whether reading stopped because of an error, then always close the file
+    const bool readFailed = (ferror(inFile) != 0);
+    fclose(inFile);
+
+    if (readFailed) {
+        std::cerr << "bamtools filter ERROR: could not read script contents" << std::endl;
+        return std::string();
+    }
+
+    // return script contents
+    return contents;
+}
+
+// Appends every supported property name to m_propertyNames, then registers each name
+// held in m_propertyNames with the filter engine.
+void FilterTool::FilterToolPrivate::InitProperties()
+{
+    // table of supported properties, in registration order
+    const std::string* const supportedProperties[] = {
+        &ALIGNMENTFLAG_PROPERTY,
+        &CIGAR_PROPERTY,
+        &INSERTSIZE_PROPERTY,
+        &ISDUPLICATE_PROPERTY,
+        &ISFAILEDQC_PROPERTY,
+        &ISFIRSTMATE_PROPERTY,
+        &ISMAPPED_PROPERTY,
+        &ISMATEMAPPED_PROPERTY,
+        &ISMATEREVERSESTRAND_PROPERTY,
+        &ISPAIRED_PROPERTY,
+        &ISPRIMARYALIGNMENT_PROPERTY,
+        &ISPROPERPAIR_PROPERTY,
+        &ISREVERSESTRAND_PROPERTY,
+        &ISSECONDMATE_PROPERTY,
+        &ISSINGLETON_PROPERTY,
+        &LENGTH_PROPERTY,
+        &MAPQUALITY_PROPERTY,
+        &MATEPOSITION_PROPERTY,
+        &MATEREFERENCE_PROPERTY,
+        &NAME_PROPERTY,
+        &POSITION_PROPERTY,
+        &QUERYBASES_PROPERTY,
+        &REFERENCE_PROPERTY,
+        &TAG_PROPERTY};
+    const std::size_t supportedCount = sizeof(supportedProperties) / sizeof(supportedProperties[0]);
+
+    // store property names in vector
+    for (std::size_t i = 0; i < supportedCount; ++i)
+        m_propertyNames.push_back(*supportedProperties[i]);
+
+    // hand the vector contents to FilterEngine<BamAlignmentChecker>
+    for (std::vector<std::string>::size_type i = 0; i < m_propertyNames.size(); ++i)
+        m_filterEngine.addProperty(m_propertyNames[i]);
+}
+
+// Creates the "COMMAND_LINE" filter and fills it with one property per filter option
+// whose Has*Filter flag is set in the settings.
+//
+// Returns the result of AddPropertyTokensToFilter() for the collected tokens.
+bool FilterTool::FilterToolPrivate::ParseCommandLine()
+{
+    // add a rule set to filter engine
+    const std::string CMD = "COMMAND_LINE";
+    m_filterEngine.addFilter(CMD);
+
+    // collect property name -> raw token for every option that was supplied
+    // (all keys are distinct, so plain assignment into the fresh map is sufficient)
+    std::map<std::string, std::string> tokens;
+
+    // general filters
+    if (m_settings->HasAlignmentFlagFilter)
+        tokens[ALIGNMENTFLAG_PROPERTY] = m_settings->AlignmentFlagFilter;
+    if (m_settings->HasInsertSizeFilter) tokens[INSERTSIZE_PROPERTY] = m_settings->InsertSizeFilter;
+    if (m_settings->HasLengthFilter) tokens[LENGTH_PROPERTY] = m_settings->LengthFilter;
+    if (m_settings->HasMapQualityFilter) tokens[MAPQUALITY_PROPERTY] = m_settings->MapQualityFilter;
+    if (m_settings->HasNameFilter) tokens[NAME_PROPERTY] = m_settings->NameFilter;
+    if (m_settings->HasQueryBasesFilter) tokens[QUERYBASES_PROPERTY] = m_settings->QueryBasesFilter;
+    if (m_settings->HasTagFilter) tokens[TAG_PROPERTY] = m_settings->TagFilter;
+
+    // alignment flag filters
+    if (m_settings->HasIsDuplicateFilter)
+        tokens[ISDUPLICATE_PROPERTY] = m_settings->IsDuplicateFilter;
+    if (m_settings->HasIsFailedQCFilter) tokens[ISFAILEDQC_PROPERTY] = m_settings->IsFailedQCFilter;
+    if (m_settings->HasIsFirstMateFilter)
+        tokens[ISFIRSTMATE_PROPERTY] = m_settings->IsFirstMateFilter;
+    if (m_settings->HasIsMappedFilter) tokens[ISMAPPED_PROPERTY] = m_settings->IsMappedFilter;
+    if (m_settings->HasIsMateMappedFilter)
+        tokens[ISMATEMAPPED_PROPERTY] = m_settings->IsMateMappedFilter;
+    if (m_settings->HasIsMateReverseStrandFilter)
+        tokens[ISMATEREVERSESTRAND_PROPERTY] = m_settings->IsMateReverseStrandFilter;
+    if (m_settings->HasIsPairedFilter) tokens[ISPAIRED_PROPERTY] = m_settings->IsPairedFilter;
+    if (m_settings->HasIsPrimaryAlignmentFilter)
+        tokens[ISPRIMARYALIGNMENT_PROPERTY] = m_settings->IsPrimaryAlignmentFilter;
+    if (m_settings->HasIsProperPairFilter)
+        tokens[ISPROPERPAIR_PROPERTY] = m_settings->IsProperPairFilter;
+    if (m_settings->HasIsReverseStrandFilter)
+        tokens[ISREVERSESTRAND_PROPERTY] = m_settings->IsReverseStrandFilter;
+    if (m_settings->HasIsSecondMateFilter)
+        tokens[ISSECONDMATE_PROPERTY] = m_settings->IsSecondMateFilter;
+    if (m_settings->HasIsSingletonFilter)
+        tokens[ISSINGLETON_PROPERTY] = m_settings->IsSingletonFilter;
+
+    // add these properties to filter set "COMMAND_LINE"
+    return AddPropertyTokensToFilter(CMD, tokens);
+}
+
+// Builds one named filter from a JSON object: every known property that is present in
+// 'filterObject' is collected as a string token, the filter is added to the engine, and
+// the tokens are parsed and registered.
+//
+// Returns the result of AddPropertyTokensToFilter() for the collected tokens.
+bool FilterTool::FilterToolPrivate::ParseFilterObject(const std::string& filterName,
+                                                      const Json::Value& filterObject)
+{
+    // sentinel returned by Json::Value::get() for absent properties
+    const Json::Value null(Json::nullValue);
+
+    // property name -> raw token, for every known property defined in the filter
+    std::map<std::string, std::string> propertyTokens;
+    for (std::vector<std::string>::size_type i = 0; i < m_propertyNames.size(); ++i) {
+        const std::string& propertyName = m_propertyNames[i];
+        const Json::Value propertyValue = filterObject.get(propertyName, null);
+        if (propertyValue != null)
+            propertyTokens.insert(std::make_pair(propertyName, propertyValue.asString()));
+    }
+
+    // add this filter to engine
+    m_filterEngine.addFilter(filterName);
+
+    // add token list to this filter
+    return AddPropertyTokensToFilter(filterName, propertyTokens);
+}
+
+// Parses the JSON filter script and sets up the filter engine from it.
+//
+// If the root object has a "filters" member, each entry becomes a filter named by its
+// "id" (or by its index when no id is given) and the optional "rule" string is passed
+// to the engine. Otherwise the root object itself is parsed as a single filter named
+// "ROOT".
+//
+// Returns false if the script cannot be parsed or any filter fails to parse.
+bool FilterTool::FilterToolPrivate::ParseScript()
+{
+    // read in script contents from file
+    const std::string document = GetScriptContents();
+    std::istringstream sin(document);
+
+    // set up JsonCPP reader and attempt to parse script
+    Json::Value root;
+    Json::CharReaderBuilder rbuilder;
+    std::string errs;
+    if (!Json::parseFromStream(rbuilder, sin, &root, &errs)) {
+        // use built-in error reporting mechanism to alert user what was wrong with the script
+        std::cerr << "bamtools filter ERROR: failed to parse script - see error message(s) below"
+                  << std::endl
+                  << errs;
+        return false;
+    }
+
+    // no "filters" member: root is the only filter (just contains properties)
+    const Json::Value filters = root["filters"];
+    if (filters.isNull()) return ParseFilterObject("ROOT", root);
+
+    // otherwise create & parse every filter found, remembering any failure
+    bool success = true;
+    int filterIndex = 0;
+    for (Json::Value::const_iterator it = filters.begin(); it != filters.end();
+         ++it, ++filterIndex) {
+        Json::Value filter = (*it);
+
+        // name the filter by its "id" tag if supplied, else by its array index
+        std::string filterName;
+        const Json::Value id = filter["id"];
+        if (id.isNull()) {
+            std::stringstream convert;
+            convert << filterIndex;
+            filterName = convert.str();
+        } else {
+            filterName = id.asString();
+        }
+
+        const bool parsed = ParseFilterObject(filterName, filter);
+        success = success && parsed;
+    }
+
+    // see if user defined a "rule" for these filters
+    // otherwise, an empty rule string is handed to the filter engine
+    std::string ruleString;
+    const Json::Value rule = root["rule"];
+    if (rule.isString()) ruleString = rule.asString();
+    m_filterEngine.setRule(ruleString);
+
+    // return success/fail
+    return success;
+}
+
+// Runs the filter tool: resolves the input files, sets up the filters, opens reader and
+// writer, and saves every alignment that passes CheckAlignment() (restricted to the
+// requested region, if one was given).
+//
+// Returns true on success; prints an error and returns false if the file list, the
+// filters, the input, the output or the region cannot be set up.
+bool FilterTool::FilterToolPrivate::Run()
+{
+    // set to default input if none provided
+    if (!m_settings->HasInput && !m_settings->HasInputFilelist)
+        m_settings->InputFiles.push_back(Options::StandardIn());
+
+    // add files in the filelist to the input file list
+    if (m_settings->HasInputFilelist) {
+        std::ifstream filelist(m_settings->InputFilelist.c_str(), std::ios::in);
+        if (!filelist.is_open()) {
+            std::cerr << "bamtools filter ERROR: could not open input BAM file list... Aborting."
+                      << std::endl;
+            return false;
+        }
+
+        for (std::string entry; std::getline(filelist, entry);)
+            m_settings->InputFiles.push_back(entry);
+    }
+
+    // initialize defined properties & user-specified filters; quit if failed
+    if (!SetupFilters()) return false;
+
+    // open reader without index
+    BamMultiReader reader;
+    if (!reader.Open(m_settings->InputFiles)) {
+        std::cerr << "bamtools filter ERROR: could not open input files for reading." << std::endl;
+        return false;
+    }
+
+    // retrieve reader header & reference data
+    const std::string headerText = reader.GetHeaderText();
+    filterToolReferences = reader.GetReferenceData();
+
+    // output to stdout stays uncompressed unless compression is forced
+    const bool writeUncompressed =
+        (m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression);
+    const BamWriter::CompressionMode compressionMode =
+        writeUncompressed ? BamWriter::Uncompressed : BamWriter::Compressed;
+
+    // open BamWriter
+    BamWriter writer;
+    writer.SetCompressionMode(compressionMode);
+    if (!writer.Open(m_settings->OutputFilename, headerText, filterToolReferences)) {
+        std::cerr << "bamtools filter ERROR: could not open " << m_settings->OutputFilename
+                  << " for writing." << std::endl;
+        reader.Close();
+        return false;
+    }
+
+    BamAlignment al;
+    if (m_settings->HasRegion) {
+
+        // region requested: it has to parse before anything is read
+        BamRegion region;
+        if (!Utilities::ParseRegionString(m_settings->Region, reader, region)) {
+            std::cerr << "bamtools filter ERROR: could not parse REGION: " << m_settings->Region
+                      << std::endl;
+            std::cerr << "Check that REGION is in valid format (see documentation) and that the "
+                         "coordinates are valid"
+                      << std::endl;
+            reader.Close();
+            return false;
+        }
+
+        // attempt to find index files
+        reader.LocateIndexes();
+
+        if (reader.HasIndexes()) {
+
+            // index data available for all BAM files: let the reader apply the region
+            if (!reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID,
+                                  region.RightPosition)) {
+                std::cerr << "bamtools filter ERROR: set region failed. Check that REGION "
+                             "describes a valid range"
+                          << std::endl;
+                reader.Close();
+                return false;
+            }
+
+            // iterate through specified region, filtering alignments
+            while (reader.GetNextAlignment(al)) {
+                if (CheckAlignment(al)) writer.SaveAlignment(al);
+            }
+        } else {
+
+            // no index data: scan everything, skipping alignments outside the region
+            while (reader.GetNextAlignment(al)) {
+                if (al.RefID < region.LeftRefID) continue;
+                if ((al.Position + al.Length) < region.LeftPosition) continue;
+                if (al.RefID > region.RightRefID) continue;
+                if (al.Position > region.RightPosition) continue;
+                if (CheckAlignment(al)) writer.SaveAlignment(al);
+            }
+        }
+    } else {
+
+        // no region specified, filter entire file
+        while (reader.GetNextAlignment(al)) {
+            if (CheckAlignment(al)) writer.SaveAlignment(al);
+        }
+    }
+
+    // clean up & exit
+    reader.Close();
+    writer.Close();
+    return true;
+}
+
+// Registers the supported properties, then builds the filters from the script if one
+// was given, or from the command line options otherwise.
+//
+// Returns the result of ParseScript() or ParseCommandLine().
+bool FilterTool::FilterToolPrivate::SetupFilters()
+{
+    // set up filter engine with supported properties
+    InitProperties();
+
+    // a script, if given, takes the place of the command line filters
+    return m_settings->HasScript ? ParseScript() : ParseCommandLine();
+}
+
+// ---------------------------------------------
+// FilterTool implementation
+
+// Sets the program info for "bamtools filter" and registers every command-line
+// option, handing each one the flag/value fields of m_settings it is associated with.
+// m_impl stays null here; it is only created by Run().
+FilterTool::FilterTool()
+ : AbstractTool()
+ , m_settings(new FilterSettings)
+ , m_impl(0)
+{
+ // ----------------------------------
+ // set program details
+
+ // (usage text: the "-script <filename>" placeholder is now properly closed)
+ const std::string usage =
+ "[-in <filename> -in <filename> ... | -list <filelist>] "
+ "[-out <filename> | [-forceCompression]] [-region <REGION>] "
+ "[ [-script <filename>] | [filterOptions] ]";
+
+ Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", usage);
+
+ // ----------------------------------
+ // I/O options
+
+ OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
+
+ const std::string inDesc = "the input BAM file(s)";
+ const std::string listDesc = "the input BAM file list, one line per file";
+ const std::string outDesc = "the output BAM file";
+ const std::string regionDesc =
+ "only read data from this genomic region (see documentation for more details)";
+ const std::string scriptDesc = "the filter script file (see documentation for more details)";
+ const std::string forceDesc =
+ "if results are sent to stdout (like when piping to another tool), "
+ "default behavior is to leave output uncompressed. Use this flag to "
+ "override and force compression";
+
+ Options::AddValueOption("-in", "BAM filename", inDesc, "", m_settings->HasInput,
+ m_settings->InputFiles, IO_Opts, Options::StandardIn());
+ Options::AddValueOption("-list", "filename", listDesc, "", m_settings->HasInputFilelist,
+ m_settings->InputFilelist, IO_Opts);
+ Options::AddValueOption("-out", "BAM filename", outDesc, "", m_settings->HasOutput,
+ m_settings->OutputFilename, IO_Opts, Options::StandardOut());
+ Options::AddValueOption("-region", "REGION", regionDesc, "", m_settings->HasRegion,
+ m_settings->Region, IO_Opts);
+ Options::AddValueOption("-script", "filename", scriptDesc, "", m_settings->HasScript,
+ m_settings->ScriptFilename, IO_Opts);
+ Options::AddOption("-forceCompression", forceDesc, m_settings->IsForceCompression, IO_Opts);
+
+ // ----------------------------------
+ // general filter options
+
+ OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters");
+
+ const std::string flagDesc =
+ "keep reads with this *exact* alignment flag (for more detailed queries, see below)";
+ const std::string insertDesc = "keep reads with insert size that matches pattern";
+ const std::string lengthDesc = "keep reads with length that matches pattern";
+ const std::string mapQualDesc = "keep reads with map quality that matches pattern";
+ const std::string nameDesc = "keep reads with name that matches pattern";
+ const std::string queryDesc = "keep reads with motif that matches pattern";
+ const std::string tagDesc = "keep reads with this key=>value pair";
+
+ Options::AddValueOption("-alignmentFlag", "int", flagDesc, "",
+ m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter,
+ FilterOpts);
+ Options::AddValueOption("-insertSize", "int", insertDesc, "", m_settings->HasInsertSizeFilter,
+ m_settings->InsertSizeFilter, FilterOpts);
+ Options::AddValueOption("-length", "int", lengthDesc, "", m_settings->HasLengthFilter,
+ m_settings->LengthFilter, FilterOpts);
+ Options::AddValueOption("-mapQuality", "[0-255]", mapQualDesc, "",
+ m_settings->HasMapQualityFilter, m_settings->MapQualityFilter,
+ FilterOpts);
+ Options::AddValueOption("-name", "string", nameDesc, "", m_settings->HasNameFilter,
+ m_settings->NameFilter, FilterOpts);
+ Options::AddValueOption("-queryBases", "string", queryDesc, "", m_settings->HasQueryBasesFilter,
+ m_settings->QueryBasesFilter, FilterOpts);
+ Options::AddValueOption("-tag", "TAG:VALUE", tagDesc, "", m_settings->HasTagFilter,
+ m_settings->TagFilter, FilterOpts);
+
+ // ----------------------------------
+ // alignment flag filter options
+ // (each takes "true/false" and is registered with TRUE_STR as its default value)
+
+ OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters");
+
+ const std::string boolArg = "true/false";
+ const std::string isDupDesc = "keep only alignments that are marked as duplicate?";
+ const std::string isFailQcDesc = "keep only alignments that failed QC?";
+ const std::string isFirstMateDesc = "keep only alignments marked as first mate?";
+ const std::string isMappedDesc = "keep only alignments that were mapped?";
+ const std::string isMateMappedDesc = "keep only alignments with mates that mapped";
+ const std::string isMateReverseDesc = "keep only alignments with mate on reverse strand?";
+ const std::string isPairedDesc = "keep only alignments that were sequenced as paired?";
+ const std::string isPrimaryDesc = "keep only alignments marked as primary?";
+ const std::string isProperPairDesc = "keep only alignments that passed PE resolution?";
+ const std::string isReverseDesc = "keep only alignments on reverse strand?";
+ const std::string isSecondMateDesc = "keep only alignments marked as second mate?";
+ const std::string isSingletonDesc = "keep only singletons";
+
+ Options::AddValueOption("-isDuplicate", boolArg, isDupDesc, "",
+ m_settings->HasIsDuplicateFilter, m_settings->IsDuplicateFilter,
+ AlignmentFlagOpts, TRUE_STR);
+ Options::AddValueOption("-isFailedQC", boolArg, isFailQcDesc, "",
+ m_settings->HasIsFailedQCFilter, m_settings->IsFailedQCFilter,
+ AlignmentFlagOpts, TRUE_STR);
+ Options::AddValueOption("-isFirstMate", boolArg, isFirstMateDesc, "",
+ m_settings->HasIsFirstMateFilter, m_settings->IsFirstMateFilter,
+ AlignmentFlagOpts, TRUE_STR);
+ Options::AddValueOption("-isMapped", boolArg, isMappedDesc, "", m_settings->HasIsMappedFilter,
+ m_settings->IsMappedFilter, AlignmentFlagOpts, TRUE_STR);
+ Options::AddValueOption("-isMateMapped", boolArg, isMateMappedDesc, "",
+ m_settings->HasIsMateMappedFilter, m_settings->IsMateMappedFilter,
+ AlignmentFlagOpts, TRUE_STR);
+ Options::AddValueOption("-isMateReverseStrand", boolArg, isMateReverseDesc, "",
+ m_settings->HasIsMateReverseStrandFilter,
+ m_settings->IsMateReverseStrandFilter, AlignmentFlagOpts, TRUE_STR);
+ Options::AddValueOption("-isPaired", boolArg, isPairedDesc, "", m_settings->HasIsPairedFilter,
+ m_settings->IsPairedFilter, AlignmentFlagOpts, TRUE_STR);
+ Options::AddValueOption("-isPrimaryAlignment", boolArg, isPrimaryDesc, "",
+ m_settings->HasIsPrimaryAlignmentFilter,
+ m_settings->IsPrimaryAlignmentFilter, AlignmentFlagOpts, TRUE_STR);
+ Options::AddValueOption("-isProperPair", boolArg, isProperPairDesc, "",
+ m_settings->HasIsProperPairFilter, m_settings->IsProperPairFilter,
+ AlignmentFlagOpts, TRUE_STR);
+ Options::AddValueOption("-isReverseStrand", boolArg, isReverseDesc, "",
+ m_settings->HasIsReverseStrandFilter, m_settings->IsReverseStrandFilter,
+ AlignmentFlagOpts, TRUE_STR);
+ Options::AddValueOption("-isSecondMate", boolArg, isSecondMateDesc, "",
+ m_settings->HasIsSecondMateFilter, m_settings->IsSecondMateFilter,
+ AlignmentFlagOpts, TRUE_STR);
+ Options::AddValueOption("-isSingleton", boolArg, isSingletonDesc, "",
+ m_settings->HasIsSingletonFilter, m_settings->IsSingletonFilter,
+ AlignmentFlagOpts, TRUE_STR);
+}
+
+FilterTool::~FilterTool()
+{
+ // release the settings object allocated in the constructor
+ delete m_settings;
+ m_settings = 0;
+ // release the implementation object; this is a null (no-op) delete if Run() was never called
+ delete m_impl;
+ m_impl = 0;
+}
+
+int FilterTool::Help()
+{
+ Options::DisplayHelp();  // delegate to the shared Options facility
+ return 0;                // unconditionally reports success
+}
+
+int FilterTool::Run(int argc, char* argv[])
+{
+ // parse command line arguments
+ Options::Parse(argc, argv, 1);
+
+ // hand the settings object to a freshly created implementation
+ m_impl = new FilterToolPrivate(m_settings);
+
+ // map the implementation's bool result onto an exit status:
+ // 0 when it succeeded, 1 when it failed
+ const bool succeeded = m_impl->Run();
+ return (succeeded ? 0 : 1);
+}
diff --git a/src/toolkit/bamtools_filter.h b/src/toolkit/bamtools_filter.h
new file mode 100644
index 0000000..8f4247e
--- /dev/null
+++ b/src/toolkit/bamtools_filter.h
@@ -0,0 +1,38 @@
+// ***************************************************************************
+// bamtools_filter.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 28 August 2010
+// ---------------------------------------------------------------------------
+// Filters BAM file(s) according to some user-specified criteria
+// ***************************************************************************
+
+#ifndef BAMTOOLS_FILTER_H
+#define BAMTOOLS_FILTER_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+class FilterTool : public AbstractTool
+{
+ // Command-line front end for "bamtools filter".
+public:
+ FilterTool();
+ ~FilterTool();
+ // NOTE(review): the class deletes both raw pointers below in its destructor but declares no
+ // copy constructor/assignment, so a copied instance would delete them twice - confirm tools are never copied
+public:
+ int Help();                       // calls Options::DisplayHelp(); returns 0
+ int Run(int argc, char* argv[]);  // returns 0 on success, 1 on failure
+
+private:
+ struct FilterSettings;
+ FilterSettings* m_settings;  // allocated by the constructor, deleted by the destructor
+
+ class FilterToolPrivate;
+ FilterToolPrivate* m_impl;  // null until Run() creates it; deleted by the destructor
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_FILTER_H
diff --git a/src/toolkit/bamtools_header.cpp b/src/toolkit/bamtools_header.cpp
new file mode 100644
index 0000000..db4cbeb
--- /dev/null
+++ b/src/toolkit/bamtools_header.cpp
@@ -0,0 +1,152 @@
+// ***************************************************************************
+// bamtools_header.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 December 2012
+// ---------------------------------------------------------------------------
+// Prints the SAM-style header from a single BAM file ( or merged header from
+// multiple BAM files) to stdout
+// ***************************************************************************
+
+#include "bamtools_header.h"
+
+#include <api/BamMultiReader.h>
+#include <utils/bamtools_options.h>
+using namespace BamTools;
+
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// ---------------------------------------------
+// HeaderSettings implementation
+
+struct HeaderTool::HeaderSettings
+{
+ // Command-line state for "bamtools header"; the HeaderTool constructor passes these fields to Options.
+ // flags (passed to Options alongside -in and -list respectively)
+ bool HasInput;
+ bool HasInputFilelist;
+
+ // filenames
+ std::vector<std::string> InputFiles;  // value target of -in; Run() also appends the lines of the -list file
+ std::string InputFilelist;            // value target of -list
+
+ // constructor: both flags start out false, the filename members start out empty
+ HeaderSettings()
+ : HasInput(false)
+ , HasInputFilelist(false)
+ {}
+};
+
+struct HeaderTool::HeaderToolPrivate
+{
+ // Does the actual work of "bamtools header" using the settings object it is handed.
+ // ctor & dtor
+public:
+ HeaderToolPrivate(HeaderTool::HeaderSettings* settings)
+ : m_settings(settings)
+ {}
+
+ ~HeaderToolPrivate() {}  // nothing to release: m_settings is not deleted here
+
+ // interface
+public:
+ bool Run();  // true on success; false after printing an error to std::cerr
+
+ // data members
+private:
+ HeaderTool::HeaderSettings* m_settings;  // stored as passed in; deleted by ~HeaderTool()
+};
+
+bool HeaderTool::HeaderToolPrivate::Run()
+{
+ // Writes the header text reported by a BamMultiReader over all inputs to stdout.
+ // Returns false (after an error message on stderr) if the file list or the
+ // BAM input cannot be opened.
+
+ // neither -in nor -list given: fall back to the default input
+ const bool hasExplicitInput = (m_settings->HasInput || m_settings->HasInputFilelist);
+ if (!hasExplicitInput) m_settings->InputFiles.push_back(Options::StandardIn());
+
+ // every line of the -list file becomes one more input filename
+ if (m_settings->HasInputFilelist) {
+
+ std::ifstream listStream(m_settings->InputFilelist.c_str(), std::ios::in);
+ if (!listStream.is_open()) {
+ std::cerr << "bamtools header ERROR: could not open input BAM file list... Aborting."
+ << std::endl;
+ return false;
+ }
+
+ for (std::string entry; std::getline(listStream, entry);)
+ m_settings->InputFiles.push_back(entry);
+ }
+
+ // attempt to open the BAM files
+ BamMultiReader bamReader;
+ const bool opened = bamReader.Open(m_settings->InputFiles);
+ if (!opened) {
+ std::cerr << "bamtools header ERROR: could not open BAM file(s) for reading... Aborting."
+ << std::endl;
+ return false;
+ }
+
+ // dump (merged) header contents to stdout
+ const std::string headerText = bamReader.GetHeaderText();
+ std::cout << headerText << std::endl;
+
+ // clean up & exit
+ bamReader.Close();
+ return true;
+}
+
+// ---------------------------------------------
+// HeaderTool implementation
+
+HeaderTool::HeaderTool()
+ : AbstractTool()
+ , m_settings(new HeaderSettings)
+ , m_impl(0)
+{
+ // program details
+ const std::string usage = "[-in <filename> -in <filename> ... | -list <filelist>]";
+ Options::SetProgramInfo("bamtools header", "prints header from BAM file(s)", usage);
+
+ // option descriptions
+ const std::string inDesc = "the input BAM file(s)";
+ const std::string listDesc = "the input BAM file list, one line per file";
+
+ // register the I/O options together with the m_settings fields they are given
+ OptionGroup* ioGroup = Options::CreateOptionGroup("Input & Output");
+ Options::AddValueOption("-in", "BAM filename", inDesc, "", m_settings->HasInput,
+ m_settings->InputFiles, ioGroup, Options::StandardIn());
+ Options::AddValueOption("-list", "filename", listDesc, "", m_settings->HasInputFilelist,
+ m_settings->InputFilelist, ioGroup);
+}
+
+HeaderTool::~HeaderTool()
+{
+ // release the settings object allocated in the constructor
+ delete m_settings;
+ m_settings = 0;
+ // release the implementation object; this is a null (no-op) delete if Run() was never called
+ delete m_impl;
+ m_impl = 0;
+}
+
+int HeaderTool::Help()
+{
+ Options::DisplayHelp();  // delegate to the shared Options facility
+ return 0;                // unconditionally reports success
+}
+
+int HeaderTool::Run(int argc, char* argv[])
+{
+ // parse command line arguments
+ Options::Parse(argc, argv, 1);
+
+ // hand the settings object to a freshly created implementation
+ m_impl = new HeaderToolPrivate(m_settings);
+
+ // map the implementation's bool result onto an exit status:
+ // 0 when it succeeded, 1 when it failed
+ const bool succeeded = m_impl->Run();
+ return (succeeded ? 0 : 1);
+}
diff --git a/src/toolkit/bamtools_header.h b/src/toolkit/bamtools_header.h
new file mode 100644
index 0000000..fe59238
--- /dev/null
+++ b/src/toolkit/bamtools_header.h
@@ -0,0 +1,39 @@
+// ***************************************************************************
+// bamtools_header.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 7 April 2011
+// ---------------------------------------------------------------------------
+// Prints the SAM-style header from a single BAM file ( or merged header from
+// multiple BAM files) to stdout
+// ***************************************************************************
+
+#ifndef BAMTOOLS_HEADER_H
+#define BAMTOOLS_HEADER_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+class HeaderTool : public AbstractTool
+{
+ // Command-line front end for "bamtools header".
+public:
+ HeaderTool();
+ ~HeaderTool();
+ // NOTE(review): the class deletes both raw pointers below in its destructor but declares no
+ // copy constructor/assignment, so a copied instance would delete them twice - confirm tools are never copied
+public:
+ int Help();                       // calls Options::DisplayHelp(); returns 0
+ int Run(int argc, char* argv[]);  // returns 0 on success, 1 on failure
+
+private:
+ struct HeaderSettings;
+ HeaderSettings* m_settings;  // allocated by the constructor, deleted by the destructor
+
+ struct HeaderToolPrivate;
+ HeaderToolPrivate* m_impl;  // null until Run() creates it; deleted by the destructor
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_HEADER_H
diff --git a/src/toolkit/bamtools_index.cpp b/src/toolkit/bamtools_index.cpp
new file mode 100644
index 0000000..57ca5b2
--- /dev/null
+++ b/src/toolkit/bamtools_index.cpp
@@ -0,0 +1,137 @@
+// ***************************************************************************
+// bamtools_index.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 7 April 2011
+// ---------------------------------------------------------------------------
+// Creates a BAM index file
+// ***************************************************************************
+
+#include "bamtools_index.h"
+
+#include <api/BamReader.h>
+#include <utils/bamtools_options.h>
+using namespace BamTools;
+
+#include <iostream>
+#include <string>
+
+// ---------------------------------------------
+// IndexSettings implementation
+
+struct IndexTool::IndexSettings
+{
+ // Command-line state for "bamtools index"; the IndexTool constructor passes these fields to Options.
+ // flags
+ bool HasInputBamFilename;   // passed to Options alongside -in
+ bool IsUsingBamtoolsIndex;  // passed to Options for -bti; selects BamIndex::BAMTOOLS over STANDARD in Run()
+
+ // filenames
+ std::string InputBamFilename;  // value target of -in
+
+ // constructor: flags start out false, the input filename starts as Options::StandardIn()
+ IndexSettings()
+ : HasInputBamFilename(false)
+ , IsUsingBamtoolsIndex(false)
+ , InputBamFilename(Options::StandardIn())
+ {}
+};
+
+// ---------------------------------------------
+// IndexToolPrivate implementation
+
+struct IndexTool::IndexToolPrivate
+{
+ // Does the actual work of "bamtools index" using the settings object it is handed.
+ // ctor & dtor
+public:
+ IndexToolPrivate(IndexTool::IndexSettings* settings)
+ : m_settings(settings)
+ {}
+
+ ~IndexToolPrivate() {}  // nothing to release: m_settings is not deleted here
+
+ // interface
+public:
+ bool Run();  // opens the input BAM file and requests an index for it
+
+ // data members
+private:
+ IndexTool::IndexSettings* m_settings;  // stored as passed in; deleted by ~IndexTool()
+};
+
+bool IndexTool::IndexToolPrivate::Run()
+{
+ // Opens the input BAM file and builds an index of the requested type for it.
+ // Returns true only if both the open and the index creation succeed; on failure
+ // an error message is written to stderr and false is returned.
+
+ // open our BAM reader
+ BamReader reader;
+ if (!reader.Open(m_settings->InputBamFilename)) {
+ std::cerr << "bamtools index ERROR: could not open BAM file: "
+ << m_settings->InputBamFilename << std::endl;
+ return false;
+ }
+
+ // pick the index format: BamTools-specific when -bti was given, standard otherwise
+ const BamIndex::IndexType type =
+ (m_settings->IsUsingBamtoolsIndex ? BamIndex::BAMTOOLS : BamIndex::STANDARD);
+
+ // create index for BAM file; CreateIndex() signals failure through its return value,
+ // so propagate it instead of reporting success when no index was produced
+ if (!reader.CreateIndex(type)) {
+ std::cerr << "bamtools index ERROR: could not create index for BAM file: "
+ << m_settings->InputBamFilename << std::endl;
+ reader.Close();
+ return false;
+ }
+
+ // clean & exit
+ reader.Close();
+ return true;
+}
+
+// ---------------------------------------------
+// IndexTool implementation
+
+IndexTool::IndexTool()
+ : AbstractTool()
+ , m_settings(new IndexSettings)
+ , m_impl(0)
+{
+ // program details
+ const std::string usage = "[-in <filename>] [-bti]";
+ Options::SetProgramInfo("bamtools index", "creates index for BAM file", usage);
+
+ // option descriptions
+ const std::string inDesc = "the input BAM file";
+ const std::string btiDesc =
+ "create (non-standard) BamTools index file (*.bti). Default behavior is to "
+ "create standard BAM index (*.bai)";
+
+ // register the options together with the m_settings fields they are given
+ OptionGroup* ioGroup = Options::CreateOptionGroup("Input & Output");
+ Options::AddValueOption("-in", "BAM filename", inDesc, "", m_settings->HasInputBamFilename,
+ m_settings->InputBamFilename, ioGroup, Options::StandardIn());
+ Options::AddOption("-bti", btiDesc, m_settings->IsUsingBamtoolsIndex, ioGroup);
+}
+
+IndexTool::~IndexTool()
+{
+ // release the settings object allocated in the constructor
+ delete m_settings;
+ m_settings = 0;
+ // release the implementation object; this is a null (no-op) delete if Run() was never called
+ delete m_impl;
+ m_impl = 0;
+}
+
+int IndexTool::Help()
+{
+ Options::DisplayHelp();  // delegate to the shared Options facility
+ return 0;                // unconditionally reports success
+}
+
+int IndexTool::Run(int argc, char* argv[])
+{
+ // parse command line arguments
+ Options::Parse(argc, argv, 1);
+
+ // hand the settings object to a freshly created implementation
+ m_impl = new IndexToolPrivate(m_settings);
+
+ // map the implementation's bool result onto an exit status:
+ // 0 when it succeeded, 1 when it failed
+ const bool succeeded = m_impl->Run();
+ return (succeeded ? 0 : 1);
+}
diff --git a/src/toolkit/bamtools_index.h b/src/toolkit/bamtools_index.h
new file mode 100644
index 0000000..c378832
--- /dev/null
+++ b/src/toolkit/bamtools_index.h
@@ -0,0 +1,38 @@
+// ***************************************************************************
+// bamtools_index.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 7 April 2011
+// ---------------------------------------------------------------------------
+// Creates a BAM index file
+// ***************************************************************************
+
+#ifndef BAMTOOLS_INDEX_H
+#define BAMTOOLS_INDEX_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+class IndexTool : public AbstractTool
+{
+ // Command-line front end for "bamtools index".
+public:
+ IndexTool();
+ ~IndexTool();
+ // NOTE(review): the class deletes both raw pointers below in its destructor but declares no
+ // copy constructor/assignment, so a copied instance would delete them twice - confirm tools are never copied
+public:
+ int Help();                       // calls Options::DisplayHelp(); returns 0
+ int Run(int argc, char* argv[]);  // returns 0 on success, 1 on failure
+
+private:
+ struct IndexSettings;
+ IndexSettings* m_settings;  // allocated by the constructor, deleted by the destructor
+
+ struct IndexToolPrivate;
+ IndexToolPrivate* m_impl;  // null until Run() creates it; deleted by the destructor
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_INDEX_H
diff --git a/src/toolkit/bamtools_merge.cpp b/src/toolkit/bamtools_merge.cpp
new file mode 100644
index 0000000..2bac936
--- /dev/null
+++ b/src/toolkit/bamtools_merge.cpp
@@ -0,0 +1,257 @@
+// ***************************************************************************
+// bamtools_merge.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 December 2012
+// ---------------------------------------------------------------------------
+// Merges multiple BAM files into one
+// ***************************************************************************
+
+#include "bamtools_merge.h"
+
+#include <api/BamMultiReader.h>
+#include <api/BamWriter.h>
+#include <utils/bamtools_options.h>
+#include <utils/bamtools_utilities.h>
+using namespace BamTools;
+
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// ---------------------------------------------
+// MergeSettings implementation
+
+struct MergeTool::MergeSettings
+{
+ // Command-line state for "bamtools merge"; the MergeTool constructor passes these fields to Options.
+ // flags
+ bool HasInput;            // passed alongside -in
+ bool HasInputFilelist;    // passed alongside -list
+ bool HasOutput;           // passed alongside -out
+ bool IsForceCompression;  // passed for -forceCompression; Run() then writes compressed output even to stdout
+ bool HasRegion;           // passed alongside -region
+
+ // filenames
+ std::vector<std::string> InputFiles;  // value target of -in; Run() also appends the lines of the -list file
+ std::string InputFilelist;            // value target of -list
+
+ // other parameters
+ std::string OutputFilename;  // value target of -out
+ std::string Region;          // value target of -region (REGION string handed to Utilities::ParseRegionString)
+
+ // constructor: all flags start out false, output starts as Options::StandardOut()
+ MergeSettings()
+ : HasInput(false)
+ , HasInputFilelist(false)
+ , HasOutput(false)
+ , IsForceCompression(false)
+ , HasRegion(false)
+ , OutputFilename(Options::StandardOut())
+ {}
+};
+
+// ---------------------------------------------
+// MergeToolPrivate implementation
+
+struct MergeTool::MergeToolPrivate
+{
+ // Does the actual work of "bamtools merge" using the settings object it is handed.
+ // ctor & dtor
+public:
+ MergeToolPrivate(MergeTool::MergeSettings* settings)
+ : m_settings(settings)
+ {}
+
+ ~MergeToolPrivate() {}  // nothing to release: m_settings is not deleted here
+
+ // interface
+public:
+ bool Run();  // true on success; false after printing an error to std::cerr
+
+ // data members
+private:
+ MergeTool::MergeSettings* m_settings;  // stored as passed in; deleted by ~MergeTool()
+};
+
+bool MergeTool::MergeToolPrivate::Run()
+{
+ // Reads alignments from all input BAM files through one BamMultiReader and writes
+ // them to a single output, optionally restricted to the REGION given with -region.
+ // Returns false (after an error message on stderr) if any input/output cannot be
+ // opened, the REGION string cannot be parsed, or SetRegion() fails.
+
+ // set to default input if none provided
+ if (!m_settings->HasInput && !m_settings->HasInputFilelist)
+ m_settings->InputFiles.push_back(Options::StandardIn());
+
+ // add files in the filelist to the input file list
+ if (m_settings->HasInputFilelist) {
+
+ std::ifstream filelist(m_settings->InputFilelist.c_str(), std::ios::in);
+ if (!filelist.is_open()) {
+ std::cerr << "bamtools merge ERROR: could not open input BAM file list... Aborting."
+ << std::endl;
+ return false;
+ }
+
+ std::string line;
+ while (std::getline(filelist, line))
+ m_settings->InputFiles.push_back(line);
+ }
+
+ // opens the BAM files (by default without checking for indexes)
+ BamMultiReader reader;
+ if (!reader.Open(m_settings->InputFiles)) {
+ std::cerr << "bamtools merge ERROR: could not open input BAM file(s)... Aborting."
+ << std::endl;
+ return false;
+ }
+
+ // retrieve header & reference dictionary info
+ std::string mergedHeader = reader.GetHeaderText();
+ RefVector references = reader.GetReferenceData();
+
+ // determine compression mode for BamWriter:
+ // uncompressed only when writing to stdout without -forceCompression
+ bool writeUncompressed =
+ (m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression);
+ BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
+ if (writeUncompressed) compressionMode = BamWriter::Uncompressed;
+
+ // open BamWriter
+ BamWriter writer;
+ writer.SetCompressionMode(compressionMode);
+ if (!writer.Open(m_settings->OutputFilename, mergedHeader, references)) {
+ std::cerr << "bamtools merge ERROR: could not open " << m_settings->OutputFilename
+ << " for writing." << std::endl;
+ reader.Close();
+ return false;
+ }
+
+ // if no region specified, store entire contents of file(s)
+ if (!m_settings->HasRegion) {
+ BamAlignment al;
+ while (reader.GetNextAlignmentCore(al))
+ writer.SaveAlignment(al);
+ }
+
+ // otherwise attempt to use region as constraint
+ else {
+
+ // if region string parses OK
+ BamRegion region;
+ if (Utilities::ParseRegionString(m_settings->Region, reader, region)) {
+
+ // attempt to find index files
+ reader.LocateIndexes();
+
+ // if index data available for all BAM files, we can use SetRegion
+ if (reader.HasIndexes()) {
+
+ // attempt to use SetRegion(), if failed report error
+ if (!reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID,
+ region.RightPosition)) {
+ std::cerr << "bamtools merge ERROR: set region failed. Check that REGION "
+ "describes a valid range"
+ << std::endl;
+ // close both ends, matching the REGION-parse failure path below
+ reader.Close();
+ writer.Close();
+ return false;
+ }
+
+ // everything checks out, just iterate through specified region, storing alignments
+ BamAlignment al;
+ while (reader.GetNextAlignmentCore(al))
+ writer.SaveAlignment(al);
+ }
+
+ // no index data available, we have to iterate through until we
+ // find overlapping alignments
+ else {
+ BamAlignment al;
+ while (reader.GetNextAlignmentCore(al)) {
+
+ // The position bounds only constrain alignments that sit on the boundary
+ // references; a reference strictly between LeftRefID and RightRefID lies
+ // entirely inside the region, whatever the alignment's position is.
+ const bool atOrAfterLeft =
+ (al.RefID > region.LeftRefID) ||
+ (al.RefID == region.LeftRefID &&
+ (al.Position + al.Length) >= region.LeftPosition);
+ const bool atOrBeforeRight =
+ (al.RefID < region.RightRefID) ||
+ (al.RefID == region.RightRefID && al.Position <= region.RightPosition);
+
+ if (atOrAfterLeft && atOrBeforeRight) writer.SaveAlignment(al);
+ }
+ }
+ }
+
+ // error parsing REGION string
+ else {
+ std::cerr << "bamtools merge ERROR: could not parse REGION - " << m_settings->Region
+ << std::endl;
+ std::cerr << "Check that REGION is in valid format (see documentation) and that the "
+ "coordinates are valid"
+ << std::endl;
+ reader.Close();
+ writer.Close();
+ return false;
+ }
+ }
+
+ // clean & exit
+ reader.Close();
+ writer.Close();
+ return true;
+}
+
+// ---------------------------------------------
+// MergeTool implementation
+
+MergeTool::MergeTool()
+ : AbstractTool()
+ , m_settings(new MergeSettings)
+ , m_impl(0)
+{
+ // program details
+ const std::string usage =
+ "[-in <filename> -in <filename> ... | -list <filelist>] [-out "
+ "<filename> | [-forceCompression]] [-region <REGION>]";
+ Options::SetProgramInfo("bamtools merge", "merges multiple BAM files into one", usage);
+
+ // option descriptions
+ const std::string inDesc = "the input BAM file(s)";
+ const std::string listDesc = "the input BAM file list, one line per file";
+ const std::string outDesc = "the output BAM file";
+ const std::string forceDesc =
+ "if results are sent to stdout (like when piping to another tool), default "
+ "behavior is to leave output uncompressed. Use this flag to override and "
+ "force compression";
+ const std::string regionDesc = "genomic region. See README for more details";
+
+ // register the options (registration order kept: in, list, out, forceCompression, region)
+ OptionGroup* ioGroup = Options::CreateOptionGroup("Input & Output");
+ Options::AddValueOption("-in", "BAM filename", inDesc, "", m_settings->HasInput,
+ m_settings->InputFiles, ioGroup);
+ Options::AddValueOption("-list", "filename", listDesc, "", m_settings->HasInputFilelist,
+ m_settings->InputFilelist, ioGroup);
+ Options::AddValueOption("-out", "BAM filename", outDesc, "", m_settings->HasOutput,
+ m_settings->OutputFilename, ioGroup);
+ Options::AddOption("-forceCompression", forceDesc, m_settings->IsForceCompression, ioGroup);
+ Options::AddValueOption("-region", "REGION", regionDesc, "", m_settings->HasRegion,
+ m_settings->Region, ioGroup);
+}
+
+MergeTool::~MergeTool()
+{
+ // release the settings object allocated in the constructor
+ delete m_settings;
+ m_settings = 0;
+ // release the implementation object; this is a null (no-op) delete if Run() was never called
+ delete m_impl;
+ m_impl = 0;
+}
+
+int MergeTool::Help()
+{
+ Options::DisplayHelp();  // delegate to the shared Options facility
+ return 0;                // unconditionally reports success
+}
+
+int MergeTool::Run(int argc, char* argv[])
+{
+ // parse command line arguments
+ Options::Parse(argc, argv, 1);
+
+ // hand the settings object to a freshly created implementation
+ m_impl = new MergeToolPrivate(m_settings);
+
+ // map the implementation's bool result onto an exit status:
+ // 0 when it succeeded, 1 when it failed
+ const bool succeeded = m_impl->Run();
+ return (succeeded ? 0 : 1);
+}
diff --git a/src/toolkit/bamtools_merge.h b/src/toolkit/bamtools_merge.h
new file mode 100644
index 0000000..0db4bc7
--- /dev/null
+++ b/src/toolkit/bamtools_merge.h
@@ -0,0 +1,38 @@
+// ***************************************************************************
+// bamtools_merge.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 7 April 2011
+// ---------------------------------------------------------------------------
+// Merges multiple BAM files into one
+// ***************************************************************************
+
+#ifndef BAMTOOLS_MERGE_H
+#define BAMTOOLS_MERGE_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+class MergeTool : public AbstractTool
+{
+ // Command-line front end for "bamtools merge".
+public:
+ MergeTool();
+ ~MergeTool();
+ // NOTE(review): the class deletes both raw pointers below in its destructor but declares no
+ // copy constructor/assignment, so a copied instance would delete them twice - confirm tools are never copied
+public:
+ int Help();                       // calls Options::DisplayHelp(); returns 0
+ int Run(int argc, char* argv[]);  // returns 0 on success, 1 on failure
+
+private:
+ struct MergeSettings;
+ MergeSettings* m_settings;  // allocated by the constructor, deleted by the destructor
+
+ struct MergeToolPrivate;
+ MergeToolPrivate* m_impl;  // null until Run() creates it; deleted by the destructor
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_MERGE_H
diff --git a/src/toolkit/bamtools_random.cpp b/src/toolkit/bamtools_random.cpp
new file mode 100644
index 0000000..fceebda
--- /dev/null
+++ b/src/toolkit/bamtools_random.cpp
@@ -0,0 +1,316 @@
+// ***************************************************************************
+// bamtools_random.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 24 July 2013 (DB)
+// ---------------------------------------------------------------------------
+// Grab a random subset of alignments (testing tool)
+// ***************************************************************************
+
+#include "bamtools_random.h"
+
+#include <api/BamMultiReader.h>
+#include <api/BamWriter.h>
+#include <utils/bamtools_options.h>
+#include <utils/bamtools_utilities.h>
+using namespace BamTools;
+
+#include <cstdlib>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+// alignment count used when "-n" is not supplied (initial value of
+// RandomSettings::AlignmentCount and the default registered for the option)
+const unsigned int RANDOM_MAX_ALIGNMENT_COUNT = 10000;
+
+// Returns a pseudo-random integer in the closed interval [lowerBound, upperBound],
+// taken from the C library rand() stream.
+int getRandomInt(const int& lowerBound, const int& upperBound)
+{
+ // number of distinct values in the closed interval
+ const int span = (upperBound - lowerBound) + 1;
+
+ // rand() / (RAND_MAX + 1) lies in [0, 1), so the scaled offset lies in [0, span)
+ const double unitDivisor = static_cast<double>(RAND_MAX) + 1;
+ const double scaledOffset = span * static_cast<double>(rand()) / unitDivisor;
+ return (lowerBound + static_cast<int>(scaledOffset));
+}
+
+} // namespace BamTools
+
+// ---------------------------------------------
+// RandomSettings implementation
+
+struct RandomTool::RandomSettings
+{
+ // Command-line state for "bamtools random"; the RandomTool constructor passes these fields to Options.
+ // flags
+ bool HasAlignmentCount;    // passed alongside -n
+ bool HasInput;             // passed alongside -in
+ bool HasInputFilelist;     // passed alongside -list
+ bool HasOutput;            // passed alongside -out
+ bool HasRandomNumberSeed;  // passed alongside -seed; when false Run() seeds from the current time
+ bool HasRegion;            // passed alongside -region
+ bool IsForceCompression;   // passed for -forceCompression
+
+ // parameters
+ unsigned int AlignmentCount;          // value target of -n: number of alignments Run() saves
+ std::vector<std::string> InputFiles;  // value target of -in; Run() also appends the lines of the -list file
+ std::string InputFilelist;            // value target of -list
+ std::string OutputFilename;           // value target of -out
+ unsigned int RandomNumberSeed;        // value target of -seed; passed to srand() when HasRandomNumberSeed
+ std::string Region;                   // value target of -region
+
+ // constructor: flags false, count RANDOM_MAX_ALIGNMENT_COUNT, output Options::StandardOut(), seed 0
+ RandomSettings()
+ : HasAlignmentCount(false)
+ , HasInput(false)
+ , HasInputFilelist(false)
+ , HasOutput(false)
+ , HasRandomNumberSeed(false)
+ , HasRegion(false)
+ , IsForceCompression(false)
+ , AlignmentCount(RANDOM_MAX_ALIGNMENT_COUNT)
+ , OutputFilename(Options::StandardOut())
+ , RandomNumberSeed(0)
+ {}
+};
+
+// ---------------------------------------------
+// RandomToolPrivate implementation
+
+struct RandomTool::RandomToolPrivate
+{
+ // Does the actual work of "bamtools random" using the settings object it is handed.
+ // ctor & dtor
+public:
+ RandomToolPrivate(RandomTool::RandomSettings* settings)
+ : m_settings(settings)
+ {}
+
+ ~RandomToolPrivate() {}  // nothing to release: m_settings is not deleted here
+
+ // interface
+public:
+ bool Run();  // true on success; false after printing an error to std::cerr
+
+ // data members
+private:
+ RandomTool::RandomSettings* m_settings;  // stored as passed in; deleted by ~RandomTool()
+};
+
+bool RandomTool::RandomToolPrivate::Run()
+{
+ // Saves AlignmentCount alignments picked at random reference/position pairs to the output.
+ // Returns false (after an error on stderr) on any open, index, reference-data or REGION failure.
+ // set to default stdin if no input files provided
+ if (!m_settings->HasInput && !m_settings->HasInputFilelist)
+ m_settings->InputFiles.push_back(Options::StandardIn());
+
+ // add files in the filelist to the input file list
+ if (m_settings->HasInputFilelist) {
+
+ std::ifstream filelist(m_settings->InputFilelist.c_str(), std::ios::in);
+ if (!filelist.is_open()) {
+ std::cerr << "bamtools random ERROR: could not open input BAM file list... Aborting."
+ << std::endl;
+ return false;
+ }
+
+ std::string line;
+ while (std::getline(filelist, line))
+ m_settings->InputFiles.push_back(line);
+ }
+
+ // open our reader
+ BamMultiReader reader;
+ if (!reader.Open(m_settings->InputFiles)) {
+ std::cerr << "bamtools random ERROR: could not open input BAM file(s)... Aborting."
+ << std::endl;
+ return false;
+ }
+
+ // look up index files for all BAM files
+ reader.LocateIndexes();
+
+ // make sure index data is available (this tool aborts without it, with or without -region)
+ if (!reader.HasIndexes()) {
+ std::cerr << "bamtools random ERROR: could not load index data for all input BAM "
+ "file(s)... Aborting."
+ << std::endl;
+ reader.Close();
+ return false;
+ }
+
+ // get BamReader metadata
+ const std::string headerText = reader.GetHeaderText();
+ const RefVector references = reader.GetReferenceData();
+ if (references.empty()) {
+ std::cerr << "bamtools random ERROR: no reference data available... Aborting." << std::endl;
+ reader.Close();
+ return false;
+ }
+
+ // determine compression mode for BamWriter:
+ // uncompressed only when writing to stdout without -forceCompression
+ bool writeUncompressed =
+ (m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression);
+ BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
+ if (writeUncompressed) compressionMode = BamWriter::Uncompressed;
+
+ // open BamWriter
+ BamWriter writer;
+ writer.SetCompressionMode(compressionMode);
+ if (!writer.Open(m_settings->OutputFilename, headerText, references)) {
+ std::cerr << "bamtools random ERROR: could not open " << m_settings->OutputFilename
+ << " for writing... Aborting." << std::endl;
+ reader.Close();
+ return false;
+ }
+
+ // if user specified a REGION constraint, attempt to parse REGION string
+ BamRegion region;
+ if (m_settings->HasRegion &&
+ !Utilities::ParseRegionString(m_settings->Region, reader, region)) {
+ std::cerr << "bamtools random ERROR: could not parse REGION: " << m_settings->Region
+ << std::endl;
+ std::cerr << "Check that REGION is in valid format (see documentation) and that the "
+ "coordinates are valid"
+ << std::endl;
+ reader.Close();
+ writer.Close();
+ return false;
+ }
+
+ // seed our random number generator: user-supplied seed if given, current time otherwise
+ if (m_settings->HasRandomNumberSeed)
+ srand(m_settings->RandomNumberSeed);
+ else
+ srand(time(NULL));
+
+ // grab random alignments
+ // NOTE(review): i only advances when an alignment is saved, so this loop does not
+ // terminate if no random pick ever yields a matching alignment (e.g. input without
+ // alignments in the sampled range) - confirm whether a retry limit is wanted
+ BamAlignment al;
+ unsigned int i = 0;
+ while (i < m_settings->AlignmentCount) {
+
+ int randomRefId = 0;
+ int randomPosition = 0;
+
+ // use REGION constraints to select random refId & position
+ if (m_settings->HasRegion) {
+
+ // select a random refId
+ randomRefId = getRandomInt(region.LeftRefID, region.RightRefID);
+
+ // select a random position based on randomRefId:
+ // REGION positions bound the pick only on the region's first/last reference
+ const int lowerBoundPosition =
+ ((randomRefId == region.LeftRefID) ? region.LeftPosition : 0);
+ const int upperBoundPosition =
+ ((randomRefId == region.RightRefID) ? region.RightPosition
+ : (references.at(randomRefId).RefLength - 1));
+ randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
+ }
+
+ // otherwise select from all possible random refId & position
+ else {
+
+ // select random refId
+ randomRefId = getRandomInt(0, (int)references.size() - 1);
+
+ // select random position based on randomRefId
+ const int lowerBoundPosition = 0;
+ const int upperBoundPosition = references.at(randomRefId).RefLength - 1;
+ randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
+ }
+
+ // if jump successful, save the first alignment read afterwards that lies on
+ // randomRefId and starts at or beyond randomPosition
+ if (reader.Jump(randomRefId, randomPosition)) {
+ while (reader.GetNextAlignmentCore(al)) {
+ if (al.RefID == randomRefId && al.Position >= randomPosition) {
+ writer.SaveAlignment(al);
+ ++i;
+ break;
+ }
+ }
+ }
+ }
+
+ // cleanup & exit
+ reader.Close();
+ writer.Close();
+ return true;
+}
+
+// ---------------------------------------------
+// RandomTool implementation
+
+// Sets the program info for "bamtools random" and registers its command-line
+// options, handing each one the flag/value fields of m_settings it is associated with.
+// m_impl stays null here; it is only created by Run().
+RandomTool::RandomTool()
+ : AbstractTool()
+ , m_settings(new RandomSettings)
+ , m_impl(0)
+{
+ // set program details
+ // (usage text lists -n with its <count> value and the -seed option, matching
+ // the options registered below)
+ Options::SetProgramInfo("bamtools random", "grab a random subset of alignments",
+ "[-in <filename> -in <filename> ... | -list <filelist>] [-out "
+ "<filename>] [-forceCompression] [-n <count>] "
+ "[-seed <unsigned integer>] [-region <REGION>]");
+
+ // set up options
+ OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
+ Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput,
+ m_settings->InputFiles, IO_Opts, Options::StandardIn());
+ Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "",
+ m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts);
+ Options::AddValueOption("-out", "BAM filename", "the output BAM file", "",
+ m_settings->HasOutput, m_settings->OutputFilename, IO_Opts,
+ Options::StandardOut());
+ // NOTE(review): this description calls the index "recommended", but
+ // RandomToolPrivate::Run() aborts when index data is missing - confirm the wording
+ Options::AddValueOption("-region", "REGION",
+ "only pull random alignments from within this genomic region. Index "
+ "file is recommended for better performance, and is used automatically "
+ "if it exists. See \'bamtools help index\' for more details on "
+ "creating one",
+ "", m_settings->HasRegion, m_settings->Region, IO_Opts);
+ Options::AddOption("-forceCompression",
+ "if results are sent to stdout (like when piping to another tool), default "
+ "behavior is to leave output uncompressed. Use this flag to override and "
+ "force compression",
+ m_settings->IsForceCompression, IO_Opts);
+
+ OptionGroup* SettingsOpts = Options::CreateOptionGroup("Settings");
+ Options::AddValueOption(
+ "-n", "count", "number of alignments to grab. Note - no duplicate checking is performed",
+ "", m_settings->HasAlignmentCount, m_settings->AlignmentCount, SettingsOpts,
+ RANDOM_MAX_ALIGNMENT_COUNT);
+ Options::AddValueOption("-seed", "unsigned integer",
+ "random number generator seed (for repeatable results). Current time "
+ "is used if no seed value is provided.",
+ "", m_settings->HasRandomNumberSeed, m_settings->RandomNumberSeed,
+ SettingsOpts);
+}
+
+RandomTool::~RandomTool()
+{
+ // release the settings object allocated in the constructor
+ delete m_settings;
+ m_settings = 0;
+ // release the implementation object; this is a null (no-op) delete if Run() was never called
+ delete m_impl;
+ m_impl = 0;
+}
+
+int RandomTool::Help()
+{
+ Options::DisplayHelp();  // delegate to the shared Options facility
+ return 0;                // unconditionally reports success
+}
+
+int RandomTool::Run(int argc, char* argv[])
+{
+ // parse command line arguments
+ Options::Parse(argc, argv, 1);
+
+ // hand the settings object to a freshly created implementation
+ m_impl = new RandomToolPrivate(m_settings);
+
+ // map the implementation's bool result onto an exit status:
+ // 0 when it succeeded, 1 when it failed
+ const bool succeeded = m_impl->Run();
+ return (succeeded ? 0 : 1);
+}
diff --git a/src/toolkit/bamtools_random.h b/src/toolkit/bamtools_random.h
new file mode 100644
index 0000000..664a919
--- /dev/null
+++ b/src/toolkit/bamtools_random.h
@@ -0,0 +1,38 @@
+// ***************************************************************************
+// bamtools_random.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 7 April 2010 (DB)
+// ---------------------------------------------------------------------------
+// Grab a random subset of alignments (testing tool)
+// ***************************************************************************
+
+#ifndef BAMTOOLS_RANDOM_H
+#define BAMTOOLS_RANDOM_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// Toolkit entry for the 'random' sub-command (per the file header: grabs a
+// random subset of alignments). Settings and implementation types are only
+// forward-declared here and held through pointers.
+class RandomTool : public AbstractTool
+{
+
+public:
+ RandomTool();
+ ~RandomTool();
+
+ // tool interface; both methods return an int status code
+public:
+ int Help();
+ int Run(int argc, char* argv[]);
+
+private:
+ // command-line settings (defined in the .cpp)
+ struct RandomSettings;
+ RandomSettings* m_settings;
+
+ // implementation object, created in Run() (defined in the .cpp)
+ struct RandomToolPrivate;
+ RandomToolPrivate* m_impl;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_RANDOM_H
diff --git a/src/toolkit/bamtools_resolve.cpp b/src/toolkit/bamtools_resolve.cpp
new file mode 100644
index 0000000..9f8c3e3
--- /dev/null
+++ b/src/toolkit/bamtools_resolve.cpp
@@ -0,0 +1,1523 @@
+// ***************************************************************************
+// bamtools_resolve.cpp (c) 2011
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 24 July 2013 (DB)
+// ---------------------------------------------------------------------------
+// Resolves paired-end reads (marking the IsProperPair flag as needed).
+// ***************************************************************************
+
+#include "bamtools_resolve.h"
+#include <api/BamReader.h>
+#include <api/BamWriter.h>
+#include <utils/bamtools_options.h>
+#include <utils/bamtools_utilities.h>
+#include "bamtools_version.h"
+using namespace BamTools;
+
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+// --------------------------------------------------------------------------
+// general ResolveTool constants
+// --------------------------------------------------------------------------
+
+// number of position/orientation models tracked per read group
+static const int NUM_MODELS = 8;
+// alignment tag queried to find an alignment's read group
+static const std::string READ_GROUP_TAG = "RG";
+// initial values for the corresponding ResolveSettings / ReadGroupResolver options
+static const double DEFAULT_CONFIDENCE_INTERVAL = 0.9973;
+static const uint16_t DEFAULT_MIN_MAPQUALITY = 1;
+static const double DEFAULT_UNUSEDMODEL_THRESHOLD = 0.1;
+
+// --------------------------------------------------------------------------
+// stats file constants
+// --------------------------------------------------------------------------
+
+// basic char/string constants
+static const char COMMENT_CHAR = '#';
+static const char EQUAL_CHAR = '=';
+static const char TAB_CHAR = '\t';
+
+static const std::string WHITESPACE_CHARS = " \t\n";
+static const std::string TRUE_KEYWORD = "true";
+static const std::string FALSE_KEYWORD = "false";
+
+// field counts
+// an option line splits on '=' into <keyword> and <value>
+static const std::size_t NUM_OPTIONS_FIELDS = 2;
+// a read group line holds the 7 whitespace-separated fields listed in RG_FIELD_DESCRIPTION
+static const std::size_t NUM_READGROUPS_FIELDS = 7;
+
+// header strings
+static const std::string INPUT_TOKEN = "[Input]";
+static const std::string OPTIONS_TOKEN = "[Options]";
+static const std::string READGROUPS_TOKEN = "[ReadGroups]";
+
+// option keywords
+static const std::string OPTION_CONFIDENCEINTERVAL = "ConfidenceInterval";
+static const std::string OPTION_MINIMUMMAPQUALITY = "MinimumMapQuality";
+static const std::string OPTION_UNUSEDMODELTHRESHOLD = "UnusedModelThreshold";
+static const std::string OPTION_FORCEMARKREADGROUPS = "ForceMarkReadGroups";
+
+// other string constants
+static const std::string RG_FIELD_DESCRIPTION =
+ "#<name> <medianFL> <minFL> <maxFL> <topModelID> <nextTopModelID> <isAmbiguous?>";
+
+static const std::string MODEL_DESCRIPTION =
+ "# ------------- Model Types Description ---------------\n"
+ "#\n"
+ "# ID Position Orientation \n"
+ "# 1 mate1 < mate2 mate1:forward, mate2:forward \n"
+ "# 2 mate1 < mate2 mate1:forward, mate2:reverse \n"
+ "# 3 mate1 < mate2 mate1:reverse, mate2:forward \n"
+ "# 4 mate1 < mate2 mate1:reverse, mate2:reverse \n"
+ "# 5 mate2 < mate1 mate2:forward, mate1:forward \n"
+ "# 6 mate2 < mate1 mate2:forward, mate1:reverse \n"
+ "# 7 mate2 < mate1 mate2:reverse, mate1:forward \n"
+ "# 8 mate2 < mate1 mate2:reverse, mate1:reverse \n"
+ "# -----------------------------------------------------\n";
+
+// --------------------------------------------------------------------------
+// unique readname file constants
+// --------------------------------------------------------------------------
+
+static const std::string READNAME_FILE_SUFFIX = ".uniq_names.txt";
+static const std::string DEFAULT_READNAME_FILE = "bt_resolve_TEMP" + READNAME_FILE_SUFFIX;
+
+// --------------------------------------------------------------------------
+// ModelType implementation
+
+// One alignment model: an ID plus the fragment lengths observed for it.
+// The container-like methods simply forward to the FragmentLengths vector.
+struct ModelType
+{
+    // data members
+    uint16_t ID;
+    std::vector<int32_t> FragmentLengths;
+
+    // ctor - stores the ID and reserves room for 10K fragment lengths up front
+    ModelType(const uint16_t id)
+        : ID(id)
+    {
+        FragmentLengths.reserve(10000);
+    }
+
+    // iteration over the stored fragment lengths
+    std::vector<int32_t>::iterator begin() { return FragmentLengths.begin(); }
+    std::vector<int32_t>::const_iterator begin() const { return FragmentLengths.begin(); }
+    std::vector<int32_t>::iterator end() { return FragmentLengths.end(); }
+    std::vector<int32_t>::const_iterator end() const { return FragmentLengths.end(); }
+
+    // number of stored fragment lengths
+    std::size_t size() const { return FragmentLengths.size(); }
+
+    // modifiers
+    void push_back(const int32_t& x) { FragmentLengths.push_back(x); }
+    void clear() { FragmentLengths.clear(); }
+
+    // ID value used to mean "no/unknown model"
+    static const uint16_t DUMMY_ID;
+};
+
+const uint16_t ModelType::DUMMY_ID = 100;
+
+// Orders models by how many fragment lengths they hold (more fragments == "greater").
+bool operator>(const ModelType& lhs, const ModelType& rhs)
+{
+    return rhs.size() < lhs.size();
+}
+
+// Returns the zero-based model index (0-7) for an alignment, i.e. model ID - 1.
+// Indices 0-3 cover mate1 starting before mate2; indices 4-7 cover every other
+// case (mate2 first, or both at the same position).
+uint16_t CalculateModelType(const BamAlignment& al)
+{
+    // work out which of (this alignment, its mate) is mate1 / mate2
+    const bool alignmentIsMate1 = al.IsFirstMate();
+    const int32_t mate1Start = (alignmentIsMate1 ? al.Position : al.MatePosition);
+    const int32_t mate2Start = (alignmentIsMate1 ? al.MatePosition : al.Position);
+    const bool mate1Reverse =
+        (alignmentIsMate1 ? al.IsReverseStrand() : al.IsMateReverseStrand());
+    const bool mate2Reverse =
+        (alignmentIsMate1 ? al.IsMateReverseStrand() : al.IsReverseStrand());
+
+    // mate1 first: (mate1 strand, mate2 strand) form a 2-bit number -> 0..3
+    if (mate1Start < mate2Start)
+        return static_cast<uint16_t>((mate1Reverse ? 2 : 0) + (mate2Reverse ? 1 : 0));
+
+    // otherwise: (mate2 strand, mate1 strand) form a 2-bit number -> 4..7
+    return static_cast<uint16_t>(4 + (mate2Reverse ? 2 : 0) + (mate1Reverse ? 1 : 0));
+}
+
+// --------------------------------------------------------------------------
+// ReadGroupResolver implementation
+
+// Per-read-group state: collected fragment-length models, the two selected
+// "top" models and the fragment-length bounds derived from them.
+struct ReadGroupResolver
+{
+
+ // data members
+ // fragment-length bounds & median, filled in by DetermineTopModels()
+ int32_t MinFragmentLength;
+ int32_t MedianFragmentLength;
+ int32_t MaxFragmentLength;
+ // IDs of the two most populated models (ModelType::DUMMY_ID until determined)
+ uint16_t TopModelId;
+ uint16_t NextTopModelId;
+ // set when the remaining models exceed UnusedModelThreshold
+ bool IsAmbiguous;
+ // set when DetermineTopModels() found at least one fragment
+ bool HasData;
+ std::vector<ModelType> Models;
+ // read name -> flag stored for that read
+ std::map<std::string, bool> ReadNames;
+
+ // ctor
+ ReadGroupResolver();
+
+ // resolving methods
+ bool IsValidInsertSize(const BamAlignment& al) const;
+ bool IsValidOrientation(const BamAlignment& al) const;
+
+ // select 2 best models based on observed data
+ void DetermineTopModels(const std::string& readGroupName);
+
+ // static settings
+ static double ConfidenceInterval;
+ static double UnusedModelThreshold;
+ static void SetConfidenceInterval(const double& ci);
+ static void SetUnusedModelThreshold(const double& umt);
+};
+
+double ReadGroupResolver::ConfidenceInterval = DEFAULT_CONFIDENCE_INTERVAL;
+double ReadGroupResolver::UnusedModelThreshold = DEFAULT_UNUSEDMODEL_THRESHOLD;
+
+// Starts with zeroed fragment-length stats, no selected models, and one empty
+// ModelType for each of the model IDs 1..NUM_MODELS.
+ReadGroupResolver::ReadGroupResolver()
+    : MinFragmentLength(0)
+    , MedianFragmentLength(0)
+    , MaxFragmentLength(0)
+    , TopModelId(ModelType::DUMMY_ID)
+    , NextTopModelId(ModelType::DUMMY_ID)
+    , IsAmbiguous(false)
+    , HasData(false)
+{
+    // Models[k] holds model ID k+1
+    Models.reserve(NUM_MODELS);
+    for (uint16_t modelId = 1; modelId <= NUM_MODELS; ++modelId)
+        Models.push_back(ModelType(modelId));
+}
+
+// True when |InsertSize| lies within [MinFragmentLength, MaxFragmentLength].
+bool ReadGroupResolver::IsValidInsertSize(const BamAlignment& al) const
+{
+    const int32_t fragmentLength = abs(al.InsertSize);
+    if (fragmentLength < MinFragmentLength) return false;
+    return (fragmentLength <= MaxFragmentLength);
+}
+
+// True when the alignment's model matches one of the two selected top models.
+bool ReadGroupResolver::IsValidOrientation(const BamAlignment& al) const
+{
+    // CalculateModelType() yields an array index; model IDs are index + 1
+    const uint16_t alignmentModelId = CalculateModelType(al) + 1;
+    if (alignmentModelId == TopModelId) return true;
+    return (alignmentModelId == NextTopModelId);
+}
+
+// Maps a fractional position within a sorted list of 'count' (> 0) values to a
+// usable index: truncates toward zero and pins the result to [0, count - 1].
+static unsigned int ClampToFragmentIndex(const double position, const unsigned int count)
+{
+    const unsigned int lastIndex = count - 1;
+    if (!(position > 0.0)) return 0;  // negative values (and NaN) map to the first element
+    if (position >= static_cast<double>(lastIndex)) return lastIndex;
+    return static_cast<unsigned int>(position);
+}
+
+// Picks the two most populated models for this read group and derives the
+// min / median / max fragment lengths from their combined fragments.
+//
+// readGroupName is only used in the warning text. On return TopModelId and
+// NextTopModelId are set; IsAmbiguous is raised when the other six models hold
+// more than UnusedModelThreshold (relative to the top two); HasData tells
+// whether any fragment was available. The per-model fragment data is discarded
+// once the statistics have been computed.
+void ReadGroupResolver::DetermineTopModels(const std::string& readGroupName)
+{
+
+    // Models is emptied at the end of this method, so a repeated call has
+    // nothing to work with; bail out rather than index an empty vector
+    if (Models.size() < static_cast<std::size_t>(NUM_MODELS)) return;
+
+    // sort models (from most common to least common)
+    std::sort(Models.begin(), Models.end(), std::greater<ModelType>());
+
+    // store top 2 models for later
+    TopModelId = Models[0].ID;
+    NextTopModelId = Models[1].ID;
+
+    // make sure that the 2 most common models are some threshold more common
+    // than the remaining models
+    const unsigned int activeModelCountSum = Models[0].size() + Models[1].size();
+    if (activeModelCountSum == 0) return;  // skip if no data in this read group
+    const unsigned int unusedModelCountSum = Models[2].size() + Models[3].size() +
+                                             Models[4].size() + Models[5].size() +
+                                             Models[6].size() + Models[7].size();
+    const double unusedPercentage = (double)unusedModelCountSum / (double)activeModelCountSum;
+    if (unusedPercentage > UnusedModelThreshold) {
+        std::cerr << "WARNING: " << readGroupName << " does not have clearly defined 'top models'"
+                  << std::endl
+                  << " The fraction of alignments in bottom 6 models (" << unusedPercentage
+                  << ") exceeds threshold: " << UnusedModelThreshold << std::endl;
+        IsAmbiguous = true;
+    }
+
+    // emit a warning if the best alignment models are non-standard
+    const bool isModel1Top = (TopModelId == 1) || (NextTopModelId == 1);
+    const bool isModel2Top = (TopModelId == 2) || (NextTopModelId == 2);
+    const bool isModel4Top = (TopModelId == 4) || (NextTopModelId == 4);
+    const bool isModel5Top = (TopModelId == 5) || (NextTopModelId == 5);
+    const bool isModel6Top = (TopModelId == 6) || (NextTopModelId == 6);
+    const bool isModel8Top = (TopModelId == 8) || (NextTopModelId == 8);
+
+    const bool isMatePair = (isModel4Top && isModel5Top);
+    const bool isPairedEnd = (isModel2Top && isModel6Top);
+    const bool isSolidPair = (isModel1Top && isModel8Top);
+
+    if (!isMatePair && !isPairedEnd && !isSolidPair) {
+        std::cerr << "WARNING: Found a non-standard alignment model configuration. " << std::endl
+                  << " Using alignment models " << TopModelId << " & " << NextTopModelId
+                  << std::endl;
+    }
+
+    // store only the fragments from the best alignment models, then sort
+    std::vector<int32_t> fragments;
+    fragments.reserve(Models[0].size() + Models[1].size());
+    fragments.insert(fragments.end(), Models[0].begin(), Models[0].end());
+    fragments.insert(fragments.end(), Models[1].begin(), Models[1].end());
+    std::sort(fragments.begin(), fragments.end());
+
+    // clear out Model fragment data, not needed anymore
+    Models.clear();
+
+    // skip if no fragments found for this read group
+    if (fragments.empty()) {
+        HasData = false;
+        return;
+    }
+    HasData = true;
+
+    // calculate & store the min, median, & max fragment lengths.
+    // Every index is pinned to the valid range: with a confidence interval of
+    // exactly 1.0 the raw upper position equals fragments.size(), which would
+    // otherwise read one element past the end.
+    const unsigned int numFragmentLengths = fragments.size();
+    const double halfNonConfidenceInterval = (1.0 - ReadGroupResolver::ConfidenceInterval) / 2.0;
+    const unsigned int minIndex =
+        ClampToFragmentIndex(numFragmentLengths * halfNonConfidenceInterval, numFragmentLengths);
+    const unsigned int medianIndex =
+        ClampToFragmentIndex(numFragmentLengths * 0.5, numFragmentLengths);
+    const unsigned int maxIndex = ClampToFragmentIndex(
+        numFragmentLengths * (1.0 - halfNonConfidenceInterval), numFragmentLengths);
+
+    MinFragmentLength = fragments[minIndex];
+    MedianFragmentLength = fragments[medianIndex];
+    MaxFragmentLength = fragments[maxIndex];
+}
+
+// Replaces the confidence interval shared by all ReadGroupResolver instances.
+void ReadGroupResolver::SetConfidenceInterval(const double& ci)
+{
+    ReadGroupResolver::ConfidenceInterval = ci;
+}
+
+// Replaces the unused-model threshold shared by all ReadGroupResolver instances.
+void ReadGroupResolver::SetUnusedModelThreshold(const double& umt)
+{
+    ReadGroupResolver::UnusedModelThreshold = umt;
+}
+
+// --------------------------------------------------------------------------
+// ResolveSettings implementation
+
+// Plain settings holder for the resolve tool: mode flags, "was this option
+// given" flags, file names and option values.
+struct ResolveTool::ResolveSettings
+{
+
+ // modes
+ bool IsMakeStats;
+ bool IsMarkPairs;
+ bool IsTwoPass;
+
+ // I/O flags
+ bool HasInputBamFile;
+ bool HasOutputBamFile;
+ bool HasStatsFile;
+ bool IsForceCompression;
+
+ // resolve option flags
+ bool HasConfidenceInterval;
+ bool HasForceMarkReadGroups;
+ bool HasMinimumMapQuality;
+ bool HasUnusedModelThreshold;
+
+ // I/O filenames
+ std::string InputBamFilename;
+ std::string OutputBamFilename;
+ std::string StatsFilename;
+ std::string ReadNamesFilename; // ** N.B. - Only used internally, not set from cmdline **
+
+ // resolve options
+ double ConfidenceInterval;
+ uint16_t MinimumMapQuality;
+ double UnusedModelThreshold;
+
+ // constructor
+ // all flags start false; input/output default to Options::StandardIn()/StandardOut();
+ // StatsFilename is not listed below and therefore starts as an empty string
+ ResolveSettings()
+ : IsMakeStats(false)
+ , IsMarkPairs(false)
+ , IsTwoPass(false)
+ , HasInputBamFile(false)
+ , HasOutputBamFile(false)
+ , HasStatsFile(false)
+ , IsForceCompression(false)
+ , HasConfidenceInterval(false)
+ , HasForceMarkReadGroups(false)
+ , HasMinimumMapQuality(false)
+ , HasUnusedModelThreshold(false)
+ , InputBamFilename(Options::StandardIn())
+ , OutputBamFilename(Options::StandardOut())
+ , ReadNamesFilename(DEFAULT_READNAME_FILE)
+ , ConfidenceInterval(DEFAULT_CONFIDENCE_INTERVAL)
+ , MinimumMapQuality(DEFAULT_MIN_MAPQUALITY)
+ , UnusedModelThreshold(DEFAULT_UNUSEDMODEL_THRESHOLD)
+ {}
+};
+
+// --------------------------------------------------------------------------
+// ReadNamesFileReader implementation
+
+// Reads the tab-separated "<read group>\t<read name>" file and stores the
+// read names with the matching ReadGroupResolver.
+struct ResolveTool::ReadNamesFileReader
+{
+
+ // ctor & dtor
+ // (the destructor closes the file if it is still open)
+ ReadNamesFileReader() {}
+ ~ReadNamesFileReader()
+ {
+ Close();
+ }
+
+ // main reader interface
+public:
+ void Close();
+ bool Open(const std::string& filename);
+ bool Read(std::map<std::string, ReadGroupResolver>& readGroups);
+
+ // data members
+private:
+ std::ifstream m_stream;
+};
+
+// Closes the underlying file; does nothing when no file is open.
+void ResolveTool::ReadNamesFileReader::Close()
+{
+    if (!m_stream.is_open()) return;
+    m_stream.close();
+}
+
+// Opens 'filename' for reading (closing any previously opened file first).
+// Returns the stream's good() state after the open attempt.
+bool ResolveTool::ReadNamesFileReader::Open(const std::string& filename)
+{
+    // drop whatever file was open before
+    Close();
+
+    m_stream.open(filename.c_str(), std::ifstream::in);
+    const bool opened = m_stream.good();
+    return opened;
+}
+
+// Loads every "<read group>\t<read name>" record from the open file into the
+// ReadNames map of the matching resolver in 'readGroups'.
+// Returns false if no file is open or a record names an unknown read group;
+// blank lines and lines without exactly 2 fields are skipped.
+bool ResolveTool::ReadNamesFileReader::Read(std::map<std::string, ReadGroupResolver>& readGroups)
+{
+    // nothing to read without an open file
+    if (!m_stream.is_open()) return false;
+
+    typedef std::map<std::string, ReadGroupResolver>::iterator ResolverIterator;
+
+    std::string record;
+    while (std::getline(m_stream, record)) {
+
+        // blank lines carry no data
+        if (record.empty()) continue;
+
+        // expect exactly: read group, read name
+        const std::vector<std::string> columns = Utilities::Split(record, TAB_CHAR);
+        if (columns.size() != 2) continue;
+
+        // an unknown read group aborts the whole read
+        const ResolverIterator match = readGroups.find(columns[0]);
+        if (match == readGroups.end()) return false;
+
+        // remember the read name with that read group's resolver
+        match->second.ReadNames.insert(std::make_pair(columns[1], true));
+    }
+
+    // reached end of input without errors
+    return true;
+}
+
+// --------------------------------------------------------------------------
+// ReadNamesFileWriter implementation
+
+// Writes "<read group>\t<read name>" records, one per line, to a text file.
+struct ResolveTool::ReadNamesFileWriter
+{
+
+ // ctor & dtor
+ // (the destructor closes the file if it is still open)
+ ReadNamesFileWriter() {}
+ ~ReadNamesFileWriter()
+ {
+ Close();
+ }
+
+ // main writer interface
+public:
+ void Close();
+ bool Open(const std::string& filename);
+ void Write(const std::string& readGroupName, const std::string& readName);
+
+ // data members
+private:
+ std::ofstream m_stream;
+};
+
+// Closes the underlying file; does nothing when no file is open.
+void ResolveTool::ReadNamesFileWriter::Close()
+{
+    if (!m_stream.is_open()) return;
+    m_stream.close();
+}
+
+// Opens 'filename' for writing (closing any previously opened file first).
+// Returns the stream's good() state after the open attempt.
+bool ResolveTool::ReadNamesFileWriter::Open(const std::string& filename)
+{
+    // drop whatever file was open before
+    Close();
+
+    m_stream.open(filename.c_str(), std::ofstream::out);
+    const bool opened = m_stream.good();
+    return opened;
+}
+
+// Appends one "<read group>\t<read name>" line and flushes the stream.
+void ResolveTool::ReadNamesFileWriter::Write(const std::string& readGroupName,
+                                             const std::string& readName)
+{
+    m_stream << readGroupName << TAB_CHAR;
+    m_stream << readName << std::endl;
+}
+
+// --------------------------------------------------------------------------
+// StatsFileReader implementation
+
+// Parses a stats file made of [Input], [Options] and [ReadGroups] sections
+// into ResolveSettings and a map of ReadGroupResolver entries.
+struct ResolveTool::StatsFileReader
+{
+
+ // ctor & dtor
+ // (the destructor closes the file if it is still open)
+public:
+ StatsFileReader() {}
+ ~StatsFileReader()
+ {
+ Close();
+ }
+
+ // main reader interface
+public:
+ void Close();
+ bool Open(const std::string& filename);
+ bool Read(ResolveTool::ResolveSettings* settings,
+ std::map<std::string, ReadGroupResolver>& readGroups);
+
+ // internal methods
+private:
+ bool IsComment(const std::string& line) const;
+ bool IsWhitespace(const std::string& line) const;
+ bool ParseInputLine(const std::string& line);
+ bool ParseOptionLine(const std::string& line, ResolveTool::ResolveSettings* settings);
+ bool ParseReadGroupLine(const std::string& line,
+ std::map<std::string, ReadGroupResolver>& readGroups);
+ std::string SkipCommentsAndWhitespace();
+
+ // data members
+private:
+ std::ifstream m_stream;
+
+ // section of the stats file that Read() is currently inside
+ enum State
+ {
+ None = 0,
+ InInput,
+ InOptions,
+ InReadGroups
+ };
+};
+
+// Closes the underlying file; does nothing when no file is open.
+void ResolveTool::StatsFileReader::Close()
+{
+    if (!m_stream.is_open()) return;
+    m_stream.close();
+}
+
+// True when the (non-empty) line starts with the comment character.
+bool ResolveTool::StatsFileReader::IsComment(const std::string& line) const
+{
+    assert(!line.empty());
+    const char leadingChar = line.at(0);
+    return (leadingChar == COMMENT_CHAR);
+}
+
+// True when the line is empty or its first character is whitespace.
+// Only the first character is inspected.
+bool ResolveTool::StatsFileReader::IsWhitespace(const std::string& line) const
+{
+    if (line.empty()) return true;
+
+    // isspace() is only defined for values representable as unsigned char (or EOF);
+    // a plain char may be negative for non-ASCII bytes, so convert first
+    const unsigned char leadingChar = static_cast<unsigned char>(line.at(0));
+    return (isspace(leadingChar) != 0);
+}
+
+// Opens 'filename' for reading (closing any previously opened file first).
+// Returns the stream's good() state after the open attempt.
+bool ResolveTool::StatsFileReader::Open(const std::string& filename)
+{
+    // drop whatever file was open before
+    Close();
+
+    m_stream.open(filename.c_str(), std::ifstream::in);
+    const bool opened = m_stream.good();
+    return opened;
+}
+
+// Lines of the [Input] section are accepted but not used; the tool takes its
+// input from the command line instead. Always returns true.
+bool ResolveTool::StatsFileReader::ParseInputLine(const std::string& /*line*/)
+{
+    return true;
+}
+
+// Parses one "<keyword>=<value>" line of the [Options] section into 'settings'.
+// Returns false if the line does not split into exactly two fields or the
+// keyword is not recognized (an error is printed in the latter case).
+bool ResolveTool::StatsFileReader::ParseOptionLine(const std::string& line,
+                                                   ResolveTool::ResolveSettings* settings)
+{
+    // split line into option, value
+    std::vector<std::string> fields = Utilities::Split(line, EQUAL_CHAR);
+    if (fields.size() != NUM_OPTIONS_FIELDS) return false;
+    const std::string& option = fields.at(0);
+    const std::string& rawValue = fields.at(1);
+    std::stringstream value(rawValue);
+
+    // -----------------------------------
+    // handle option based on keyword
+
+    // ConfidenceInterval
+    if (option == OPTION_CONFIDENCEINTERVAL) {
+        value >> settings->ConfidenceInterval;
+        settings->HasConfidenceInterval = true;
+        return true;
+    }
+
+    // ForceMarkReadGroups
+    // The stats file stores this flag as the words "true"/"false", which a
+    // plain numeric bool extraction rejects (leaving the flag false). Compare
+    // against the keywords first; numeric 0/1 values are still accepted.
+    if (option == OPTION_FORCEMARKREADGROUPS) {
+        if (rawValue == TRUE_KEYWORD)
+            settings->HasForceMarkReadGroups = true;
+        else if (rawValue == FALSE_KEYWORD)
+            settings->HasForceMarkReadGroups = false;
+        else
+            value >> settings->HasForceMarkReadGroups;
+        return true;
+    }
+
+    // MinimumMapQuality
+    if (option == OPTION_MINIMUMMAPQUALITY) {
+        value >> settings->MinimumMapQuality;
+        settings->HasMinimumMapQuality = true;
+        return true;
+    }
+
+    // UnusedModelThreshold
+    if (option == OPTION_UNUSEDMODELTHRESHOLD) {
+        value >> settings->UnusedModelThreshold;
+        settings->HasUnusedModelThreshold = true;
+        return true;
+    }
+
+    // otherwise unknown option
+    std::cerr << "bamtools resolve ERROR - unrecognized option: " << option << " in stats file"
+              << std::endl;
+    return false;
+}
+
+// Parses one line of the [ReadGroups] section:
+//   <name> <medianFL> <minFL> <maxFL> <topModelID> <nextTopModelID> <isAmbiguous?>
+// and inserts the resulting resolver into 'readGroups' under <name>.
+// Returns false if the line does not hold exactly NUM_READGROUPS_FIELDS fields.
+bool ResolveTool::StatsFileReader::ParseReadGroupLine(
+    const std::string& line, std::map<std::string, ReadGroupResolver>& readGroups)
+{
+    // break the line apart on whitespace
+    const std::vector<std::string> columns = Utilities::Split(line, WHITESPACE_CHARS);
+    if (columns.size() != NUM_READGROUPS_FIELDS) return false;
+
+    ReadGroupResolver parsedResolver;
+
+    // numeric columns - each one is converted through its own stream
+    std::istringstream medianText(columns.at(1));
+    medianText >> parsedResolver.MedianFragmentLength;
+
+    std::istringstream minText(columns.at(2));
+    minText >> parsedResolver.MinFragmentLength;
+
+    std::istringstream maxText(columns.at(3));
+    maxText >> parsedResolver.MaxFragmentLength;
+
+    std::istringstream topModelText(columns.at(4));
+    topModelText >> parsedResolver.TopModelId;
+
+    std::istringstream nextTopModelText(columns.at(5));
+    nextTopModelText >> parsedResolver.NextTopModelId;
+
+    // last column is the literal keyword "true" when the read group is ambiguous
+    parsedResolver.IsAmbiguous = (columns.at(6) == TRUE_KEYWORD);
+
+    // first column is the read group name, used as the map key
+    readGroups.insert(std::make_pair(columns.at(0), parsedResolver));
+    return true;
+}
+
+// Reads the whole stats file: section headers switch the parsing state, every
+// other non-comment line is handed to the parser for the current section.
+// 'readGroups' is emptied first. Returns false if no file is open, 'settings'
+// is null, a data line appears before any section header, or a line fails to parse.
+bool ResolveTool::StatsFileReader::Read(ResolveTool::ResolveSettings* settings,
+                                        std::map<std::string, ReadGroupResolver>& readGroups)
+{
+    // need both an open file and somewhere to store the options
+    if (settings == 0 || !m_stream.is_open()) return false;
+
+    // start from an empty set of read groups
+    readGroups.clear();
+
+    State section = StatsFileReader::None;
+
+    // an empty string from SkipCommentsAndWhitespace() marks the end of input
+    for (std::string line = SkipCommentsAndWhitespace(); !line.empty();
+         line = SkipCommentsAndWhitespace()) {
+
+        // section headers only change state
+        if (Utilities::StartsWith(line, INPUT_TOKEN)) {
+            section = StatsFileReader::InInput;
+            continue;
+        }
+        if (Utilities::StartsWith(line, OPTIONS_TOKEN)) {
+            section = StatsFileReader::InOptions;
+            continue;
+        }
+        if (Utilities::StartsWith(line, READGROUPS_TOKEN)) {
+            section = StatsFileReader::InReadGroups;
+            continue;
+        }
+
+        // data line: dispatch on the section we are in
+        bool parsedOk = false;
+        switch (section) {
+            case StatsFileReader::InInput:
+                parsedOk = ParseInputLine(line);
+                break;
+            case StatsFileReader::InOptions:
+                parsedOk = ParseOptionLine(line, settings);
+                break;
+            case StatsFileReader::InReadGroups:
+                parsedOk = ParseReadGroupLine(line, readGroups);
+                break;
+            default:
+                // data before any section header
+                parsedOk = false;
+                break;
+        }
+        if (!parsedOk) return false;
+    }
+
+    // whole file consumed without errors
+    return true;
+}
+
+// Returns the next line that is neither empty, whitespace-led, nor a comment.
+// Returns an empty string once no further line can be read.
+std::string ResolveTool::StatsFileReader::SkipCommentsAndWhitespace()
+{
+    std::string line;
+    do {
+        // stop on end-of-file AND on any other read failure; checking only
+        // eof() would spin forever on a stream that failed without reaching EOF
+        if (!std::getline(m_stream, line)) return std::string();
+    } while (IsWhitespace(line) || IsComment(line));
+    return line;
+}
+
+// --------------------------------------------------------------------------
+// StatsFileWriter implementation
+
+// Writes a stats file: header comment, [Input], [Options] and [ReadGroups] sections.
+struct ResolveTool::StatsFileWriter
+{
+
+ // ctor & dtor
+ // (the destructor closes the file if it is still open)
+public:
+ StatsFileWriter() {}
+ ~StatsFileWriter()
+ {
+ Close();
+ }
+
+ // main writer interface
+public:
+ void Close();
+ bool Open(const std::string& filename);
+ bool Write(ResolveTool::ResolveSettings* settings,
+ const std::map<std::string, ReadGroupResolver>& readGroups);
+
+ // internal methods
+ // (one per section of the stats file, called in order by Write())
+private:
+ void WriteHeader();
+ void WriteInput(ResolveTool::ResolveSettings* settings);
+ void WriteOptions(ResolveTool::ResolveSettings* settings);
+ void WriteReadGroups(const std::map<std::string, ReadGroupResolver>& readGroups);
+
+ // data members
+private:
+ std::ofstream m_stream;
+};
+
+// Closes the underlying file; does nothing when no file is open.
+void ResolveTool::StatsFileWriter::Close()
+{
+    if (!m_stream.is_open()) return;
+    m_stream.close();
+}
+
+// Opens 'filename' for writing (closing any previously opened file first).
+// Returns the stream's good() state after the open attempt.
+bool ResolveTool::StatsFileWriter::Open(const std::string& filename)
+{
+    // drop whatever file was open before
+    Close();
+
+    m_stream.open(filename.c_str(), std::ofstream::out);
+    const bool opened = m_stream.good();
+    return opened;
+}
+
+// Writes the complete stats file (header, input, options, read groups).
+// Returns false only when no file is open.
+bool ResolveTool::StatsFileWriter::Write(ResolveTool::ResolveSettings* settings,
+                                         const std::map<std::string, ReadGroupResolver>& readGroups)
+{
+    const bool canWrite = m_stream.is_open();
+    if (canWrite) {
+        // sections are emitted in file order
+        WriteHeader();
+        WriteInput(settings);
+        WriteOptions(settings);
+        WriteReadGroups(readGroups);
+    }
+    return canWrite;
+}
+
+// Emits the leading comment block:
+//   # bamtools resolve (vX.Y.Z)
+//   #
+//   <MODEL_DESCRIPTION text>
+//   <blank line>
+void ResolveTool::StatsFileWriter::WriteHeader()
+{
+    // build the "vMAJOR.MINOR.PATCH" label from the bamtools version macros
+    std::stringstream versionLabel;
+    versionLabel << 'v' << BAMTOOLS_VERSION_MAJOR;
+    versionLabel << '.' << BAMTOOLS_VERSION_MINOR;
+    versionLabel << '.' << BAMTOOLS_VERSION_PATCH;
+
+    m_stream << COMMENT_CHAR << " bamtools resolve (" << versionLabel.str() << ')' << std::endl;
+    m_stream << COMMENT_CHAR << std::endl;
+    m_stream << MODEL_DESCRIPTION << std::endl;
+}
+
+// Emits the input section:
+//   [Input]
+//   <input BAM filename>
+//   <blank line>
+void ResolveTool::StatsFileWriter::WriteInput(ResolveTool::ResolveSettings* settings)
+{
+    m_stream << INPUT_TOKEN << std::endl;
+    m_stream << settings->InputBamFilename << std::endl;
+    m_stream << std::endl;
+}
+
+// Emits the options section, one "<keyword>=<value>" line per option:
+//   [Options]
+//   ConfidenceInterval=<double>
+//   ForceMarkReadGroups=<true|false>
+//   MinimumMapQuality=<uint16_t>
+//   UnusedModelThreshold=<double>
+//   <blank line>
+void ResolveTool::StatsFileWriter::WriteOptions(ResolveTool::ResolveSettings* settings)
+{
+    m_stream << OPTIONS_TOKEN << std::endl;
+
+    m_stream << OPTION_CONFIDENCEINTERVAL << EQUAL_CHAR << settings->ConfidenceInterval
+             << std::endl;
+
+    // boolalpha makes the flag appear as the word true/false
+    m_stream << OPTION_FORCEMARKREADGROUPS << EQUAL_CHAR << std::boolalpha
+             << settings->HasForceMarkReadGroups << std::endl;
+
+    m_stream << OPTION_MINIMUMMAPQUALITY << EQUAL_CHAR << settings->MinimumMapQuality
+             << std::endl;
+
+    m_stream << OPTION_UNUSEDMODELTHRESHOLD << EQUAL_CHAR << settings->UnusedModelThreshold
+             << std::endl;
+
+    // blank line closes the section
+    m_stream << std::endl;
+}
+
+// Emits the read group section: the [ReadGroups] token, the column description
+// comment, one tab-separated line per read group that has data, and a closing
+// blank line.
+void ResolveTool::StatsFileWriter::WriteReadGroups(
+    const std::map<std::string, ReadGroupResolver>& readGroups)
+{
+    // section token followed by the column legend
+    m_stream << READGROUPS_TOKEN << std::endl;
+    m_stream << RG_FIELD_DESCRIPTION << std::endl;
+
+    typedef std::map<std::string, ReadGroupResolver>::const_iterator ResolverIterator;
+    for (ResolverIterator it = readGroups.begin(); it != readGroups.end(); ++it) {
+        const ReadGroupResolver& stats = it->second;
+
+        // read groups without any data are left out of the file
+        if (!stats.HasData) continue;
+
+        // <name> <medianFL> <minFL> <maxFL> <topModelID> <nextTopModelID> <isAmbiguous?>
+        m_stream << it->first << TAB_CHAR;
+        m_stream << stats.MedianFragmentLength << TAB_CHAR;
+        m_stream << stats.MinFragmentLength << TAB_CHAR;
+        m_stream << stats.MaxFragmentLength << TAB_CHAR;
+        m_stream << stats.TopModelId << TAB_CHAR;
+        m_stream << stats.NextTopModelId << TAB_CHAR;
+        m_stream << std::boolalpha << stats.IsAmbiguous << std::endl;
+    }
+
+    // blank line closes the section
+    m_stream << std::endl;
+}
+
+// --------------------------------------------------------------------------
+// ResolveToolPrivate implementation
+
+// Implementation object behind ResolveTool: works from the settings pointer it
+// is given and keeps one ReadGroupResolver per read group name.
+struct ResolveTool::ResolveToolPrivate
+{
+
+ // ctor & dtor
+ // (the settings pointer is stored as-is; the destructor does not delete it)
+public:
+ ResolveToolPrivate(ResolveTool::ResolveSettings* settings)
+ : m_settings(settings)
+ {}
+ ~ResolveToolPrivate() {}
+
+ // 'public' interface
+public:
+ bool Run();
+
+ // internal methods
+private:
+ bool CheckSettings(std::vector<std::string>& errors);
+ bool MakeStats();
+ void ParseHeader(const SamHeader& header);
+ bool ReadStatsFile();
+ void ResolveAlignment(BamAlignment& al);
+ bool ResolvePairs();
+ bool WriteStatsFile();
+
+ // data members
+private:
+ ResolveTool::ResolveSettings* m_settings;
+ // read group name -> resolver state for that read group
+ std::map<std::string, ReadGroupResolver> m_readGroups;
+};
+
+// Validates the combination of command-line settings.
+// 'errors' is cleared and then filled with one message per problem found.
+// Returns true when no problems were found.
+bool ResolveTool::ResolveToolPrivate::CheckSettings(std::vector<std::string>& errors)
+{
+
+    // ensure clean slate
+    errors.clear();
+
+    // if MakeStats mode
+    if (m_settings->IsMakeStats) {
+
+        // ensure mutex mode
+        if (m_settings->IsMarkPairs)
+            errors.push_back(
+                "Cannot run in both -makeStats & -markPairs modes. Please select ONE.");
+        if (m_settings->IsTwoPass)
+            errors.push_back("Cannot run in both -makeStats & -twoPass modes. Please select ONE.");
+
+        // error if output BAM options supplied
+        if (m_settings->HasOutputBamFile)
+            errors.push_back("Cannot use -out (output BAM file) in -makeStats mode.");
+        if (m_settings->IsForceCompression)
+            errors.push_back(
+                "Cannot use -forceCompression. No output BAM file is being generated.");
+
+        // make sure required stats file supplied
+        if (!m_settings->HasStatsFile)
+            errors.push_back(
+                "Output stats filename required for -makeStats mode. Please specify one using "
+                "-stats option.");
+
+        // check for UseStats options
+        if (m_settings->HasForceMarkReadGroups)
+            errors.push_back(
+                "Cannot use -forceMarkReadGroups. -markPairs options are DISABLED in -makeStats "
+                "mode.");
+    }
+
+    // if MarkPairs mode
+    // (-makeStats is known to be off here, so only -twoPass can conflict)
+    else if (m_settings->IsMarkPairs) {
+
+        // ensure mutex mode
+        if (m_settings->IsTwoPass)
+            errors.push_back("Cannot run in both -markPairs & -twoPass modes. Please select ONE.");
+
+        // make sure required stats file supplied
+        if (!m_settings->HasStatsFile)
+            errors.push_back(
+                "Input stats filename required for -markPairs mode. Please specify one using "
+                "-stats option.");
+
+        // check for MakeStats options
+        if (m_settings->HasConfidenceInterval)
+            errors.push_back("Cannot use -ci. -makeStats options are DISABLED in -markPairs mode.");
+    }
+
+    // if TwoPass mode
+    // (both other modes are known to be off here, so no mutex check is needed)
+    else if (m_settings->IsTwoPass) {
+
+        // make sure input is file not stdin
+        if (!m_settings->HasInputBamFile || m_settings->InputBamFilename == Options::StandardIn())
+            errors.push_back(
+                "Cannot run -twoPass mode with BAM data from stdin. Please specify existing file "
+                "using -in option.");
+    }
+
+    // no mode selected
+    else
+        errors.push_back(
+            "No resolve mode specified. Please select ONE of the following: -makeStats, "
+            "-markPairs, or -twoPass. See help for more info.");
+
+    // boundary checks on values
+    if (m_settings->HasConfidenceInterval) {
+        if (m_settings->ConfidenceInterval < 0.0 || m_settings->ConfidenceInterval > 1.0)
+            errors.push_back("Invalid confidence interval. Must be between 0 and 1");
+    }
+    if (m_settings->HasMinimumMapQuality) {
+        if (m_settings->MinimumMapQuality >= 256)
+            errors.push_back("Invalid minimum map quality. Must be between 0 and 255");
+    }
+    if (m_settings->HasUnusedModelThreshold) {
+        if (m_settings->UnusedModelThreshold < 0.0 || m_settings->UnusedModelThreshold > 1.0)
+            errors.push_back("Invalid unused model threshold. Must be between 0 and 1");
+    }
+
+    // return success if no errors found
+    return (errors.empty());
+}
+
+// Runs the statistics-gathering pass over the input BAM file.
+//
+// Every alignment that is paired, mapped, has a mapped mate and lies on the same
+// reference as its mate is matched up with its mate by read name (per read group).
+// When both mates meet the minimum map quality, the read name is written to the
+// (temp) read names file and the absolute insert size is recorded under the
+// pair's model type. Once the whole file has been read, every read group's
+// resolver is asked to determine its top models and its read name map is cleared.
+//
+// Returns false if the input BAM file or the read names file cannot be opened,
+// or if an alignment carries a read group that is not present in m_readGroups;
+// returns true otherwise.
+bool ResolveTool::ResolveToolPrivate::MakeStats()
+{
+
+    // pull resolver settings from command-line settings
+    ReadGroupResolver::SetConfidenceInterval(m_settings->ConfidenceInterval);
+    ReadGroupResolver::SetUnusedModelThreshold(m_settings->UnusedModelThreshold);
+
+    // open our BAM reader
+    BamReader bamReader;
+    if (!bamReader.Open(m_settings->InputBamFilename)) {
+        std::cerr << "bamtools resolve ERROR: could not open input BAM file: "
+                  << m_settings->InputBamFilename << std::endl;
+        return false;
+    }
+
+    // retrieve header & parse for read groups
+    const SamHeader& header = bamReader.GetHeader();
+    ParseHeader(header);
+
+    // open ReadNamesFileWriter
+    ResolveTool::ReadNamesFileWriter readNamesWriter;
+    if (!readNamesWriter.Open(m_settings->ReadNamesFilename)) {
+        std::cerr << "bamtools resolve ERROR: could not open (temp) output read names file: "
+                  << m_settings->ReadNamesFilename << std::endl;
+        bamReader.Close();
+        return false;
+    }
+
+    // read through BAM file
+    BamAlignment al;
+    std::string readGroup;
+    std::map<std::string, ReadGroupResolver>::iterator rgIter;
+    std::map<std::string, bool>::iterator readNameIter;
+    while (bamReader.GetNextAlignmentCore(al)) {
+
+        // skip if alignment is not paired, mapped, nor mate is mapped
+        if (!al.IsPaired() || !al.IsMapped() || !al.IsMateMapped()) continue;
+
+        // skip if alignment & mate not on same reference sequence
+        if (al.RefID != al.MateRefID) continue;
+
+        // flesh out the char data, so we can retrieve its read group ID
+        al.BuildCharData();
+
+        // get read group from alignment (OK if empty)
+        readGroup.clear();
+        al.GetTag(READ_GROUP_TAG, readGroup);
+
+        // look up resolver for read group
+        rgIter = m_readGroups.find(readGroup);
+        if (rgIter == m_readGroups.end()) {
+            std::cerr << "bamtools resolve ERROR - unable to calculate stats, unknown read group "
+                         "encountered: "
+                      << readGroup << std::endl;
+
+            // close BOTH files before bailing out, mirroring the normal exit path below
+            readNamesWriter.Close();
+            bamReader.Close();
+            return false;
+        }
+        ReadGroupResolver& resolver = (*rgIter).second;
+
+        // determine unique-ness of current alignment
+        const bool isCurrentMateUnique = (al.MapQuality >= m_settings->MinimumMapQuality);
+
+        // look up read name
+        readNameIter = resolver.ReadNames.find(al.Name);
+
+        // if read name found (current alignment's mate already parsed)
+        if (readNameIter != resolver.ReadNames.end()) {
+
+            // if both mates are unique, store read name & insert size for later
+            const bool isStoredMateUnique = (*readNameIter).second;
+            if (isCurrentMateUnique && isStoredMateUnique) {
+
+                // save read name in temp file as candidates for later pair marking
+                readNamesWriter.Write(readGroup, al.Name);
+
+                // determine model type & store fragment length for stats calculation
+                const uint16_t currentModelType = CalculateModelType(al);
+                assert(currentModelType != ModelType::DUMMY_ID);
+                resolver.Models[currentModelType].push_back(abs(al.InsertSize));
+            }
+
+            // unique or not, remove read name from map
+            resolver.ReadNames.erase(readNameIter);
+        }
+
+        // if read name not found, store new entry
+        else
+            resolver.ReadNames.insert(std::make_pair(al.Name, isCurrentMateUnique));
+    }
+
+    // close files
+    readNamesWriter.Close();
+    bamReader.Close();
+
+    // iterate back through read groups
+    std::map<std::string, ReadGroupResolver>::iterator rgEnd = m_readGroups.end();
+    for (rgIter = m_readGroups.begin(); rgIter != rgEnd; ++rgIter) {
+        const std::string& name = (*rgIter).first;
+        ReadGroupResolver& resolver = (*rgIter).second;
+
+        // calculate acceptable orientation & insert sizes for this read group
+        resolver.DetermineTopModels(name);
+
+        // clear out left over read names
+        // (these have mates that did not pass filters or were already removed as non-unique)
+        resolver.ReadNames.clear();
+    }
+
+    // if we get here, return success
+    return true;
+}
+
+// Registers an empty ReadGroupResolver in m_readGroups for every read group
+// listed in the given SAM header, keyed by the read group's ID.
+void ResolveTool::ResolveToolPrivate::ParseHeader(const SamHeader& header)
+{
+    SamReadGroupConstIterator group = header.ReadGroups.ConstBegin();
+    const SamReadGroupConstIterator lastGroup = header.ReadGroups.ConstEnd();
+
+    // one (default-constructed) resolver per header read group
+    while (group != lastGroup) {
+        m_readGroups.insert(std::make_pair((*group).ID, ReadGroupResolver()));
+        ++group;
+    }
+}
+
+// Loads stats data from the file named by the settings' StatsFilename into
+// the settings object and m_readGroups.
+//
+// Returns false (silently) when no stats filename is set, and false with an
+// error message on stderr when the file cannot be opened or parsed.
+bool ResolveTool::ResolveToolPrivate::ReadStatsFile()
+{
+    // nothing to read from
+    if (m_settings->StatsFilename.empty()) return false;
+
+    ResolveTool::StatsFileReader statsReader;
+
+    // step 1: open the stats file
+    const bool isOpen = statsReader.Open(m_settings->StatsFilename);
+    if (!isOpen) {
+        std::cerr << "bamtools resolve ERROR - could not open stats file: "
+                  << m_settings->StatsFilename << " for reading" << std::endl;
+        return false;
+    }
+
+    // step 2: parse its contents
+    const bool isParsed = statsReader.Read(m_settings, m_readGroups);
+    if (!isParsed) {
+        std::cerr << "bamtools resolve ERROR - could not parse stats file: "
+                  << m_settings->StatsFilename << " for data" << std::endl;
+        return false;
+    }
+
+    return true;
+}
+
+// Decides whether a single alignment should carry the 'proper pair' flag.
+//
+// The flag is cleared first and only set again when the alignment passes every
+// check below, in order: paired, both mates mapped, same reference, sufficient
+// map quality, valid orientation & insert size for its read group, and its
+// name being among that read group's stored read names.
+//
+// Terminates the program (EXIT_FAILURE) if the alignment's read group is not
+// present in m_readGroups.
+void ResolveTool::ResolveToolPrivate::ResolveAlignment(BamAlignment& al)
+{
+    // assume "not a proper pair" until proven otherwise
+    al.SetIsProperPair(false);
+
+    // must be a paired-end read with both the alignment and its mate mapped
+    const bool isMappedPair = al.IsPaired() && al.IsMapped() && al.IsMateMapped();
+    if (!isMappedPair) return;
+
+    // both mates must be on the same reference
+    if (al.RefID != al.MateRefID) return;
+
+    // map quality must reach the cutoff
+    const bool passesMapQuality = (al.MapQuality >= m_settings->MinimumMapQuality);
+    if (!passesMapQuality) return;
+
+    // fetch the read group name; it stays empty if the tag is missing,
+    // which is fine - the empty read group is handled like any other
+    std::string readGroupName;
+    al.GetTag(READ_GROUP_TAG, readGroupName);
+
+    // find the 'resolver' belonging to this read group
+    const std::map<std::string, ReadGroupResolver>::iterator found =
+        m_readGroups.find(readGroupName);
+    if (found == m_readGroups.end()) {
+        std::cerr << "bamtools resolve ERROR - read group found that was not in header: "
+                  << readGroupName << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+    const ReadGroupResolver& resolver = found->second;
+
+    // orientation and insert size limits can differ for each read group
+    if (!resolver.IsValidOrientation(al) || !resolver.IsValidInsertSize(al)) return;
+
+    // finally, the read name must be a known "candidate proper pair"
+    const bool isCandidate = (resolver.ReadNames.find(al.Name) != resolver.ReadNames.end());
+    if (isCandidate) al.SetIsProperPair(true);
+}
+
+// Runs the pair-marking pass.
+//
+// Loads the candidate read names from the (temp) read names file into
+// m_readGroups, deletes that file (only a warning is printed if deletion
+// fails), then copies every alignment from the input BAM file to the output
+// BAM file after ResolveAlignment() has set or cleared its 'proper pair' flag.
+// Output is written uncompressed only when it goes to stdout and
+// -forceCompression was not given.
+//
+// Returns false if the read names file cannot be opened or read, or if either
+// BAM file cannot be opened; returns true otherwise.
+bool ResolveTool::ResolveToolPrivate::ResolvePairs()
+{
+
+    // open file containing read names of candidate proper pairs
+    ResolveTool::ReadNamesFileReader readNamesReader;
+    if (!readNamesReader.Open(m_settings->ReadNamesFilename)) {
+        std::cerr << "bamtools resolve ERROR: could not open (temp) input read names file: "
+                  << m_settings->ReadNamesFilename << std::endl;
+        return false;
+    }
+
+    // parse read names (matching with corresponding read groups)
+    if (!readNamesReader.Read(m_readGroups)) {
+        std::cerr << "bamtools resolve ERROR: could not read candidate read names from file: "
+                  << m_settings->ReadNamesFilename << std::endl;
+        readNamesReader.Close();
+        return false;
+    }
+
+    // close read name file reader & delete temp file
+    readNamesReader.Close();
+    if (remove(m_settings->ReadNamesFilename.c_str()) != 0) {
+        std::cerr << "bamtools resolve WARNING: could not delete temp file: "
+                  << m_settings->ReadNamesFilename << std::endl;
+    }
+
+    // open our BAM reader
+    BamReader reader;
+    if (!reader.Open(m_settings->InputBamFilename)) {
+        std::cerr << "bamtools resolve ERROR: could not open input BAM file: "
+                  << m_settings->InputBamFilename << std::endl;
+        return false;
+    }
+
+    // retrieve header & reference dictionary info
+    const SamHeader& header = reader.GetHeader();
+    const RefVector& references = reader.GetReferenceData();
+
+    // determine compression mode for BamWriter
+    bool writeUncompressed = (m_settings->OutputBamFilename == Options::StandardOut() &&
+                              !m_settings->IsForceCompression);
+    BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
+    if (writeUncompressed) compressionMode = BamWriter::Uncompressed;
+
+    // open BamWriter
+    BamWriter writer;
+    writer.SetCompressionMode(compressionMode);
+    if (!writer.Open(m_settings->OutputBamFilename, header, references)) {
+        std::cerr << "bamtools resolve ERROR: could not open " << m_settings->OutputBamFilename
+                  << " for writing." << std::endl;
+        reader.Close();
+        return false;
+    }
+
+    // plow through alignments, setting/clearing 'proper pair' flag
+    // and writing to new output BAM file
+    BamAlignment al;
+    while (reader.GetNextAlignment(al)) {
+        ResolveAlignment(al);
+        writer.SaveAlignment(al);
+    }
+
+    // clean up & return success
+    reader.Close();
+    writer.Close();
+    return true;
+}
+
+// Entry point of the resolve implementation.
+//
+// Validates the settings, seeds the read group map with the default (empty
+// name) read group, picks the read names filename and then dispatches on the
+// selected mode:
+//   -makeStats : MakeStats(), then WriteStatsFile() (both must succeed)
+//   -markPairs : ReadStatsFile(), then ResolvePairs() (both must succeed)
+//   otherwise  : (-twoPass) MakeStats(), optional WriteStatsFile() whose
+//                failure is only a warning, then ResolvePairs()
+//
+// Returns true on success; on failure a message is printed to stderr and
+// false is returned.
+bool ResolveTool::ResolveToolPrivate::Run()
+{
+    // reject unacceptable command line settings, listing every problem found
+    std::vector<std::string> problems;
+    if (!CheckSettings(problems)) {
+        std::cerr << "bamtools resolve ERROR - invalid settings: " << std::endl;
+        for (std::vector<std::string>::size_type i = 0; i < problems.size(); ++i)
+            std::cerr << problems[i] << std::endl;
+        return false;
+    }
+
+    // the default (empty name) read group always exists
+    m_readGroups.insert(std::make_pair(std::string(), ReadGroupResolver()));
+
+    // read names filename is derived from the stats filename when one is given
+    // (req'd for makeStats, markPairs modes; optional for twoPass);
+    // otherwise the default filename is kept
+    if (m_settings->HasStatsFile)
+        m_settings->ReadNamesFilename = m_settings->StatsFilename + READNAME_FILE_SUFFIX;
+
+    // ---- -makeStats mode ----
+    if (m_settings->IsMakeStats) {
+
+        if (!MakeStats()) {
+            std::cerr << "bamtools resolve ERROR - could not generate stats" << std::endl;
+            return false;
+        }
+
+        if (!WriteStatsFile()) {
+            std::cerr << "bamtools resolve ERROR - could not write stats file: "
+                      << m_settings->StatsFilename << std::endl;
+            return false;
+        }
+
+        return true;
+    }
+
+    // ---- -markPairs mode ----
+    if (m_settings->IsMarkPairs) {
+
+        if (!ReadStatsFile()) {
+            std::cerr << "bamtools resolve ERROR - could not read stats file: "
+                      << m_settings->StatsFilename << std::endl;
+            return false;
+        }
+
+        if (!ResolvePairs()) {
+            std::cerr << "bamtools resolve ERROR - could not resolve pairs" << std::endl;
+            return false;
+        }
+
+        return true;
+    }
+
+    // ---- -twoPass mode ----
+    if (!MakeStats()) {
+        std::cerr << "bamtools resolve ERROR - could not generate stats" << std::endl;
+        return false;
+    }
+
+    // a stats file is optional here: a failed write is only worth a warning,
+    // paired-end resolution is still allowed to proceed
+    if (m_settings->HasStatsFile && !WriteStatsFile()) {
+        std::cerr << "bamtools resolve WARNING - could not write stats file: "
+                  << m_settings->StatsFilename << std::endl;
+    }
+
+    if (!ResolvePairs()) {
+        std::cerr << "bamtools resolve ERROR - could not resolve pairs" << std::endl;
+        return false;
+    }
+
+    return true;
+}
+
+// Writes the settings and the per-read-group data in m_readGroups to the
+// file named by the settings' StatsFilename.
+//
+// Returns false (silently) when no stats filename is set, and false with an
+// error message on stderr when the file cannot be opened or written.
+bool ResolveTool::ResolveToolPrivate::WriteStatsFile()
+{
+    // nowhere to write to
+    if (m_settings->StatsFilename.empty()) return false;
+
+    ResolveTool::StatsFileWriter statsWriter;
+
+    // step 1: open the stats file
+    const bool isOpen = statsWriter.Open(m_settings->StatsFilename);
+    if (!isOpen) {
+        std::cerr << "bamtools resolve ERROR - could not open stats file: "
+                  << m_settings->StatsFilename << " for writing" << std::endl;
+        return false;
+    }
+
+    // step 2: write the stats data
+    const bool isWritten = statsWriter.Write(m_settings, m_readGroups);
+    if (!isWritten) {
+        std::cerr << "bamtools resolve ERROR - could not write stats file: "
+                  << m_settings->StatsFilename << " for data" << std::endl;
+        return false;
+    }
+
+    return true;
+}
+
+// --------------------------------------------------------------------------
+// ResolveTool implementation
+
+// Constructs the resolve tool: allocates its settings object and registers
+// the program description, usage text and every command-line option (bound
+// to the corresponding settings fields) with the Options parser.
+// The implementation object (m_impl) is not created here; it starts out null.
+ResolveTool::ResolveTool()
+    : AbstractTool()
+    , m_settings(new ResolveSettings)
+    , m_impl(0)
+{
+    // set description texts
+    // NOTE: adjacent string literals are concatenated verbatim, so every piece
+    // that is followed by another sentence must end with a space
+    const std::string programDescription =
+        "resolves paired-end reads (marking the IsProperPair flag as needed)";
+    const std::string programUsage =
+        "<mode> [options] [-in <filename>] [-out <filename> | [-forceCompression] ] [-stats "
+        "<filename>]";
+    // a single input file is opened by this tool, hence no "(s)"
+    const std::string inputBamDescription = "the input BAM file";
+    const std::string outputBamDescription = "the output BAM file";
+    const std::string statsFileDescription =
+        "input/output stats file, depending on selected mode (see below). "
+        "This file is human-readable, storing fragment length data generated per read group, as "
+        "well as "
+        "the options used to configure the -makeStats mode";
+    const std::string forceCompressionDescription =
+        "if results are sent to stdout (like when piping to another tool), "
+        "default behavior is to leave output uncompressed. "
+        "Use this flag to override and force compression. This feature is disabled in -makeStats "
+        "mode.";
+    const std::string makeStatsDescription =
+        "generates a fragment-length stats file from the input BAM. "
+        "Data is written to file specified using the -stats option. "
+        "MarkPairs Mode Settings are DISABLED.";
+    const std::string markPairsDescription =
+        "generates an output BAM with alignments marked with proper-pair status. "
+        "Stats data is read from file specified using the -stats option. "
+        "MakeStats Mode Settings are DISABLED";
+    const std::string twoPassDescription =
+        "combines the -makeStats & -markPairs modes into a single command. "
+        "However, due to the two-pass nature of paired-end resolution, piping BAM data via stdin "
+        "is DISABLED. "
+        "You must supply an explicit input BAM file. Output BAM may be piped to stdout, however, "
+        "if desired. "
+        "All MakeStats & MarkPairs Mode Settings are available. "
+        "The intermediate stats file is not necessary, but if the -stats options is used, then one "
+        "will be generated. "
+        "You may find this useful for documentation purposes.";
+    const std::string minMapQualDescription =
+        "minimum map quality. Used in -makeStats mode as a heuristic for determining a mate's "
+        "uniqueness. Used in -markPairs mode as a filter for marking candidate proper pairs.";
+    const std::string confidenceIntervalDescription =
+        "confidence interval. Set min/max fragment lengths such that we capture "
+        "this fraction of pairs";
+    const std::string unusedModelThresholdDescription =
+        "unused model threshold. The resolve tool considers 8 possible orientation models "
+        "for pairs. The top 2 are selected for later use when actually marking alignments. This "
+        "value determines the "
+        "cutoff for marking a read group as ambiguous. Meaning that if the ratio of the number of "
+        "alignments from bottom 6 models "
+        "to the top 2 is greater than this threshold, then the read group is flagged as ambiguous. "
+        "By default, NO alignments "
+        "from ambiguous read groups will be marked as proper pairs. You may override this behavior "
+        "with the -force option "
+        "in -markPairs mode";
+    const std::string forceMarkDescription =
+        "forces all read groups to be marked according to their top 2 'orientation models'. "
+        "When generating stats, the 2 (out of 8 possible) models with the most observations are "
+        "chosen as the top models for each read group. "
+        "If the remaining 6 models account for more than some threshold ([default=10%], see -umt), "
+        "then the read group is marked as ambiguous. "
+        "The default behavior is that for an ambiguous read group, NONE of its alignments are "
+        "marked as proper-pairs. "
+        "By setting this option, a read group's ambiguity flag will be ignored, and all of its "
+        "alignments will be compared to the top 2 models.";
+
+    // set program details
+    Options::SetProgramInfo("bamtools resolve", programDescription, programUsage);
+
+    // set up I/O options
+    OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
+    Options::AddValueOption("-in", "BAM filename", inputBamDescription, "",
+                            m_settings->HasInputBamFile, m_settings->InputBamFilename, IO_Opts,
+                            Options::StandardIn());
+    Options::AddValueOption("-out", "BAM filename", outputBamDescription, "",
+                            m_settings->HasOutputBamFile, m_settings->OutputBamFilename, IO_Opts,
+                            Options::StandardOut());
+    Options::AddValueOption("-stats", "STATS filename", statsFileDescription, "",
+                            m_settings->HasStatsFile, m_settings->StatsFilename, IO_Opts);
+    Options::AddOption("-forceCompression", forceCompressionDescription,
+                       m_settings->IsForceCompression, IO_Opts);
+
+    // set up the three mutually exclusive modes
+    OptionGroup* ModeOpts =
+        Options::CreateOptionGroup("Resolve Modes (must select ONE of the following)");
+    Options::AddOption("-makeStats", makeStatsDescription, m_settings->IsMakeStats, ModeOpts);
+    Options::AddOption("-markPairs", markPairsDescription, m_settings->IsMarkPairs, ModeOpts);
+    Options::AddOption("-twoPass", twoPassDescription, m_settings->IsTwoPass, ModeOpts);
+
+    // options shared by all modes
+    OptionGroup* GeneralOpts =
+        Options::CreateOptionGroup("General Resolve Options (available in all modes)");
+    Options::AddValueOption("-minMQ", "unsigned short", minMapQualDescription, "",
+                            m_settings->HasMinimumMapQuality, m_settings->MinimumMapQuality,
+                            GeneralOpts);
+
+    // options that only apply when generating stats
+    OptionGroup* MakeStatsOpts =
+        Options::CreateOptionGroup("MakeStats Mode Options (disabled in -markPairs mode)");
+    Options::AddValueOption("-ci", "double", confidenceIntervalDescription, "",
+                            m_settings->HasConfidenceInterval, m_settings->ConfidenceInterval,
+                            MakeStatsOpts);
+    Options::AddValueOption("-umt", "double", unusedModelThresholdDescription, "",
+                            m_settings->HasUnusedModelThreshold, m_settings->UnusedModelThreshold,
+                            MakeStatsOpts);
+
+    // options that only apply when marking pairs
+    OptionGroup* MarkPairsOpts =
+        Options::CreateOptionGroup("MarkPairs Mode Options (disabled in -makeStats mode)");
+    Options::AddOption("-force", forceMarkDescription, m_settings->HasForceMarkReadGroups,
+                       MarkPairsOpts);
+}
+
+// Releases the settings object allocated in the constructor and the
+// implementation object allocated in Run() (m_impl is still null, and thus
+// safe to delete, if Run() was never called).
+ResolveTool::~ResolveTool()
+{
+
+ delete m_settings;
+ m_settings = 0;
+
+ delete m_impl;
+ m_impl = 0;
+}
+
+// Displays the help text via Options::DisplayHelp(); always returns 0.
+int ResolveTool::Help()
+{
+ Options::DisplayHelp();
+ return 0;
+}
+
+// Parses the command line, creates the implementation object and runs it.
+// Returns 0 if the run succeeded, 1 otherwise.
+int ResolveTool::Run(int argc, char* argv[])
+{
+    // fill in the settings from the command line arguments
+    Options::Parse(argc, argv, 1);
+
+    // hand the settings to a fresh implementation object and let it do the work
+    m_impl = new ResolveToolPrivate(m_settings);
+    const bool succeeded = m_impl->Run();
+
+    // translate success/failure into an exit code
+    return succeeded ? 0 : 1;
+}
diff --git a/src/toolkit/bamtools_resolve.h b/src/toolkit/bamtools_resolve.h
new file mode 100644
index 0000000..26a902f
--- /dev/null
+++ b/src/toolkit/bamtools_resolve.h
@@ -0,0 +1,43 @@
+// ***************************************************************************
+// bamtools_resolve.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 23 June 2011
+// ---------------------------------------------------------------------------
+// Resolves paired-end reads (marking the IsProperPair flag as needed).
+// ***************************************************************************
+
+#ifndef BAMTOOLS_RESOLVE_H
+#define BAMTOOLS_RESOLVE_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// Toolkit command that resolves paired-end reads (marking the IsProperPair
+// flag as needed). Settings and implementation are kept behind pointers to
+// types that are only forward-declared in this header.
+class ResolveTool : public AbstractTool
+{
+
+public:
+ ResolveTool();
+ ~ResolveTool();
+
+public:
+ // displays the tool's help text
+ int Help();
+ // runs the tool on the given command line arguments
+ int Run(int argc, char* argv[]);
+
+private:
+ // command line settings
+ struct ResolveSettings;
+ ResolveSettings* m_settings;
+
+ // private implementation
+ struct ResolveToolPrivate;
+ ResolveToolPrivate* m_impl;
+
+ // helper types for the read names & stats files (defined elsewhere)
+ struct ReadNamesFileReader;
+ struct ReadNamesFileWriter;
+ struct StatsFileReader;
+ struct StatsFileWriter;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_RESOLVE_H
diff --git a/src/toolkit/bamtools_revert.cpp b/src/toolkit/bamtools_revert.cpp
new file mode 100644
index 0000000..bdc8afc
--- /dev/null
+++ b/src/toolkit/bamtools_revert.cpp
@@ -0,0 +1,212 @@
+// ***************************************************************************
+// bamtools_revert.cpp (c) 2010 Derek Barnett, Alistair Ward
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 7 April 2011
+// ---------------------------------------------------------------------------
+// Removes duplicate marks and restores original base qualities
+// ***************************************************************************
+
+#include "bamtools_revert.h"
+
+#include <api/BamReader.h>
+#include <api/BamWriter.h>
+#include <utils/bamtools_options.h>
+#include <utils/bamtools_utilities.h>
+using namespace BamTools;
+
+#include <iostream>
+#include <string>
+
+namespace BamTools {
+
+// name of the alignment tag whose contents replace an alignment's Qualities
+// when reverting (the tag is removed afterwards)
+static const std::string OQ_TAG = "OQ";
+
+} // namespace BamTools
+
+// ---------------------------------------------
+// RevertSettings implementation
+
+// Command line settings for the revert tool. The fields are bound to
+// command line options in the RevertTool constructor.
+struct RevertTool::RevertSettings
+{
+
+ // flags
+ bool HasInput;             // bound to -in
+ bool HasOutput;            // bound to -out
+ bool IsForceCompression;   // bound to -forceCompression
+ bool IsKeepDuplicateFlag;  // bound to -keepDuplicate
+ bool IsKeepQualities;      // bound to -keepQualities
+
+ // filenames
+ std::string InputFilename;
+ std::string OutputFilename;
+
+ // constructor
+ // all flags start out false; filenames start out as Options::StandardIn() / Options::StandardOut()
+ RevertSettings()
+ : HasInput(false)
+ , HasOutput(false)
+ , IsForceCompression(false)
+ , IsKeepDuplicateFlag(false)
+ , IsKeepQualities(false)
+ , InputFilename(Options::StandardIn())
+ , OutputFilename(Options::StandardOut())
+ {}
+};
+
+// ---------------------------------------------
+// RevertToolPrivate implementation
+
+// Private implementation of the revert tool. Stores the settings pointer it
+// is given; its destructor is empty, so the settings are not deleted here.
+struct RevertTool::RevertToolPrivate
+{
+
+ // ctor & dtor
+public:
+ RevertToolPrivate(RevertTool::RevertSettings* settings)
+ : m_settings(settings)
+ {}
+ ~RevertToolPrivate() {}
+
+ // 'public' interface
+public:
+ // reverts every alignment of the input BAM file into the output BAM file;
+ // returns false if either file could not be opened
+ bool Run();
+
+ // internal methods
+private:
+ // 'reverts' a single alignment according to the settings
+ void RevertAlignment(BamAlignment& al);
+
+ // data members
+private:
+ RevertTool::RevertSettings* m_settings;
+};
+
+// 'Reverts' a single BAM alignment in place.
+//
+// Unless disabled through the command line options, this
+//   1 - replaces Qualities with the contents of the OQ tag (and removes the
+//       tag); alignments without an OQ tag keep their qualities
+//   2 - clears the IsDuplicate flag
+void RevertTool::RevertToolPrivate::RevertAlignment(BamAlignment& al)
+{
+    // step 1: restore original qualities (skipped with -keepQualities)
+    const bool restoreQualities = !m_settings->IsKeepQualities;
+    if (restoreQualities) {
+        std::string oqContents;
+        const bool hasOqTag = al.GetTag(OQ_TAG, oqContents);
+        if (hasOqTag) {
+            al.Qualities = oqContents;
+            al.RemoveTag(OQ_TAG);
+        }
+    }
+
+    // step 2: clear duplicate mark (skipped with -keepDuplicate)
+    if (m_settings->IsKeepDuplicateFlag) return;
+    al.SetIsDuplicate(false);
+}
+
+// Copies every alignment from the input BAM file to the output BAM file,
+// passing each one through RevertAlignment() on the way. Output is written
+// uncompressed only when it goes to stdout and -forceCompression was not given.
+//
+// Returns false (after printing an error to stderr) if the input cannot be
+// opened for reading or the output cannot be opened for writing.
+bool RevertTool::RevertToolPrivate::Run()
+{
+    // open the input BAM file (no index is looked for here)
+    BamReader reader;
+    const bool inputOpened = reader.Open(m_settings->InputFilename);
+    if (!inputOpened) {
+        std::cerr << "bamtools revert ERROR: could not open " << m_settings->InputFilename
+                  << " for reading... Aborting." << std::endl;
+        return false;
+    }
+
+    // metadata that is passed on unchanged to the output file
+    const std::string& headerText = reader.GetHeaderText();
+    const RefVector& references = reader.GetReferenceData();
+
+    // pick the compression mode: uncompressed only for un-forced stdout output
+    const bool writeUncompressed =
+        (m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression);
+    const BamWriter::CompressionMode compressionMode =
+        writeUncompressed ? BamWriter::Uncompressed : BamWriter::Compressed;
+
+    // open the output BAM file
+    BamWriter writer;
+    writer.SetCompressionMode(compressionMode);
+    const bool outputOpened = writer.Open(m_settings->OutputFilename, headerText, references);
+    if (!outputOpened) {
+        std::cerr << "bamtools revert ERROR: could not open " << m_settings->OutputFilename
+                  << " for writing... Aborting." << std::endl;
+        reader.Close();
+        return false;
+    }
+
+    // revert & save each alignment in turn
+    for (BamAlignment al; reader.GetNextAlignment(al);) {
+        RevertAlignment(al);
+        writer.SaveAlignment(al);
+    }
+
+    // done - close both files
+    reader.Close();
+    writer.Close();
+    return true;
+}
+
+// ---------------------------------------------
+// RevertTool implementation
+
+// Constructs the revert tool: allocates its settings object and registers
+// the program description, usage text and every command-line option (bound
+// to the corresponding settings fields) with the Options parser.
+// The implementation object (m_impl) is not created here; it starts out null.
+RevertTool::RevertTool()
+    : AbstractTool()
+    , m_settings(new RevertSettings)
+    , m_impl(0)
+{
+    // set program details
+    // (usage shows a single -in: the settings hold one input filename and the
+    // tool opens exactly one input file)
+    Options::SetProgramInfo(
+        "bamtools revert",
+        "removes duplicate marks and restores original (non-recalibrated) base qualities",
+        "[-in <filename>] [-out <filename> | [-forceCompression]] "
+        "[revertOptions]");
+
+    // set up I/O options
+    OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
+    Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput,
+                            m_settings->InputFilename, IO_Opts, Options::StandardIn());
+    Options::AddValueOption("-out", "BAM filename", "the output BAM file", "",
+                            m_settings->HasOutput, m_settings->OutputFilename, IO_Opts,
+                            Options::StandardOut());
+    Options::AddOption("-forceCompression",
+                       "if results are sent to stdout (like when piping to another tool), default "
+                       "behavior is to leave output uncompressed. Use this flag to override and "
+                       "force compression",
+                       m_settings->IsForceCompression, IO_Opts);
+
+    // set up revert-specific options
+    OptionGroup* RevertOpts = Options::CreateOptionGroup("Revert Options");
+    Options::AddOption("-keepDuplicate", "keep duplicates marked", m_settings->IsKeepDuplicateFlag,
+                       RevertOpts);
+    Options::AddOption("-keepQualities", "keep base qualities (do not replace with OQ contents)",
+                       m_settings->IsKeepQualities, RevertOpts);
+}
+
+// Releases the settings object allocated in the constructor and the
+// implementation object allocated in Run() (m_impl is still null, and thus
+// safe to delete, if Run() was never called).
+RevertTool::~RevertTool()
+{
+
+ delete m_settings;
+ m_settings = 0;
+
+ delete m_impl;
+ m_impl = 0;
+}
+
+// Displays the help text via Options::DisplayHelp(); always returns 0.
+int RevertTool::Help()
+{
+ Options::DisplayHelp();
+ return 0;
+}
+
+// Parses the command line, creates the implementation object and runs it.
+// Returns 0 if the run succeeded, 1 otherwise.
+int RevertTool::Run(int argc, char* argv[])
+{
+    // fill in the settings from the command line arguments
+    Options::Parse(argc, argv, 1);
+
+    // initialize the implementation with those settings and let it do the work
+    m_impl = new RevertToolPrivate(m_settings);
+    const bool succeeded = m_impl->Run();
+
+    // translate success/failure into an exit code
+    return succeeded ? 0 : 1;
+}
diff --git a/src/toolkit/bamtools_revert.h b/src/toolkit/bamtools_revert.h
new file mode 100644
index 0000000..8e44fe3
--- /dev/null
+++ b/src/toolkit/bamtools_revert.h
@@ -0,0 +1,38 @@
+// ***************************************************************************
+// bamtools_revert.h (c) 2010 Derek Barnett, Alistair Ward
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 7 April 2011
+// ---------------------------------------------------------------------------
+// Removes duplicate marks and restores original base qualities
+// ***************************************************************************
+
+#ifndef BAMTOOLS_REVERT_H
+#define BAMTOOLS_REVERT_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// Toolkit command that removes duplicate marks and restores original base
+// qualities. Settings and implementation are kept behind pointers to types
+// that are only forward-declared in this header.
+class RevertTool : public AbstractTool
+{
+
+public:
+ RevertTool();
+ ~RevertTool();
+
+public:
+ // displays the tool's help text
+ int Help();
+ // runs the tool on the given command line arguments
+ int Run(int argc, char* argv[]);
+
+private:
+ // command line settings
+ struct RevertSettings;
+ RevertSettings* m_settings;
+
+ // private implementation
+ struct RevertToolPrivate;
+ RevertToolPrivate* m_impl;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_REVERT_H
diff --git a/src/toolkit/bamtools_sort.cpp b/src/toolkit/bamtools_sort.cpp
new file mode 100644
index 0000000..6c52f16
--- /dev/null
+++ b/src/toolkit/bamtools_sort.cpp
@@ -0,0 +1,381 @@
+// ***************************************************************************
+// bamtools_sort.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 27 March 2012 (DB)
+// ---------------------------------------------------------------------------
+// Sorts an input BAM file
+// ***************************************************************************
+
+#include "bamtools_sort.h"
+
+#include <api/BamMultiReader.h>
+#include <api/BamWriter.h>
+#include <api/SamConstants.h>
+#include <api/algorithms/Sort.h>
+#include <utils/bamtools_options.h>
+using namespace BamTools;
+using namespace BamTools::Algorithms;
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+// defaults
+//
+// ** These defaults should be tweaked & 'optimized' per testing ** //
+//
+// I say 'optimized' because each system will naturally perform
+// differently. We will attempt to determine a sensible
+// compromise that should perform well on average.
+//
+// SORT_DEFAULT_MAX_BUFFER_COUNT is the default for the -n option; it caps how
+// many alignments are buffered before a sorted temp file is written.
+// SORT_DEFAULT_MAX_BUFFER_MEMORY is the default for the -mem option.
+// NOTE(review): nothing in this file reads the resulting MaxBufferMemory
+// setting, so the memory limit is stored but not enforced here.
+const unsigned int SORT_DEFAULT_MAX_BUFFER_COUNT = 500000; // max numberOfAlignments for buffer
+const unsigned int SORT_DEFAULT_MAX_BUFFER_MEMORY = 1024; // Mb
+
+} // namespace BamTools
+
+// ---------------------------------------------
+// SortSettings implementation
+
+// Plain holder for the values that SortTool's constructor binds to the
+// command-line options (-in, -out, -byname, -n, -mem).
+struct SortTool::SortSettings
+{
+
+ // flags
+ // (each Has* flag is registered together with its value in SortTool's
+ // constructor; IsSortingByName is bound to -byname)
+ bool HasInputBamFilename;
+ bool HasMaxBufferCount;
+ bool HasMaxBufferMemory;
+ bool HasOutputBamFilename;
+ bool IsSortingByName;
+
+ // filenames
+ std::string InputBamFilename;
+ std::string OutputBamFilename;
+
+ // parameters
+ unsigned int MaxBufferCount; // alignments buffered per sorted temp file
+ unsigned int MaxBufferMemory; // Mb; not read anywhere else in this file
+
+ // constructor
+ // all flags start false; filenames default to the Options stdin/stdout
+ // names and the limits to the SORT_DEFAULT_* constants
+ SortSettings()
+ : HasInputBamFilename(false)
+ , HasMaxBufferCount(false)
+ , HasMaxBufferMemory(false)
+ , HasOutputBamFilename(false)
+ , IsSortingByName(false)
+ , InputBamFilename(Options::StandardIn())
+ , OutputBamFilename(Options::StandardOut())
+ , MaxBufferCount(SORT_DEFAULT_MAX_BUFFER_COUNT)
+ , MaxBufferMemory(SORT_DEFAULT_MAX_BUFFER_MEMORY)
+ {}
+};
+
+// ---------------------------------------------
+// SortToolPrivate implementation
+
+// Implementation of the sort: Run() first splits the input into sorted temp
+// BAM files (GenerateSortedRuns) and then merges them into the output file
+// (MergeSortedRuns).
+class SortTool::SortToolPrivate
+{
+
+ // ctor & dtor
+public:
+ SortToolPrivate(SortTool::SortSettings* settings);
+ ~SortToolPrivate() {}
+
+ // 'public' interface
+public:
+ bool Run();
+
+ // internal methods
+private:
+ // sorts 'buffer', writes it to the next temp file, then clears it
+ bool CreateSortedTempFile(std::vector<BamAlignment>& buffer);
+ bool GenerateSortedRuns();
+ bool MergeSortedRuns();
+ bool WriteTempFile(const std::vector<BamAlignment>& buffer, const std::string& tempFilename);
+ void SortBuffer(std::vector<BamAlignment>& buffer);
+
+ // data members
+private:
+ SortTool::SortSettings* m_settings; // not deleted by this class (its destructor is empty)
+ std::string m_tempFilenameStub; // prefix of every temp filename, set in the constructor
+ int m_numberOfRuns; // temp files created so far; also the next filename suffix
+ std::string m_headerText; // header text reused for temp and output writers
+ RefVector m_references; // reference data reused for temp and output writers
+ std::vector<std::string> m_tempFilenames; // temp files to merge and then remove
+};
+
+// constructor
+//
+// Derives the temp-file name stub from the input filename, so that sort runs
+// on different inputs don't trip on each other's temp files. Everything from
+// the last ".bam" onward is dropped; when the input name contains no ".bam"
+// the stub is just ".sort.temp.".
+SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings)
+    : m_settings(settings)
+    , m_numberOfRuns(0)
+{
+    if (m_settings) {
+        // search from the end: a ".bam" that appears earlier in the path
+        // (e.g. in a directory name) must not cut the stub short
+        const std::size_t extensionFound = m_settings->InputBamFilename.rfind(".bam");
+        if (extensionFound != std::string::npos)
+            m_tempFilenameStub = m_settings->InputBamFilename.substr(0, extensionFound);
+        m_tempFilenameStub.append(".sort.temp.");
+    }
+}
+
+// generates multiple sorted temp BAM files from single unsorted BAM file
+//
+// The input is read in chunks of at most MaxBufferCount alignments; each chunk
+// is handed to CreateSortedTempFile(). Returns false if the input cannot be
+// opened or if any temp file could not be written. In the latter case the temp
+// files created so far are removed, because merging an incomplete set of runs
+// would silently drop alignments from the sorted output.
+bool SortTool::SortToolPrivate::GenerateSortedRuns()
+{
+
+    // open input BAM file
+    BamReader reader;
+    if (!reader.Open(m_settings->InputBamFilename)) {
+        std::cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename
+                  << " for reading... Aborting." << std::endl;
+        return false;
+    }
+
+    // get basic data that will be shared by all temp/output files
+    SamHeader header = reader.GetHeader();
+    if (!header.HasVersion()) header.Version = Constants::SAM_CURRENT_VERSION;
+    header.SortOrder = (m_settings->IsSortingByName ? Constants::SAM_HD_SORTORDER_QUERYNAME
+                                                    : Constants::SAM_HD_SORTORDER_COORDINATE);
+    m_headerText = header.ToString();
+    m_references = reader.GetReferenceData();
+
+    // set up alignments buffer
+    BamAlignment al;
+    std::vector<BamAlignment> buffer;
+    buffer.reserve(static_cast<std::size_t>(m_settings->MaxBufferCount * 1.1));
+
+    // if sorting by name, we need to generate full char data, so can't use
+    // GetNextAlignmentCore(); sorting by position can take advantage of its speedup
+    const bool sortingByName = m_settings->IsSortingByName;
+
+    bool success = true;
+    while (success &&
+           (sortingByName ? reader.GetNextAlignment(al) : reader.GetNextAlignmentCore(al))) {
+
+        // buffer is "full": write its contents to a sorted temp file
+        // (which also empties it) before storing "al"
+        if (buffer.size() >= m_settings->MaxBufferCount) success = CreateSortedTempFile(buffer);
+
+        if (success) buffer.push_back(al);
+    }
+
+    // handle any leftover buffer contents
+    if (success && !buffer.empty()) success = CreateSortedTempFile(buffer);
+
+    // on failure, don't leave partial runs behind for the merge step or on disk
+    if (!success) {
+        std::cerr << "bamtools sort ERROR: could not create sorted temp file(s)... Aborting."
+                  << std::endl;
+        for (std::size_t i = 0; i < m_tempFilenames.size(); ++i)
+            remove(m_tempFilenames[i].c_str());
+        m_tempFilenames.clear();
+    }
+
+    // close reader & return success/fail
+    reader.Close();
+    return success;
+}
+
+bool SortTool::SortToolPrivate::CreateSortedTempFile(std::vector<BamAlignment>& buffer)
+{
+
+    // put the buffered alignments into the requested order
+    SortBuffer(buffer);
+
+    // temp filename = stub followed by the index of this run
+    std::stringstream nameBuilder;
+    nameBuilder << m_tempFilenameStub << m_numberOfRuns;
+    const std::string tempFilename = nameBuilder.str();
+
+    // dump the sorted buffer; the filename is remembered for the merge step
+    // whether or not the write succeeded
+    const bool written = WriteTempFile(buffer, tempFilename);
+    m_tempFilenames.push_back(tempFilename);
+
+    // next run starts with an empty buffer and the next index
+    buffer.clear();
+    ++m_numberOfRuns;
+
+    // TODO: a failure returned here is not actually caught and handled anywhere
+    return written;
+}
+
+// merges sorted temp BAM files into single sorted output BAM file,
+// then removes the temp files
+bool SortTool::SortToolPrivate::MergeSortedRuns()
+{
+
+    // one multi reader over every temp file
+    // (this might get broken up if we do a multi-pass system later ??)
+    BamMultiReader tempReader;
+    const bool readerOpened = tempReader.Open(m_tempFilenames);
+    if (!readerOpened) {
+        std::cerr << "bamtools sort ERROR: could not open BamMultiReader for merging temp "
+                     "files... Aborting."
+                  << std::endl;
+        return false;
+    }
+
+    // writer for the final, completely sorted output
+    BamWriter outputWriter;
+    const bool writerOpened =
+        outputWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references);
+    if (!writerOpened) {
+        std::cerr << "bamtools sort ERROR: could not open " << m_settings->OutputBamFilename
+                  << " for writing... Aborting." << std::endl;
+        tempReader.Close();
+        return false;
+    }
+
+    // copy alignments across for as long as the temp files supply them
+    BamAlignment current;
+    while (tempReader.GetNextAlignmentCore(current)) {
+        outputWriter.SaveAlignment(current);
+    }
+
+    // close files
+    tempReader.Close();
+    outputWriter.Close();
+
+    // delete all temp files
+    for (std::size_t i = 0; i < m_tempFilenames.size(); ++i) {
+        remove(m_tempFilenames[i].c_str());
+    }
+
+    // return success
+    return true;
+}
+
+bool SortTool::SortToolPrivate::Run()
+{
+    // single pass: chunk the input into smaller sorted temp files, then merge
+    // them through BamMultiReader; the merge is only attempted when the
+    // chunking step reported success
+    return GenerateSortedRuns() && MergeSortedRuns();
+}
+
+void SortTool::SortToolPrivate::SortBuffer(std::vector<BamAlignment>& buffer)
+{
+
+    // ** add further custom sort options later ?? **
+
+    // name ordering was requested
+    if (m_settings->IsSortingByName) {
+        std::stable_sort(buffer.begin(), buffer.end(), Sort::ByName());
+        return;
+    }
+
+    // otherwise order by position
+    std::stable_sort(buffer.begin(), buffer.end(), Sort::ByPosition());
+}
+
+// Writes every alignment in 'buffer' to a new BAM file named 'tempFilename',
+// using the header text and reference data captured from the input.
+// Returns false if the file cannot be opened or an alignment cannot be saved;
+// previously save failures were ignored and a partial file counted as success.
+// (assumes BamWriter::SaveAlignment() reports failure through its bool return)
+bool SortTool::SortToolPrivate::WriteTempFile(const std::vector<BamAlignment>& buffer,
+                                              const std::string& tempFilename)
+{
+    // open temp file for writing
+    BamWriter tempWriter;
+    if (!tempWriter.Open(tempFilename, m_headerText, m_references)) {
+        std::cerr << "bamtools sort ERROR: could not open " << tempFilename << " for writing."
+                  << std::endl;
+        return false;
+    }
+
+    // write data, stopping at the first alignment that cannot be saved
+    std::vector<BamAlignment>::const_iterator buffIter = buffer.begin();
+    const std::vector<BamAlignment>::const_iterator buffEnd = buffer.end();
+    for (; buffIter != buffEnd; ++buffIter) {
+        if (!tempWriter.SaveAlignment(*buffIter)) {
+            std::cerr << "bamtools sort ERROR: could not write alignment to " << tempFilename
+                      << std::endl;
+            tempWriter.Close();
+            return false;
+        }
+    }
+
+    // close temp file & return success
+    tempWriter.Close();
+    return true;
+}
+
+// ---------------------------------------------
+// SortTool implementation
+
+// Allocates the settings object and registers the tool's description and
+// command-line options; m_impl stays null until Run() creates it.
+SortTool::SortTool()
+ : AbstractTool()
+ , m_settings(new SortSettings)
+ , m_impl(0)
+{
+ // set program details
+ Options::SetProgramInfo("bamtools sort", "sorts a BAM file",
+ "[-in <filename>] [-out <filename>] [sortOptions]");
+
+ // set up options
+ // (each value option is bound to a Has* flag and a value field of m_settings)
+ OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
+ Options::AddValueOption("-in", "BAM filename", "the input BAM file", "",
+ m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts,
+ Options::StandardIn());
+ Options::AddValueOption("-out", "BAM filename", "the output BAM file", "",
+ m_settings->HasOutputBamFilename, m_settings->OutputBamFilename,
+ IO_Opts, Options::StandardOut());
+
+ OptionGroup* SortOpts = Options::CreateOptionGroup("Sorting Methods");
+ Options::AddOption("-byname", "sort by alignment name", m_settings->IsSortingByName, SortOpts);
+
+ // NOTE(review): -mem is registered here, but no code in this file reads
+ // MaxBufferMemory; only -n affects how the input is chunked
+ OptionGroup* MemOpts = Options::CreateOptionGroup("Memory Settings");
+ Options::AddValueOption("-n", "count", "max number of alignments per tempfile", "",
+ m_settings->HasMaxBufferCount, m_settings->MaxBufferCount, MemOpts,
+ SORT_DEFAULT_MAX_BUFFER_COUNT);
+ Options::AddValueOption("-mem", "Mb", "max memory to use", "", m_settings->HasMaxBufferMemory,
+ m_settings->MaxBufferMemory, MemOpts, SORT_DEFAULT_MAX_BUFFER_MEMORY);
+}
+
+SortTool::~SortTool()
+{
+    // free the settings object, then the implementation object (still null
+    // if Run() was never called); each pointer is zeroed after its delete
+    delete m_settings;
+    m_settings = 0;
+    delete m_impl;
+    m_impl = 0;
+}
+
+int SortTool::Help()
+{
+    // print the usage text held by Options; this always reports success (0)
+    Options::DisplayHelp();
+    return 0;
+}
+
+int SortTool::Run(int argc, char* argv[])
+{
+    // parse command line arguments
+    Options::Parse(argc, argv, 1);
+
+    // create the implementation around the settings object
+    m_impl = new SortToolPrivate(m_settings);
+
+    // run it and translate the bool outcome into an exit status (0 = success)
+    const bool succeeded = m_impl->Run();
+    return succeeded ? 0 : 1;
+}
diff --git a/src/toolkit/bamtools_sort.h b/src/toolkit/bamtools_sort.h
new file mode 100644
index 0000000..2ceb12a
--- /dev/null
+++ b/src/toolkit/bamtools_sort.h
@@ -0,0 +1,38 @@
+// ***************************************************************************
+// bamtools_sort.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 7 April 2011 (DB)
+// ---------------------------------------------------------------------------
+// Sorts a BAM file
+// ***************************************************************************
+
+#ifndef BAMTOOLS_SORT_H
+#define BAMTOOLS_SORT_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// Front end of the 'sort' tool; the settings and implementation types are
+// only forward-declared here and defined in bamtools_sort.cpp.
+class SortTool : public AbstractTool
+{
+
+    // ctor, dtor & tool entry points
+public:
+    SortTool();
+    ~SortTool();
+    int Help();
+    int Run(int argc, char* argv[]);
+
+    // opaque helper types and the pointers that hold them
+private:
+    struct SortSettings;
+    class SortToolPrivate;
+
+    SortSettings* m_settings;
+    SortToolPrivate* m_impl;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_SORT_H
diff --git a/src/toolkit/bamtools_split.cpp b/src/toolkit/bamtools_split.cpp
new file mode 100644
index 0000000..f303b49
--- /dev/null
+++ b/src/toolkit/bamtools_split.cpp
@@ -0,0 +1,750 @@
+// ***************************************************************************
+// bamtools_split.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 24 July 2013 (DB)
+// ---------------------------------------------------------------------------
+// Splits a BAM file on user-specified property, creating a new BAM output
+// file for each value found
+// ***************************************************************************
+
+#include "bamtools_split.h"
+
+#include <api/BamConstants.h>
+#include <api/BamReader.h>
+#include <api/BamWriter.h>
+#include <utils/bamtools_options.h>
+#include <utils/bamtools_variant.h>
+using namespace BamTools;
+
+#include <cstddef>
+#include <ctime>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+// string constants
+// Filename fragments inserted between the output stub and ".bam".
+// The REF_ and TAG_ tokens are defaults that -refPrefix / -tagPrefix can replace.
+static const std::string SPLIT_MAPPED_TOKEN = ".MAPPED";
+static const std::string SPLIT_UNMAPPED_TOKEN = ".UNMAPPED";
+static const std::string SPLIT_PAIRED_TOKEN = ".PAIRED_END";
+static const std::string SPLIT_SINGLE_TOKEN = ".SINGLE_END";
+static const std::string SPLIT_REFERENCE_TOKEN = ".REF_";
+static const std::string SPLIT_TAG_TOKEN = ".TAG_";
+
+// Returns the current local time as a filename-friendly string, e.g.
+// "Wed_Jun_30_21:49:08_1993".
+//
+// ctime() output ends with '\n'; that newline used to survive into the result
+// (only ' ' was replaced), which put a line break into output filenames. The
+// trailing whitespace is now stripped and all remaining whitespace becomes '_'.
+// If ctime() yields nothing usable, the numeric time value is used instead.
+std::string GetTimestampString()
+{
+    static const char* const WHITESPACE = " \t\r\n";
+
+    // get human readable timestamp (ctime() may return a null pointer)
+    time_t currentTime;
+    time(&currentTime);
+    const char* rawTime = ctime(&currentTime);
+    std::string timeString(rawTime ? rawTime : "");
+
+    // drop the trailing newline (and any other trailing whitespace)
+    const std::size_t lastKept = timeString.find_last_not_of(WHITESPACE);
+    timeString.erase(lastKept == std::string::npos ? 0 : lastKept + 1);
+
+    // fallback: numeric timestamp
+    if (timeString.empty()) {
+        std::stringstream fallbackStream;
+        fallbackStream << static_cast<long>(currentTime);
+        return fallbackStream.str();
+    }
+
+    // convert remaining whitespace to '_'
+    std::size_t found = timeString.find_first_of(WHITESPACE);
+    while (found != std::string::npos) {
+        timeString[found] = '_';
+        found = timeString.find_first_of(WHITESPACE, found + 1);
+    }
+    return timeString;
+}
+
+// returns copy of filename without extension
+// (so /path/to/file.txt becomes /path/to/file )
+//
+// Only a '.' inside the last path component counts as the start of an
+// extension, so "/data/v1.2/reads" is returned unchanged instead of being cut
+// down to "/data/v1". A name without any '.' is also returned unchanged.
+std::string RemoveFilenameExtension(const std::string& filename)
+{
+    const std::size_t dotFound = filename.rfind('.');
+    if (dotFound == std::string::npos) return filename;
+
+    // a dot that precedes the last path separator belongs to a directory name
+    const std::size_t separatorFound = filename.find_last_of("/\\");
+    if (separatorFound != std::string::npos && dotFound < separatorFound) return filename;
+
+    return filename.substr(0, dotFound);
+}
+
+} // namespace BamTools
+
+// ---------------------------------------------
+// SplitSettings implementation
+
+// Plain holder for the values that SplitTool's constructor binds to the
+// command-line options.
+struct SplitTool::SplitSettings
+{
+
+ // flags
+ // (Has* flags are registered together with the matching string below;
+ // IsSplittingTag is the flag registered with -tag / TagToSplit)
+ bool HasInputFilename;
+ bool HasCustomOutputStub;
+ bool HasCustomRefPrefix;
+ bool HasCustomTagPrefix;
+ bool HasListTagDelimiter;
+ bool IsSplittingMapped;
+ bool IsSplittingPaired;
+ bool IsSplittingReference;
+ bool IsSplittingTag;
+
+ // string args
+ std::string CustomOutputStub;
+ std::string CustomRefPrefix;
+ std::string CustomTagPrefix;
+ std::string InputFilename;
+ std::string TagToSplit;
+ std::string ListTagDelimiter;
+
+ // constructor
+ // all flags start false; input defaults to the Options stdin name and the
+ // list-tag delimiter to "--"; the remaining strings start empty
+ SplitSettings()
+ : HasInputFilename(false)
+ , HasCustomOutputStub(false)
+ , HasCustomRefPrefix(false)
+ , HasCustomTagPrefix(false)
+ , HasListTagDelimiter(false)
+ , IsSplittingMapped(false)
+ , IsSplittingPaired(false)
+ , IsSplittingReference(false)
+ , IsSplittingTag(false)
+ , InputFilename(Options::StandardIn())
+ , ListTagDelimiter("--")
+ {}
+};
+
+// ---------------------------------------------
+// SplitToolPrivate declaration
+
+// Implementation of the split: Run() picks one Split*() method based on the
+// settings, and each method writes the alignments into one BamWriter per
+// distinct value of the chosen property.
+class SplitTool::SplitToolPrivate
+{
+
+ // ctor & dtor
+public:
+ SplitToolPrivate(SplitTool::SplitSettings* settings)
+ : m_settings(settings)
+ {}
+
+ // closes the reader; m_settings is not deleted here
+ ~SplitToolPrivate()
+ {
+ m_reader.Close();
+ }
+
+ // 'public' interface
+public:
+ bool Run();
+
+ // internal methods
+private:
+ // close & delete BamWriters in map
+ template <typename T>
+ void CloseWriters(std::map<T, BamWriter*>& writers);
+ // calculate output stub based on IO args given
+ void DetermineOutputFilenameStub();
+ // open our BamReader
+ bool OpenReader();
+ // split alignments in BAM file based on isMapped property
+ bool SplitMapped();
+ // split alignments in BAM file based on isPaired property
+ bool SplitPaired();
+ // split alignments in BAM file based on refID property
+ bool SplitReference();
+ // finds first alignment and calls corresponding SplitTagImpl<>
+ // depending on tag type
+ bool SplitTag();
+
+public:
+ // handles list-type tags
+ // ('al' is the alignment SplitTag() stopped on)
+ template <typename T>
+ bool SplitListTagImpl(BamAlignment& al);
+
+ // handles single-value tags
+ // ('al' is the alignment SplitTag() stopped on)
+ template <typename T>
+ bool SplitTagImpl(BamAlignment& al);
+
+ // data members
+private:
+ SplitTool::SplitSettings* m_settings;
+ std::string m_outputFilenameStub; // prefix of every output filename
+ BamReader m_reader;
+ std::string m_header; // input header text, passed to every writer
+ RefVector m_references; // input reference data, passed to every writer
+};
+
+void SplitTool::SplitToolPrivate::DetermineOutputFilenameStub()
+{
+
+    // a user-supplied output stub always wins
+    if (m_settings->HasCustomOutputStub) {
+        m_outputFilenameStub = m_settings->CustomOutputStub;
+        return;
+    }
+
+    // next choice: the user-supplied input filename, minus its extension
+    if (m_settings->HasInputFilename) {
+        m_outputFilenameStub = RemoveFilenameExtension(m_settings->InputFilename);
+        return;
+    }
+
+    // neither -stub nor -in was given (input is coming from STDIN):
+    // generate the stub from a timestamp
+    m_outputFilenameStub = GetTimestampString();
+}
+
+bool SplitTool::SplitToolPrivate::OpenReader()
+{
+
+    // attempt to open the input BAM file
+    const bool opened = m_reader.Open(m_settings->InputFilename);
+    if (opened) {
+        // keep the file 'metadata' that every output writer will need
+        m_header = m_reader.GetHeaderText();
+        m_references = m_reader.GetReferenceData();
+        return true;
+    }
+
+    // could not open: report & fail
+    std::cerr << "bamtools split ERROR: could not open BAM file: " << m_settings->InputFilename
+              << std::endl;
+    return false;
+}
+
+bool SplitTool::SplitToolPrivate::Run()
+{
+
+    // work out the output filename stub, then open the input
+    DetermineOutputFilenameStub();
+    const bool readerReady = OpenReader();
+    if (!readerReady) return false;
+
+    // the first requested property wins, checked in this order:
+    // mapped, paired, reference, tag
+    if (m_settings->IsSplittingMapped) {
+        return SplitMapped();
+    } else if (m_settings->IsSplittingPaired) {
+        return SplitPaired();
+    } else if (m_settings->IsSplittingReference) {
+        return SplitReference();
+    } else if (m_settings->IsSplittingTag) {
+        return SplitTag();
+    }
+
+    // no property was specified
+    std::cerr
+        << "bamtools split ERROR: no property given to split on... " << std::endl
+        << "Please use -mapped, -paired, -reference, or -tag TAG to specify desired split behavior."
+        << std::endl;
+    return false;
+}
+
+// Writes every alignment to "<stub>.MAPPED.bam" or "<stub>.UNMAPPED.bam"
+// depending on al.IsMapped(); a writer is opened the first time its value is
+// seen. Returns false if an output file cannot be opened. In that case the
+// failed writer is deleted and the writers already open are closed and
+// deleted (previously they were leaked and never closed).
+bool SplitTool::SplitToolPrivate::SplitMapped()
+{
+
+    // set up splitting data structure
+    std::map<bool, BamWriter*> outputFiles;
+    std::map<bool, BamWriter*>::iterator writerIter;
+
+    // iterate through alignments
+    BamAlignment al;
+    BamWriter* writer;
+    bool isCurrentAlignmentMapped;
+    while (m_reader.GetNextAlignment(al)) {
+
+        // see if bool value exists
+        isCurrentAlignmentMapped = al.IsMapped();
+        writerIter = outputFiles.find(isCurrentAlignmentMapped);
+
+        // if no writer associated with this value
+        if (writerIter == outputFiles.end()) {
+
+            // open new BamWriter
+            const std::string outputFilename =
+                m_outputFilenameStub +
+                (isCurrentAlignmentMapped ? SPLIT_MAPPED_TOKEN : SPLIT_UNMAPPED_TOKEN) + ".bam";
+            writer = new BamWriter;
+            if (!writer->Open(outputFilename, m_header, m_references)) {
+                std::cerr << "bamtools split ERROR: could not open " << outputFilename
+                          << " for writing." << std::endl;
+
+                // release the writer that failed and everything opened so far
+                delete writer;
+                CloseWriters(outputFiles);
+                return false;
+            }
+
+            // store in map
+            outputFiles.insert(std::make_pair(isCurrentAlignmentMapped, writer));
+        }
+
+        // else grab corresponding writer
+        else
+            writer = (*writerIter).second;
+
+        // store alignment in proper BAM output file
+        if (writer) writer->SaveAlignment(al);
+    }
+
+    // clean up BamWriters
+    CloseWriters(outputFiles);
+
+    // return success
+    return true;
+}
+
+// Writes every alignment to "<stub>.PAIRED_END.bam" or "<stub>.SINGLE_END.bam"
+// depending on al.IsPaired(); a writer is opened the first time its value is
+// seen. Returns false if an output file cannot be opened. In that case the
+// failed writer is deleted and the writers already open are closed and
+// deleted (previously they were leaked and never closed).
+bool SplitTool::SplitToolPrivate::SplitPaired()
+{
+
+    // set up splitting data structure
+    std::map<bool, BamWriter*> outputFiles;
+    std::map<bool, BamWriter*>::iterator writerIter;
+
+    // iterate through alignments
+    BamAlignment al;
+    BamWriter* writer;
+    bool isCurrentAlignmentPaired;
+    while (m_reader.GetNextAlignment(al)) {
+
+        // see if bool value exists
+        isCurrentAlignmentPaired = al.IsPaired();
+        writerIter = outputFiles.find(isCurrentAlignmentPaired);
+
+        // if no writer associated with this value
+        if (writerIter == outputFiles.end()) {
+
+            // open new BamWriter
+            const std::string outputFilename =
+                m_outputFilenameStub +
+                (isCurrentAlignmentPaired ? SPLIT_PAIRED_TOKEN : SPLIT_SINGLE_TOKEN) + ".bam";
+            writer = new BamWriter;
+            if (!writer->Open(outputFilename, m_header, m_references)) {
+                // message prefix now matches the other split errors ("bamtools")
+                std::cerr << "bamtools split ERROR: could not open " << outputFilename
+                          << " for writing." << std::endl;
+
+                // release the writer that failed and everything opened so far
+                delete writer;
+                CloseWriters(outputFiles);
+                return false;
+            }
+
+            // store in map
+            outputFiles.insert(std::make_pair(isCurrentAlignmentPaired, writer));
+        }
+
+        // else grab corresponding writer
+        else
+            writer = (*writerIter).second;
+
+        // store alignment in proper BAM output file
+        if (writer) writer->SaveAlignment(al);
+    }
+
+    // clean up BamWriters
+    CloseWriters(outputFiles);
+
+    // return success
+    return true;
+}
+
+// Writes every alignment to "<stub><refPrefix><refName>.bam", one file per
+// reference ID; alignments with RefID -1 go to the "unmapped" file.
+// Returns false if an output file cannot be opened or if an alignment carries
+// a reference ID that is not in the input's reference data (previously that
+// case raised an uncaught std::out_of_range). On failure all writers opened
+// so far are closed and deleted instead of being leaked.
+bool SplitTool::SplitToolPrivate::SplitReference()
+{
+
+    // set up splitting data structure
+    std::map<int32_t, BamWriter*> outputFiles;
+    std::map<int32_t, BamWriter*>::iterator writerIter;
+
+    // determine reference prefix
+    std::string refPrefix = SPLIT_REFERENCE_TOKEN;
+    if (m_settings->HasCustomRefPrefix) refPrefix = m_settings->CustomRefPrefix;
+
+    // make sure prefix starts with '.'
+    const std::size_t dotFound = refPrefix.find('.');
+    if (dotFound != 0) refPrefix = std::string(1, '.') + refPrefix;
+
+    // iterate through alignments
+    BamAlignment al;
+    BamWriter* writer;
+    int32_t currentRefId;
+    while (m_reader.GetNextAlignment(al)) {
+
+        // see if a writer already exists for this reference ID
+        currentRefId = al.RefID;
+        writerIter = outputFiles.find(currentRefId);
+
+        // if no writer associated with this value
+        if (writerIter == outputFiles.end()) {
+
+            // fetch reference name for ID
+            std::string refName;
+            if (currentRefId == -1)
+                refName = "unmapped";
+            else if (currentRefId >= 0 &&
+                     static_cast<std::size_t>(currentRefId) < m_references.size())
+                refName = m_references[currentRefId].RefName;
+            else {
+                std::cerr << "bamtools split ERROR: alignment has invalid reference ID: "
+                          << currentRefId << std::endl;
+                CloseWriters(outputFiles);
+                return false;
+            }
+
+            // construct new output filename
+            const std::string outputFilename = m_outputFilenameStub + refPrefix + refName + ".bam";
+
+            // open new BamWriter
+            writer = new BamWriter;
+            if (!writer->Open(outputFilename, m_header, m_references)) {
+                std::cerr << "bamtools split ERROR: could not open " << outputFilename
+                          << " for writing." << std::endl;
+
+                // release the writer that failed and everything opened so far
+                delete writer;
+                CloseWriters(outputFiles);
+                return false;
+            }
+
+            // store in map
+            outputFiles.insert(std::make_pair(currentRefId, writer));
+        }
+
+        // else grab corresponding writer
+        else
+            writer = (*writerIter).second;
+
+        // store alignment in proper BAM output file
+        if (writer) writer->SaveAlignment(al);
+    }
+
+    // clean up BamWriters
+    CloseWriters(outputFiles);
+
+    // return success
+    return true;
+}
+
+// finds first alignment and calls corresponding SplitTagImpl<>() depending on tag type
+//
+// Alignments read before the first one that carries the tag are consumed here
+// and are not passed on to any writer. Returns whatever the selected
+// Split*TagImpl<>() returns, false for an unsupported tag or array element
+// type, and true when no alignment carries the tag at all.
+bool SplitTool::SplitToolPrivate::SplitTag()
+{
+
+ // iterate through alignments, until we hit TAG
+ BamAlignment al;
+ while (m_reader.GetNextAlignment(al)) {
+
+ // look for tag in this alignment and get tag type
+ char tagType(0);
+ if (!al.GetTagType(m_settings->TagToSplit, tagType)) continue;
+
+ // request split method based on tag type
+ // pass it the current alignment found
+ switch (tagType) {
+
+ case (Constants::BAM_TAG_TYPE_INT8):
+ return SplitTagImpl<int8_t>(al);
+ case (Constants::BAM_TAG_TYPE_INT16):
+ return SplitTagImpl<int16_t>(al);
+ case (Constants::BAM_TAG_TYPE_INT32):
+ return SplitTagImpl<int32_t>(al);
+ case (Constants::BAM_TAG_TYPE_UINT8):
+ return SplitTagImpl<uint8_t>(al);
+ case (Constants::BAM_TAG_TYPE_UINT16):
+ return SplitTagImpl<uint16_t>(al);
+ case (Constants::BAM_TAG_TYPE_UINT32):
+ return SplitTagImpl<uint32_t>(al);
+ case (Constants::BAM_TAG_TYPE_FLOAT):
+ return SplitTagImpl<float>(al);
+
+ // all character-based tag types are split on their string value
+ case (Constants::BAM_TAG_TYPE_ASCII):
+ case (Constants::BAM_TAG_TYPE_STRING):
+ case (Constants::BAM_TAG_TYPE_HEX):
+ return SplitTagImpl<std::string>(al);
+
+ case (Constants::BAM_TAG_TYPE_ARRAY): {
+
+ // array tags are dispatched on their element type; the 'continue'
+ // resumes the enclosing while loop with the next alignment
+ char arrayTagType(0);
+ if (!al.GetArrayTagType(m_settings->TagToSplit, arrayTagType)) continue;
+ switch (arrayTagType) {
+ case (Constants::BAM_TAG_TYPE_INT8):
+ return SplitListTagImpl<int8_t>(al);
+ case (Constants::BAM_TAG_TYPE_INT16):
+ return SplitListTagImpl<int16_t>(al);
+ case (Constants::BAM_TAG_TYPE_INT32):
+ return SplitListTagImpl<int32_t>(al);
+ case (Constants::BAM_TAG_TYPE_UINT8):
+ return SplitListTagImpl<uint8_t>(al);
+ case (Constants::BAM_TAG_TYPE_UINT16):
+ return SplitListTagImpl<uint16_t>(al);
+ case (Constants::BAM_TAG_TYPE_UINT32):
+ return SplitListTagImpl<uint32_t>(al);
+ case (Constants::BAM_TAG_TYPE_FLOAT):
+ return SplitListTagImpl<float>(al);
+ default:
+ std::cerr
+ << "bamtools split ERROR: array tag has unsupported element type: "
+ << arrayTagType << std::endl;
+ return false;
+ }
+ }
+
+ default:
+ std::cerr << "bamtools split ERROR: unknown tag type encountered: " << tagType
+ << std::endl;
+ return false;
+ }
+ }
+
+ // tag not found, but that's not an error - return success
+ return true;
+}
+
+// --------------------------------------------------------------------------------
+// template method implementation
+// *Technical Note* - use of template methods declared & defined in ".cpp" file
+// goes against normal practices, but works here because these
+// are purely internal (no one can call from outside this file)
+
+// closes and destroys every BamWriter held in 'writers', then empties the map
+template <typename T>
+void SplitTool::SplitToolPrivate::CloseWriters(std::map<T, BamWriter*>& writers)
+{
+
+    typename std::map<T, BamWriter*>::iterator entry = writers.begin();
+    while (entry != writers.end()) {
+
+        // null entries are skipped; everything else is closed, then deleted
+        BamWriter* const current = entry->second;
+        if (current != 0) {
+            current->Close();
+            delete current;
+        }
+        ++entry;
+    }
+
+    // clear the container (destroying the items doesn't remove them)
+    writers.clear();
+}
+
+// filename-friendly view of one list element: 8-bit integers are widened so
+// that they are streamed as numbers rather than as raw characters; every
+// other type is passed through unchanged
+template <typename T>
+static inline const T& SplitListElementForFilename(const T& value)
+{
+    return value;
+}
+static inline int SplitListElementForFilename(const signed char& value)
+{
+    return value;
+}
+static inline unsigned int SplitListElementForFilename(const unsigned char& value)
+{
+    return value;
+}
+
+// handle list-type tags
+//
+// 'al' arrives holding the first alignment that carries the tag (found by
+// SplitTag()). Each alignment is written to
+// "<stub><tagPrefix><TAG>_<v1><delim><v2>....bam", where the label is built
+// from the tag's values; alignments without the tag go to the "none" label.
+//
+// Fixes relative to the previous version:
+//  - the alignment passed in is now written too (it used to be overwritten
+//    by the first GetNextAlignment() call and was lost)
+//  - int8/uint8 elements appear as numbers in the filename
+//  - on an open failure the writers are closed and deleted, not leaked
+template <typename T>
+bool SplitTool::SplitToolPrivate::SplitListTagImpl(BamAlignment& al)
+{
+
+    typedef std::vector<T> TagValueType;
+    typedef std::map<std::string, BamWriter*> WriterMap;
+    typedef typename WriterMap::iterator WriterMapIterator;
+
+    // set up splitting data structure
+    WriterMap outputFiles;
+    WriterMapIterator writerIter;
+
+    // determine tag prefix
+    std::string tagPrefix = SPLIT_TAG_TOKEN;
+    if (m_settings->HasCustomTagPrefix) tagPrefix = m_settings->CustomTagPrefix;
+
+    // make sure prefix starts with '.'
+    const std::size_t dotFound = tagPrefix.find('.');
+    if (dotFound != 0) tagPrefix = std::string(1, '.') + tagPrefix;
+
+    const std::string tag = m_settings->TagToSplit;
+    BamWriter* writer;
+    TagValueType currentValue;
+
+    // do/while: handle the alignment we were given before reading the next one
+    do {
+
+        std::string listTagLabel;
+        if (!al.GetTag(tag, currentValue))
+            listTagLabel = "none";
+        else {
+            // make list label from tag data
+            std::stringstream listTagLabelStream;
+            typename TagValueType::const_iterator tagValueIter = currentValue.begin();
+            typename TagValueType::const_iterator tagValueEnd = currentValue.end();
+            for (; tagValueIter != tagValueEnd; ++tagValueIter)
+                listTagLabelStream << SplitListElementForFilename(*tagValueIter)
+                                   << m_settings->ListTagDelimiter;
+            listTagLabel = listTagLabelStream.str();
+            if (!listTagLabel.empty())
+                listTagLabel = listTagLabel.substr(
+                    0, listTagLabel.size() -
+                           m_settings->ListTagDelimiter.size());  // pop last delimiter
+        }
+
+        // lookup writer for label
+        writerIter = outputFiles.find(listTagLabel);
+
+        // if not found, create one
+        if (writerIter == outputFiles.end()) {
+
+            // open new BamWriter
+            std::stringstream outputFilenameStream;
+            outputFilenameStream << m_outputFilenameStub << tagPrefix << tag << '_' << listTagLabel
+                                 << ".bam";
+            writer = new BamWriter;
+            if (!writer->Open(outputFilenameStream.str(), m_header, m_references)) {
+                std::cerr << "bamtools split ERROR: could not open " << outputFilenameStream.str()
+                          << " for writing." << std::endl;
+
+                // release the writer that failed and everything opened so far
+                delete writer;
+                CloseWriters(outputFiles);
+                return false;
+            }
+
+            // store in map
+            outputFiles.insert(std::make_pair(listTagLabel, writer));
+        }
+
+        // else grab existing writer
+        else
+            writer = (*writerIter).second;
+
+        // store alignment in proper BAM output file
+        if (writer) writer->SaveAlignment(al);
+
+    } while (m_reader.GetNextAlignment(al));
+
+    // clean up & return success
+    CloseWriters(outputFiles);
+    return true;
+}
+
+// filename-friendly view of a tag value: 8-bit integers are widened so that
+// they are streamed as numbers rather than as raw characters; every other
+// type (including std::string) is passed through unchanged
+template <typename T>
+static inline const T& SplitTagValueForFilename(const T& value)
+{
+    return value;
+}
+static inline int SplitTagValueForFilename(const signed char& value)
+{
+    return value;
+}
+static inline unsigned int SplitTagValueForFilename(const unsigned char& value)
+{
+    return value;
+}
+
+// handle the single-value tags
+//
+// 'al' arrives holding the first alignment that carries the tag (found by
+// SplitTag()). Each alignment whose tag can be read as T is written to
+// "<stub><tagPrefix><TAG>_<value>.bam"; alignments without such a tag are
+// skipped.
+//
+// Fixes relative to the previous version:
+//  - int8/uint8 values appear as numbers in the filename
+//  - on an open failure the writers are closed and deleted, not leaked
+//  - both error messages now read "bamtools split ERROR"
+template <typename T>
+bool SplitTool::SplitToolPrivate::SplitTagImpl(BamAlignment& al)
+{
+
+    typedef T TagValueType;
+    typedef std::map<TagValueType, BamWriter*> WriterMap;
+    typedef typename WriterMap::iterator WriterMapIterator;
+
+    // set up splitting data structure
+    WriterMap outputFiles;
+
+    // determine tag prefix
+    std::string tagPrefix = SPLIT_TAG_TOKEN;
+    if (m_settings->HasCustomTagPrefix) tagPrefix = m_settings->CustomTagPrefix;
+
+    // make sure prefix starts with '.'
+    const std::size_t dotFound = tagPrefix.find('.');
+    if (dotFound != 0) tagPrefix = std::string(1, '.') + tagPrefix;
+
+    // local variables
+    const std::string tag = m_settings->TagToSplit;
+    TagValueType currentValue;
+
+    // do/while: handle the alignment we were given before reading the next one
+    do {
+
+        // skip if this alignment doesn't have TAG
+        // ('continue' jumps to the loop condition, i.e. reads the next alignment)
+        if (!al.GetTag(tag, currentValue)) continue;
+
+        // look up tag value in map
+        BamWriter* writer = 0;
+        const WriterMapIterator writerIter = outputFiles.find(currentValue);
+
+        // grab the writer already associated with this value...
+        if (writerIter != outputFiles.end())
+            writer = (*writerIter).second;
+
+        // ...or open a new one
+        else {
+            std::stringstream outputFilenameStream;
+            outputFilenameStream << m_outputFilenameStub << tagPrefix << tag << '_'
+                                 << SplitTagValueForFilename(currentValue) << ".bam";
+            const std::string outputFilename = outputFilenameStream.str();
+
+            writer = new BamWriter;
+            if (!writer->Open(outputFilename, m_header, m_references)) {
+                std::cerr << "bamtools split ERROR: could not open " << outputFilename
+                          << " for writing." << std::endl;
+
+                // release the writer that failed and everything opened so far
+                delete writer;
+                CloseWriters(outputFiles);
+                return false;
+            }
+
+            // store in map
+            outputFiles.insert(std::make_pair(currentValue, writer));
+        }
+
+        // store alignment in proper BAM output file
+        if (writer) writer->SaveAlignment(al);
+
+    } while (m_reader.GetNextAlignment(al));
+
+    // clean up BamWriters
+    CloseWriters(outputFiles);
+
+    // return success
+    return true;
+}
+
+// ---------------------------------------------
+// SplitTool implementation
+
+// Allocates the settings object and registers the tool's description and
+// command-line options; m_impl stays null until Run() creates it.
+SplitTool::SplitTool()
+ : AbstractTool()
+ , m_settings(new SplitSettings)
+ , m_impl(0)
+{
+ // set program details
+ // NOTE(review): the usage string below does not mention -tagPrefix or
+ // -tagListDelim, although both options are registered further down
+ const std::string name = "bamtools split";
+ const std::string description =
+ "splits a BAM file on user-specified property, creating a new BAM output file for each "
+ "value found";
+ const std::string args =
+ "[-in <filename>] [-stub <filename stub>] < -mapped | -paired | -reference [-refPrefix "
+ "<prefix>] | -tag <TAG> > ";
+ Options::SetProgramInfo(name, description, args);
+
+ // set up options
+ // (each value option is bound to a flag and a string field of m_settings)
+ OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
+ Options::AddValueOption("-in", "BAM filename", "the input BAM file", "",
+ m_settings->HasInputFilename, m_settings->InputFilename, IO_Opts,
+ Options::StandardIn());
+ Options::AddValueOption(
+ "-refPrefix", "string",
+ "custom prefix for splitting by references. Currently files end with REF_<refName>.bam. "
+ "This option allows you to replace \"REF_\" with a prefix of your choosing.",
+ "", m_settings->HasCustomRefPrefix, m_settings->CustomRefPrefix, IO_Opts);
+ Options::AddValueOption(
+ "-tagPrefix", "string",
+ "custom prefix for splitting by tags. Current files end with TAG_<tagname>_<tagvalue>.bam. "
+ "This option allows you to replace \"TAG_\" with a prefix of your choosing.",
+ "", m_settings->HasCustomTagPrefix, m_settings->CustomTagPrefix, IO_Opts);
+ Options::AddValueOption("-stub", "filename stub",
+ "prefix stub for output BAM files (default behavior is to use input "
+ "filename, without .bam extension, as stub). If input is stdin and no "
+ "stub provided, a timestamp is generated as the stub.",
+ "", m_settings->HasCustomOutputStub, m_settings->CustomOutputStub,
+ IO_Opts);
+ Options::AddValueOption("-tagListDelim", "string",
+ "delimiter used to separate values in the filenames generated from "
+ "splitting on list-type tags [--]",
+ "", m_settings->HasListTagDelimiter, m_settings->ListTagDelimiter,
+ IO_Opts);
+
+ // the property to split on; Run() uses the first one set, checked in the
+ // order mapped, paired, reference, tag
+ OptionGroup* SplitOpts = Options::CreateOptionGroup("Split Options");
+ Options::AddOption("-mapped", "split mapped/unmapped alignments", m_settings->IsSplittingMapped,
+ SplitOpts);
+ Options::AddOption("-paired", "split single-end/paired-end alignments",
+ m_settings->IsSplittingPaired, SplitOpts);
+ Options::AddOption("-reference", "split alignments by reference",
+ m_settings->IsSplittingReference, SplitOpts);
+ Options::AddValueOption("-tag", "tag name",
+ "splits alignments based on all values of TAG encountered (i.e. -tag "
+ "RG creates a BAM file for each read group in original BAM file)",
+ "", m_settings->IsSplittingTag, m_settings->TagToSplit, SplitOpts);
+}
+
+SplitTool::~SplitTool()
+{
+    // free the settings object, then the implementation object (still null
+    // if Run() was never called); each pointer is zeroed after its delete
+    delete m_settings;
+    m_settings = 0;
+    delete m_impl;
+    m_impl = 0;
+}
+
+int SplitTool::Help()
+{
+    // print the usage text held by Options; this always reports success (0)
+    Options::DisplayHelp();
+    return 0;
+}
+
+int SplitTool::Run(int argc, char* argv[])
+{
+    // parse command line arguments
+    Options::Parse(argc, argv, 1);
+
+    // create the implementation around the settings object
+    m_impl = new SplitToolPrivate(m_settings);
+
+    // run it and translate the bool outcome into an exit status (0 = success)
+    const bool succeeded = m_impl->Run();
+    return succeeded ? 0 : 1;
+}
diff --git a/src/toolkit/bamtools_split.h b/src/toolkit/bamtools_split.h
new file mode 100644
index 0000000..c246b40
--- /dev/null
+++ b/src/toolkit/bamtools_split.h
@@ -0,0 +1,39 @@
+// ***************************************************************************
+// bamtools_split.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 7 April 2011 (DB)
+// ---------------------------------------------------------------------------
+// Splits a BAM file on user-specified property, creating a new BAM output
+// file for each value found
+// ***************************************************************************
+
+#ifndef BAMTOOLS_SPLIT_H
+#define BAMTOOLS_SPLIT_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// Toolkit sub-tool that splits a BAM file on a user-specified property,
+// creating a new BAM output file for each value found.
+class SplitTool : public AbstractTool
+{
+
+    // ctor & dtor
+public:
+    SplitTool();
+    ~SplitTool();
+
+    // AbstractTool interface
+public:
+    int Help();
+    int Run(int argc, char* argv[]);
+
+    // settings & implementation objects (only forward-declared in this header)
+private:
+    struct SplitSettings;
+    SplitSettings* m_settings;
+
+    class SplitToolPrivate;
+    SplitToolPrivate* m_impl;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_SPLIT_H
diff --git a/src/toolkit/bamtools_stats.cpp b/src/toolkit/bamtools_stats.cpp
new file mode 100644
index 0000000..3575aac
--- /dev/null
+++ b/src/toolkit/bamtools_stats.cpp
@@ -0,0 +1,330 @@
+// ***************************************************************************
+// bamtools_stats.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 December 2012
+// ---------------------------------------------------------------------------
+// Prints general alignment statistics for BAM file(s).
+// ***************************************************************************
+
+#include "bamtools_stats.h"
+
+#include <api/BamMultiReader.h>
+#include <utils/bamtools_options.h>
+using namespace BamTools;
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <numeric>
+#include <string>
+#include <vector>
+
+// ---------------------------------------------
+// StatsSettings implementation
+
+// Command-line settings for the stats tool; the fields are bound to options
+// in the StatsTool constructor.
+struct StatsTool::StatsSettings
+{
+
+    // flags
+    bool HasInput;                    // -in was given
+    bool HasInputFilelist;            // -list was given
+    bool IsShowingInsertSizeSummary;  // -insert was given
+
+    // filenames
+    std::vector<std::string> InputFiles;  // input BAM files
+    std::string InputFilelist;            // file listing input BAM files, one per line
+
+    // constructor: every flag starts cleared
+    StatsSettings()
+        : HasInput(false)
+        , HasInputFilelist(false)
+        , IsShowingInsertSizeSummary(false)
+    {}
+};
+
+// ---------------------------------------------
+// StatsToolPrivate implementation
+
+// Implementation object for the stats tool: reads the input alignments,
+// accumulates counters and prints the report.
+struct StatsTool::StatsToolPrivate
+{
+
+    // ctor & dtor
+public:
+    StatsToolPrivate(StatsTool::StatsSettings* _settings);
+    ~StatsToolPrivate() {}
+
+    // 'public' interface
+public:
+    bool Run();
+
+    // internal methods
+private:
+    bool CalculateMedian(std::vector<int>& data, double& median);
+    void PrintStats();
+    void ProcessAlignment(const BamAlignment& al);
+
+    // data members
+private:
+    StatsTool::StatsSettings* m_settings;  // not deleted by this object's destructor
+
+    // counters updated once per alignment by ProcessAlignment()
+    unsigned int m_numReads;
+    unsigned int m_numPaired;
+    unsigned int m_numProperPair;
+    unsigned int m_numMapped;
+    unsigned int m_numBothMatesMapped;
+    unsigned int m_numForwardStrand;
+    unsigned int m_numReverseStrand;
+    unsigned int m_numFirstMate;
+    unsigned int m_numSecondMate;
+    unsigned int m_numSingletons;
+    unsigned int m_numFailedQC;
+    unsigned int m_numDuplicates;
+
+    // absolute insert sizes, collected only when the insert-size summary is enabled
+    std::vector<int> m_insertSizes;
+};
+
+// Stores the settings pointer and starts every counter at zero.
+StatsTool::StatsToolPrivate::StatsToolPrivate(StatsTool::StatsSettings* settings)
+    : m_settings(settings)
+    , m_numReads(0)
+    , m_numPaired(0)
+    , m_numProperPair(0)
+    , m_numMapped(0)
+    , m_numBothMatesMapped(0)
+    , m_numForwardStrand(0)
+    , m_numReverseStrand(0)
+    , m_numFirstMate(0)
+    , m_numSecondMate(0)
+    , m_numSingletons(0)
+    , m_numFailedQC(0)
+    , m_numDuplicates(0)
+{
+    // pre-allocate room for insert sizes up front
+    m_insertSizes.reserve(100000);
+}
+
+// Computes the median of 'data' and stores it in 'median'.
+//
+// 'median' is of type double because in the case of an even number of data elements,
+// the result is the average of the middle 2 elements. The elements of 'data' are
+// reordered as a side effect.
+//
+// Returns false (leaving 'median' untouched) when 'data' is empty, true otherwise.
+bool StatsTool::StatsToolPrivate::CalculateMedian(std::vector<int>& data, double& median)
+{
+
+    // skip if data empty
+    if (data.empty()) return false;
+
+    // partition around the upper-middle element
+    const std::size_t middleIndex = data.size() / 2;
+    const std::vector<int>::iterator upperMiddle = data.begin() + middleIndex;
+    std::nth_element(data.begin(), upperMiddle, data.end());
+
+    // odd number of elements: the middle element is the median
+    if ((data.size() % 2) != 0) {
+        median = static_cast<double>(*upperMiddle);
+        return true;
+    }
+
+    // even number of elements: after the partition every element left of upperMiddle
+    // is <= *upperMiddle, so the lower-middle value is the largest of that prefix
+    // (the prefix is non-empty because size() >= 2 here)
+    const std::vector<int>::iterator lowerMiddle = std::max_element(data.begin(), upperMiddle);
+    median = (static_cast<double>(*upperMiddle) + static_cast<double>(*lowerMiddle)) / 2.0;
+    return true;
+}
+
+// Returns 'count' as a percentage of 'total', or 0 when 'total' is zero
+// (so an empty input prints "0%" rather than "nan%").
+static float PercentOf(unsigned int count, unsigned int total)
+{
+    if (total == 0) return 0.0f;
+    return (static_cast<float>(count) / total) * 100;
+}
+
+// print BAM file alignment stats
+//
+// Writes the accumulated counters to stdout. Pair-specific lines are printed only
+// when at least one paired alignment was seen; insert-size lines only when the
+// insert-size summary is enabled. Note: computing the median reorders m_insertSizes.
+void StatsTool::StatsToolPrivate::PrintStats()
+{
+
+    std::cout << std::endl;
+    std::cout << "**********************************************" << std::endl;
+    std::cout << "Stats for BAM file(s): " << std::endl;
+    std::cout << "**********************************************" << std::endl;
+    std::cout << std::endl;
+    std::cout << "Total reads: " << m_numReads << std::endl;
+    std::cout << "Mapped reads: " << m_numMapped << "\t("
+              << PercentOf(m_numMapped, m_numReads) << "%)" << std::endl;
+    std::cout << "Forward strand: " << m_numForwardStrand << "\t("
+              << PercentOf(m_numForwardStrand, m_numReads) << "%)" << std::endl;
+    std::cout << "Reverse strand: " << m_numReverseStrand << "\t("
+              << PercentOf(m_numReverseStrand, m_numReads) << "%)" << std::endl;
+    std::cout << "Failed QC: " << m_numFailedQC << "\t("
+              << PercentOf(m_numFailedQC, m_numReads) << "%)" << std::endl;
+    std::cout << "Duplicates: " << m_numDuplicates << "\t("
+              << PercentOf(m_numDuplicates, m_numReads) << "%)" << std::endl;
+    std::cout << "Paired-end reads: " << m_numPaired << "\t("
+              << PercentOf(m_numPaired, m_numReads) << "%)" << std::endl;
+
+    if (m_numPaired != 0) {
+        std::cout << "'Proper-pairs': " << m_numProperPair << "\t("
+                  << PercentOf(m_numProperPair, m_numPaired) << "%)" << std::endl;
+        std::cout << "Both pairs mapped: " << m_numBothMatesMapped << "\t("
+                  << PercentOf(m_numBothMatesMapped, m_numPaired) << "%)" << std::endl;
+        std::cout << "Read 1: " << m_numFirstMate << std::endl;
+        std::cout << "Read 2: " << m_numSecondMate << std::endl;
+        std::cout << "Singletons: " << m_numSingletons << "\t("
+                  << PercentOf(m_numSingletons, m_numPaired) << "%)" << std::endl;
+    }
+
+    if (m_settings->IsShowingInsertSizeSummary) {
+
+        // the average is only reported when at least one insert size was stored
+        if (!m_insertSizes.empty()) {
+            const double avgInsertSize =
+                std::accumulate(m_insertSizes.begin(), m_insertSizes.end(), 0.0) /
+                static_cast<double>(m_insertSizes.size());
+            std::cout << "Average insert size (absolute value): " << avgInsertSize << std::endl;
+        }
+
+        // CalculateMedian() returns false for empty data, which suppresses this line
+        double medianInsertSize = 0.0;
+        if (CalculateMedian(m_insertSizes, medianInsertSize))
+            std::cout << "Median insert size (absolute value): " << medianInsertSize << std::endl;
+    }
+    std::cout << std::endl;
+}
+
+// use current input alignment to update BAM file alignment stats
+void StatsTool::StatsToolPrivate::ProcessAlignment(const BamAlignment& al)
+{
+
+    // every alignment counts toward the total
+    ++m_numReads;
+
+    // increment counters for pairing-independent flags
+    const bool isMapped = al.IsMapped();
+    if (al.IsDuplicate()) ++m_numDuplicates;
+    if (al.IsFailedQC()) ++m_numFailedQC;
+    if (isMapped) ++m_numMapped;
+
+    // exactly one of the two strand counters is bumped
+    if (al.IsReverseStrand()) {
+        ++m_numReverseStrand;
+    } else {
+        ++m_numForwardStrand;
+    }
+
+    // everything below applies to paired-end alignments only
+    if (!al.IsPaired()) return;
+    ++m_numPaired;
+
+    // first mate/second mate counters
+    const bool isFirstMate = al.IsFirstMate();
+    if (isFirstMate) ++m_numFirstMate;
+    if (al.IsSecondMate()) ++m_numSecondMate;
+
+    // a mapped alignment either has a mapped mate or is a singleton
+    if (isMapped) {
+        if (al.IsMateMapped()) {
+            ++m_numBothMatesMapped;
+        } else {
+            ++m_numSingletons;
+        }
+    }
+
+    // check for explicit proper pair flag
+    if (al.IsProperPair()) ++m_numProperPair;
+
+    // store (absolute) insert size for first mate, skipping zero values
+    if (m_settings->IsShowingInsertSizeSummary && isFirstMate && (al.InsertSize != 0)) {
+        m_insertSizes.push_back(abs(al.InsertSize));
+    }
+}
+
+// Resolves the list of input files, reads every alignment from them and prints the
+// statistics. Returns false if the file list or the BAM files cannot be opened.
+bool StatsTool::StatsToolPrivate::Run()
+{
+
+    // set to default input if none provided
+    if (!m_settings->HasInput && !m_settings->HasInputFilelist)
+        m_settings->InputFiles.push_back(Options::StandardIn());
+
+    // add files in the filelist to the input file list
+    if (m_settings->HasInputFilelist) {
+
+        std::ifstream filelist(m_settings->InputFilelist.c_str(), std::ios::in);
+        if (!filelist.is_open()) {
+            std::cerr << "bamtools stats ERROR: could not open input BAM file list... Aborting."
+                      << std::endl;
+            return false;
+        }
+
+        std::string line;
+        while (std::getline(filelist, line)) {
+
+            // tolerate CRLF line endings in the list file
+            if (!line.empty() && line[line.size() - 1] == '\r') line.erase(line.size() - 1);
+
+            // a blank line is not a filename; skipping it avoids a spurious open failure
+            if (line.empty()) continue;
+
+            m_settings->InputFiles.push_back(line);
+        }
+    }
+
+    // open the BAM files
+    BamMultiReader reader;
+    if (!reader.Open(m_settings->InputFiles)) {
+        std::cerr << "bamtools stats ERROR: could not open input BAM file(s)... Aborting."
+                  << std::endl;
+        reader.Close();
+        return false;
+    }
+
+    // plow through alignments, keeping track of stats
+    BamAlignment al;
+    while (reader.GetNextAlignmentCore(al))
+        ProcessAlignment(al);
+    reader.Close();
+
+    // print stats & exit
+    PrintStats();
+    return true;
+}
+
+// ---------------------------------------------
+// StatsTool implementation
+
+// Creates the settings object and registers the tool's program info and options.
+// The implementation object is not created here; m_impl starts as 0.
+StatsTool::StatsTool()
+    : AbstractTool()
+    , m_settings(new StatsSettings)
+    , m_impl(0)
+{
+    // set program details
+    Options::SetProgramInfo(
+        "bamtools stats", "prints general alignment statistics",
+        "[-in <filename> -in <filename> ... | -list <filelist>] [statsOptions]");
+
+    // input options: each one is bound to a flag and a value in the settings object
+    OptionGroup* inputGroup = Options::CreateOptionGroup("Input & Output");
+    Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput,
+                            m_settings->InputFiles, inputGroup, Options::StandardIn());
+    Options::AddValueOption("-list", "filename", "the input BAM file list, one line per file", "",
+                            m_settings->HasInputFilelist, m_settings->InputFilelist, inputGroup);
+
+    // optional extra statistics
+    OptionGroup* extraStatsGroup = Options::CreateOptionGroup("Additional Stats");
+    Options::AddOption("-insert", "summarize insert size data",
+                       m_settings->IsShowingInsertSizeSummary, extraStatsGroup);
+}
+
+// Destroys the settings object and, if Run() created one, the implementation object.
+StatsTool::~StatsTool()
+{
+    // m_impl is still 0 when Run() was never called; deleting a null pointer is a no-op
+    delete m_settings;
+    m_settings = 0;
+
+    delete m_impl;
+    m_impl = 0;
+}
+
+// Prints the option help registered for this tool; always returns 0.
+int StatsTool::Help()
+{
+    Options::DisplayHelp();
+    return 0;
+}
+
+// Parses the command line into the settings object, then builds and runs the
+// implementation. Returns 0 when the implementation reports success, 1 otherwise.
+int StatsTool::Run(int argc, char* argv[])
+{
+
+    // parse command line arguments
+    Options::Parse(argc, argv, 1);
+
+    // release any implementation left over from an earlier call so it is not leaked
+    delete m_impl;
+    m_impl = 0;
+
+    // initialize StatsTool with settings
+    m_impl = new StatsToolPrivate(m_settings);
+
+    // run StatsTool, return success/fail
+    return m_impl->Run() ? 0 : 1;
+}
diff --git a/src/toolkit/bamtools_stats.h b/src/toolkit/bamtools_stats.h
new file mode 100644
index 0000000..dd2e25b
--- /dev/null
+++ b/src/toolkit/bamtools_stats.h
@@ -0,0 +1,38 @@
+// ***************************************************************************
+// bamtools_stats.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 7 April 2011
+// ---------------------------------------------------------------------------
+// Prints general alignment statistics for BAM file(s)
+// ***************************************************************************
+
+#ifndef BAMTOOLS_STATS_H
+#define BAMTOOLS_STATS_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// Toolkit sub-tool that prints general alignment statistics.
+class StatsTool : public AbstractTool
+{
+
+    // ctor & dtor
+public:
+    StatsTool();
+    ~StatsTool();
+
+    // AbstractTool interface
+public:
+    int Help();
+    int Run(int argc, char* argv[]);
+
+    // settings & implementation objects (only forward-declared in this header)
+private:
+    struct StatsSettings;
+    StatsSettings* m_settings;
+
+    struct StatsToolPrivate;
+    StatsToolPrivate* m_impl;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_STATS_H
diff --git a/src/toolkit/bamtools_tool.h b/src/toolkit/bamtools_tool.h
new file mode 100644
index 0000000..31ddcd7
--- /dev/null
+++ b/src/toolkit/bamtools_tool.h
@@ -0,0 +1,36 @@
+// ***************************************************************************
+// bamtools_tool.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 2 June 2010
+// ---------------------------------------------------------------------------
+// Base class for all other BamTools sub-tools
+// All derived classes must provide Help() and Run() methods
+// ***************************************************************************
+
+#ifndef BAMTOOLS_ABSTRACTTOOL_H
+#define BAMTOOLS_ABSTRACTTOOL_H
+
+#include <string>
+
+namespace BamTools {
+
+// Base class for all BamTools sub-tools.
+class AbstractTool
+{
+
+    // ctor & (virtual) dtor
+public:
+    AbstractTool() {}
+    virtual ~AbstractTool() {}
+
+    // interface every derived tool must implement
+public:
+    virtual int Help() = 0;
+    virtual int Run(int argc, char* argv[]) = 0;
+
+    // derived classes should also provide:
+    // static std::string Description();
+    // static std::string Name();
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_ABSTRACTTOOL_H
diff --git a/src/toolkit/bamtools_version.h.in b/src/toolkit/bamtools_version.h.in
new file mode 100644
index 0000000..34a6d2e
--- /dev/null
+++ b/src/toolkit/bamtools_version.h.in
@@ -0,0 +1,20 @@
+// ***************************************************************************
+// bamtools_version.h.in (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides version information for the BamTools toolkit.
+// ***************************************************************************
+
+#ifndef BAMTOOLS_VERSION_H
+#define BAMTOOLS_VERSION_H
+
+// CMake uses this file as a template to generate "bamtools_version.h".
+// The @...@ placeholders name variables set in the build system; the constants below take those values.
+#define BAMTOOLS_VERSION_MAJOR @BamTools_VERSION_MAJOR@
+#define BAMTOOLS_VERSION_MINOR @BamTools_VERSION_MINOR@
+#define BAMTOOLS_VERSION_PATCH @BamTools_VERSION_PATCH@
+
+#endif // BAMTOOLS_VERSION_H
+
diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt
new file mode 100644
index 0000000..93cb62d
--- /dev/null
+++ b/src/utils/CMakeLists.txt
@@ -0,0 +1,29 @@
+# ==========================
+# BamTools CMakeLists.txt
+# (c) 2010 Derek Barnett
+#
+# src/utils/
+# ==========================
+
+# list include paths (headers of the BamTools API under src/api)
+include_directories( ${BamTools_SOURCE_DIR}/src/api )
+
+# add compiler definitions
+add_definitions( -DBAMTOOLS_UTILS_LIBRARY ) # (for proper exporting of library symbols)
+
+# create BamTools-utils as a static library
+add_library( BamTools-utils STATIC
+ bamtools_fasta.cpp
+ bamtools_options.cpp
+ bamtools_pileup_engine.cpp
+ bamtools_utilities.cpp
+ )
+
+# link BamTools-utils library with BamTools automatically
+target_link_libraries( BamTools-utils BamTools )
+
+# set BamTools-utils library properties (output file name and prefix)
+set_target_properties( BamTools-utils PROPERTIES
+ OUTPUT_NAME bamtools-utils
+ PREFIX "lib"
+ )
diff --git a/src/utils/bamtools_fasta.cpp b/src/utils/bamtools_fasta.cpp
new file mode 100644
index 0000000..be55c43
--- /dev/null
+++ b/src/utils/bamtools_fasta.cpp
@@ -0,0 +1,643 @@
+// ***************************************************************************
+// bamtools_fasta.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 9 March 2012 (DB)
+// ---------------------------------------------------------------------------
+// Provides FASTA reading/indexing functionality.
+// ***************************************************************************
+
+#include "utils/bamtools_fasta.h"
+using namespace BamTools;
+
+#include <cctype>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+// Implementation object behind Fasta: owns the FASTA stream, the optional index
+// stream and the in-memory index entries.
+struct Fasta::FastaPrivate
+{
+
+    // one index entry per FASTA record
+    struct FastaIndexData
+    {
+        std::string Name;    // sequence name parsed from the header line
+        int32_t Length;      // number of bases in the sequence
+        int64_t Offset;      // file offset of the first base (just after the header line)
+        int32_t LineLength;  // printable characters per sequence line
+        int32_t
+            ByteLength;  // LineLength + newline character(s) - varies on OS where file was generated
+    };
+
+    // data members
+    FILE* Stream;  // FASTA file
+    bool IsOpen;   // true while Stream is open
+
+    FILE* IndexStream;  // index file
+    bool HasIndex;      // true once index data was loaded or created successfully
+    bool IsIndexOpen;   // true while IndexStream is open
+
+    std::vector<FastaIndexData> Index;
+
+    // ctor & dtor
+    FastaPrivate();
+    ~FastaPrivate();
+
+    // 'public' API methods
+    bool Close();
+    bool CreateIndex(const std::string& indexFilename);
+    bool GetBase(const int& refId, const int& position, char& base);
+    bool GetSequence(const int& refId, const int& start, const int& stop, std::string& sequence);
+    bool Open(const std::string& filename, const std::string& indexFilename);
+
+    // internal methods
+private:
+    void Chomp(char* sequence);
+    bool GetNameFromHeader(const std::string& header, std::string& name);
+    bool GetNextHeader(std::string& header);
+    bool GetNextSequence(std::string& sequence);
+    bool LoadIndexData();
+    bool Rewind();
+    bool WriteIndexData();
+};
+
+// Starts with no file open; both stream pointers are nulled so they never hold
+// indeterminate values.
+Fasta::FastaPrivate::FastaPrivate()
+    : Stream(0)
+    , IsOpen(false)
+    , IndexStream(0)
+    , HasIndex(false)
+    , IsIndexOpen(false)
+{}
+
+// Closes any streams that are still open.
+Fasta::FastaPrivate::~FastaPrivate()
+{
+    Close();
+}
+
+// remove any trailing newlines
+//
+// Overwrites every trailing LF (10) or CR (13) character of the
+// null-terminated string 'sequence' with a terminator, in place.
+void Fasta::FastaPrivate::Chomp(char* sequence)
+{
+
+    static const int CHAR_LF = 10;
+    static const int CHAR_CR = 13;
+
+    // walk backwards from the last character, index of which is length - 1
+    int lastIndex = static_cast<int>(strlen(sequence)) - 1;
+    for (; lastIndex >= 0; --lastIndex) {
+        const char current = sequence[lastIndex];
+        if ((current != CHAR_LF) && (current != CHAR_CR)) break;
+        sequence[lastIndex] = 0;
+    }
+}
+
+// Closes the FASTA stream and the index stream (whichever are open) and discards
+// the in-memory index. Always returns true.
+bool Fasta::FastaPrivate::Close()
+{
+
+    // close fasta file
+    if (IsOpen) {
+        fclose(Stream);
+        Stream = 0;
+        IsOpen = false;
+    }
+
+    // close index file whenever it is open, even if its data never loaded
+    // (previously the stream leaked when HasIndex was false)
+    if (IsIndexOpen) {
+        fclose(IndexStream);
+        IndexStream = 0;
+        IsIndexOpen = false;
+    }
+
+    // drop the in-memory index so it cannot be applied to a different file opened later
+    HasIndex = false;
+    Index.clear();
+
+    // return success
+    return true;
+}
+
+// Scans the open FASTA file, rebuilds the in-memory index and writes it to
+// 'indexFilename'.
+//
+// The line length and byte length stored in every entry are measured on the first
+// sequence line of the file only. Returns true when the index was built and written
+// completely; on any failure returns false and leaves HasIndex cleared.
+bool Fasta::FastaPrivate::CreateIndex(const std::string& indexFilename)
+{
+
+    // check that file is open
+    if (!IsOpen) {
+        std::cerr << "FASTA error : cannot create index, FASTA file not open" << std::endl;
+        return false;
+    }
+
+    // rewind FASTA file
+    if (!Rewind()) {
+        std::cerr << "FASTA error : could not rewind FASTA file" << std::endl;
+        return false;
+    }
+
+    // clear out prior index data; it is not usable again until the rebuild succeeds
+    Index.clear();
+    HasIndex = false;
+
+    // -------------------------------------------
+    // calculate lineLength & byteLength
+
+    int lineLength = 0;
+    int byteLength = 0;
+
+    // skip over header
+    char buffer[1024];
+    if (fgets(buffer, 1024, Stream) == 0) {
+        std::cerr << "FASTA error : could not read from file" << std::endl;
+        return false;
+    }
+    if (feof(Stream)) return false;
+    if (buffer[0] != '>') {
+        std::cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << std::endl;
+        return false;
+    }
+
+    // read in first line of sequence
+    // (fgetc() returns int: keeping it as int is what makes EOF distinguishable
+    //  from data bytes, whatever the signedness of plain char)
+    int c = fgetc(Stream);
+    while ((c != EOF) && (c != '\n')) {
+        ++byteLength;
+        if (isgraph(c)) ++lineLength;
+        c = fgetc(Stream);
+    }
+    ++byteLength;  // store newline
+
+    // rewind FASTA file
+    if (!Rewind()) {
+        std::cerr << "FASTA error : could not rewind FASTA file" << std::endl;
+        return false;
+    }
+
+    // iterate through fasta entries
+    std::string header;
+    std::string sequence;
+    while (GetNextHeader(header)) {
+
+        // ---------------------------
+        // build index entry data
+        FastaIndexData data;
+
+        // store file offset of beginning of DNA sequence (after header)
+        data.Offset = ftell64(Stream);
+
+        // parse header, store sequence name in data.Name
+        if (!GetNameFromHeader(header, data.Name)) {
+            std::cerr << "FASTA error : could not parse read name from FASTA header" << std::endl;
+            return false;
+        }
+
+        // retrieve FASTA sequence
+        if (!GetNextSequence(sequence)) {
+            std::cerr << "FASTA error : could not read in next sequence from FASTA file"
+                      << std::endl;
+            return false;
+        }
+
+        // store sequence length & line/byte lengths
+        data.Length = sequence.length();
+        data.LineLength = lineLength;
+        data.ByteLength = byteLength;
+
+        // store index entry
+        Index.push_back(data);
+    }
+
+    // an index stream left open by Open() is read-only; close it instead of leaking it
+    if (IsIndexOpen) {
+        fclose(IndexStream);
+        IndexStream = 0;
+        IsIndexOpen = false;
+    }
+
+    // without a filename there is nowhere to write the index
+    if (indexFilename.empty()) {
+        std::cerr << "FASTA error : cannot create index, no index filename provided" << std::endl;
+        return false;
+    }
+
+    // open index file
+    IndexStream = fopen(indexFilename.c_str(), "wb");
+    if (!IndexStream) {
+        std::cerr << "FASTA error : Could not open " << indexFilename << " for writing."
+                  << std::endl;
+        return false;
+    }
+    IsIndexOpen = true;
+
+    // write index data, then close the file whether or not the write succeeded
+    // (fclose() flushes, so its result is part of the write status)
+    const bool written = WriteIndexData();
+    const bool closed = (fclose(IndexStream) == 0);
+    IndexStream = 0;
+    IsIndexOpen = false;
+    if (!written || !closed) {
+        std::cerr << "FASTA error : could not write index data to " << indexFilename << std::endl;
+        return false;
+    }
+
+    // return success status
+    HasIndex = true;
+    return true;
+}
+
+// Retrieves the base at 0-based 'position' of the sequence with 0-based id 'refId'.
+//
+// With a loaded index the base is read by seeking directly into the file; otherwise
+// the file is scanned sequentially from the start. Returns false (leaving 'base'
+// untouched) if the file is not open, the ids/positions are out of range, or the
+// file cannot be read.
+bool Fasta::FastaPrivate::GetBase(const int& refId, const int& position, char& base)
+{
+
+    // make sure FASTA file is open
+    if (!IsOpen) {
+        std::cerr << "FASTA error : file not open for reading" << std::endl;
+        return false;
+    }
+
+    // use index if available
+    if (HasIndex && !Index.empty()) {
+
+        // validate reference id
+        if ((refId < 0) || (refId >= (int)Index.size())) {
+            std::cerr << "FASTA error: invalid refId specified: " << refId << std::endl;
+            return false;
+        }
+
+        // retrieve reference index data
+        const FastaIndexData& referenceData = Index.at(refId);
+
+        // validate position (valid positions are 0 .. Length-1)
+        if ((position < 0) || (position >= referenceData.Length)) {
+            std::cerr << "FASTA error: invalid position specified: " << position << std::endl;
+            return false;
+        }
+
+        // a non-positive line length would make the division below invalid
+        if (referenceData.LineLength <= 0) {
+            std::cerr << "FASTA error : invalid line length in index" << std::endl;
+            return false;
+        }
+
+        // calculate seek position & attempt jump
+        const int64_t lines = position / referenceData.LineLength;
+        const int64_t lineOffset = position % referenceData.LineLength;
+        const int64_t seekTo =
+            referenceData.Offset + (lines * referenceData.ByteLength) + lineOffset;
+        if (fseek64(Stream, seekTo, SEEK_SET) != 0) {
+            std::cerr << "FASTA error : could not seek in file" << std::endl;
+            return false;
+        }
+
+        // read the base; EOF here means the index does not match the file
+        const int c = getc(Stream);
+        if (c == EOF) {
+            std::cerr << "FASTA error : could not read base from file" << std::endl;
+            return false;
+        }
+
+        // set base & return success
+        base = static_cast<char>(c);
+        return true;
+    }
+
+    // else plow through sequentially
+
+    // negative ids/positions can never match an entry
+    if (refId < 0) {
+        std::cerr << "FASTA error: invalid refId specified: " << refId << std::endl;
+        return false;
+    }
+    if (position < 0) {
+        std::cerr << "FASTA error: invalid position specified: " << position << std::endl;
+        return false;
+    }
+
+    // rewind FASTA file
+    if (!Rewind()) {
+        std::cerr << "FASTA error : could not rewind FASTA file" << std::endl;
+        return false;
+    }
+
+    // read entries 0..refId, stopping as soon as the file runs out
+    std::string header;
+    std::string sequence;
+    for (int currentId = 0; currentId <= refId; ++currentId) {
+        if (!GetNextHeader(header) || !GetNextSequence(sequence)) {
+            std::cerr << "FASTA error: invalid refId specified: " << refId << std::endl;
+            return false;
+        }
+    }
+
+    // get desired base from sequence
+    if (static_cast<std::size_t>(position) >= sequence.length()) {
+        std::cerr << "FASTA error: invalid position specified: " << position << std::endl;
+        return false;
+    }
+    base = sequence[position];
+    return true;
+}
+
+// Extracts the sequence name from a FASTA header line: the first run of
+// non-whitespace characters after the leading '>'. Whitespace here means space,
+// tab, LF and CR. Returns false (with an error message) if no such run exists.
+bool Fasta::FastaPrivate::GetNameFromHeader(const std::string& header, std::string& name)
+{
+
+    // characters that delimit the name: space (32), tab (9), LF (10), CR (13)
+    static const char* const delimiters = " \t\n\r";
+
+    // get rid of the leading greater than sign
+    const std::string remainder = header.substr(1);
+
+    // locate the first non-whitespace character ...
+    std::string::size_type nameBegin = remainder.find_first_not_of(delimiters);
+    if (nameBegin == std::string::npos) nameBegin = remainder.size();
+
+    // ... and the first whitespace character after it
+    std::string::size_type nameEnd = remainder.find_first_of(delimiters, nameBegin);
+    if (nameEnd == std::string::npos) nameEnd = remainder.size();
+
+    // nothing but whitespace (or nothing at all) after the '>'
+    if (nameBegin == nameEnd) {
+        std::cerr << "FASTA error : could not parse read name from FASTA header" << std::endl;
+        return false;
+    }
+
+    name = remainder.substr(nameBegin, nameEnd - nameBegin).c_str();
+    return true;
+}
+
+// Reads the next line from the FASTA stream into 'header' (including its line
+// terminator, if any) and checks that it starts with '>'.
+//
+// Header lines longer than the read buffer are read completely, so the stream is
+// always left at the start of the line that follows the header.
+// Returns false if the stream is not open, is at EOF, cannot be read, or the line
+// is not a header.
+bool Fasta::FastaPrivate::GetNextHeader(std::string& header)
+{
+
+    // validate input stream
+    if (!IsOpen || feof(Stream)) return false;
+
+    // read in (the first chunk of) the header line
+    char buffer[1024];
+    if (fgets(buffer, 1024, Stream) == 0) {
+        std::cerr << "FASTA error : could not read from file" << std::endl;
+        return false;
+    }
+
+    // make sure it's a FASTA header
+    if (buffer[0] != '>') {
+        std::cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << std::endl;
+        return false;
+    }
+
+    // keep reading until the newline has been consumed; a missing newline with no
+    // further data means the header was the last line of the file
+    std::string line(buffer);
+    while (line[line.size() - 1] != '\n') {
+        if (fgets(buffer, 1024, Stream) == 0) break;
+        line += buffer;
+    }
+
+    // import line contents to header string
+    header = line;
+
+    // return success
+    return true;
+}
+
+// Reads sequence lines from the current stream position up to (not including) the
+// next header line or EOF, strips their line terminators and stores the
+// concatenation in 'sequence'.
+// Returns false, leaving 'sequence' untouched, if the stream is not open, is
+// already at EOF, or a read fails.
+bool Fasta::FastaPrivate::GetNextSequence(std::string& sequence)
+{
+
+    // validate input stream
+    if (!IsOpen || feof(Stream)) return false;
+
+    // read in sequence
+    char buffer[1024];
+    std::string collected;
+    while (true) {
+
+        // peek at the next character; fgetc() returns int, and keeping it as int is
+        // what makes EOF distinguishable from the data byte 0xFF
+        const int ch = fgetc(Stream);
+        if (ch == EOF) {
+            if (ferror(Stream)) {
+                std::cerr << "FASTA error : could not read from file" << std::endl;
+                return false;
+            }
+            break;
+        }
+        ungetc(ch, Stream);
+
+        // next entry starts here
+        if (ch == '>') break;
+
+        if (fgets(buffer, 1024, Stream) == 0) {
+            std::cerr << "FASTA error : could not read from file" << std::endl;
+            return false;
+        }
+
+        Chomp(buffer);
+        collected += buffer;
+    }
+
+    // import collected contents to sequence string
+    sequence = collected;
+
+    // return success
+    return true;
+}
+
+// Retrieves the bases from 0-based 'start' through 'stop' (inclusive) of the
+// sequence with 0-based id 'refId'.
+//
+// A request is rejected when start < 0, start > stop, or stop exceeds the sequence
+// length; a range that reaches past the last base is truncated at the end of the
+// sequence. With a loaded index the file is positioned by seeking, otherwise it is
+// scanned sequentially from the start. Returns false, leaving 'sequence' untouched,
+// on any failure.
+bool Fasta::FastaPrivate::GetSequence(const int& refId, const int& start, const int& stop,
+                                      std::string& sequence)
+{
+
+    // make sure FASTA file is open
+    if (!IsOpen) {
+        std::cerr << "FASTA error : file not open for reading" << std::endl;
+        return false;
+    }
+
+    // use index if available
+    if (HasIndex && !Index.empty()) {
+
+        // validate reference id
+        if ((refId < 0) || (refId >= (int)Index.size())) {
+            std::cerr << "FASTA error: invalid refId specified: " << refId << std::endl;
+            return false;
+        }
+
+        // retrieve reference index data
+        const FastaIndexData& referenceData = Index.at(refId);
+
+        // validate start/stop positions
+        if ((start < 0) || (start > stop) || (stop > referenceData.Length)) {
+            std::cerr << "FASTA error: invalid start/stop positions specified: " << start << ", "
+                      << stop << std::endl;
+            return false;
+        }
+
+        // seek to beginning of sequence data
+        if (fseek64(Stream, referenceData.Offset, SEEK_SET) != 0) {
+            std::cerr << "FASTA error : could not seek in file" << std::endl;
+            return false;
+        }
+
+        // retrieve full sequence
+        std::string fullSequence;
+        if (!GetNextSequence(fullSequence)) {
+            std::cerr << "FASTA error : could not retrieve sequence from FASTA file" << std::endl;
+            return false;
+        }
+
+        // the index may not match the file; substr() would throw past the end
+        if (static_cast<std::size_t>(start) > fullSequence.length()) {
+            std::cerr << "FASTA error: invalid start/stop positions specified: " << start << ", "
+                      << stop << std::endl;
+            return false;
+        }
+
+        // set sub-sequence & return success
+        const int seqLength = (stop - start) + 1;
+        sequence = fullSequence.substr(start, seqLength);
+        return true;
+    }
+
+    // else plow through sequentially
+
+    // apply the same argument checks as the indexed path
+    if (refId < 0) {
+        std::cerr << "FASTA error: invalid refId specified: " << refId << std::endl;
+        return false;
+    }
+    if ((start < 0) || (start > stop)) {
+        std::cerr << "FASTA error: invalid start/stop positions specified: " << start << ", "
+                  << stop << std::endl;
+        return false;
+    }
+
+    // rewind FASTA file
+    if (!Rewind()) {
+        std::cerr << "FASTA error : could not rewind FASTA file" << std::endl;
+        return false;
+    }
+
+    // read entries 0..refId, stopping as soon as the file runs out
+    std::string header;
+    std::string fullSequence;
+    for (int currentId = 0; currentId <= refId; ++currentId) {
+        if (!GetNextHeader(header) || !GetNextSequence(fullSequence)) {
+            std::cerr << "FASTA error: invalid refId specified: " << refId << std::endl;
+            return false;
+        }
+    }
+
+    // get desired substring from sequence
+    if (static_cast<std::size_t>(stop) > fullSequence.length()) {
+        std::cerr << "FASTA error: invalid start/stop positions specified: " << start << ", "
+                  << stop << std::endl;
+        return false;
+    }
+    const int seqLength = (stop - start) + 1;
+    sequence = fullSequence.substr(start, seqLength);
+    return true;
+}
+
+// Replaces the in-memory index with the entries read from the open index stream.
+//
+// Each line holds five whitespace-separated fields: name, length, offset, line
+// length, byte length. Reading stops at EOF or at the first empty line.
+// Returns false if the index stream is not open, a line cannot be read, or a line
+// cannot be parsed; in the latter two cases HasIndex is cleared.
+bool Fasta::FastaPrivate::LoadIndexData()
+{
+
+    // skip if no index file available
+    if (!IsIndexOpen) return false;
+
+    // clear any prior index data
+    Index.clear();
+
+    char buffer[1024];
+    while (true) {
+
+        // peek at the next character; fgetc() returns int, and keeping it as int is
+        // what makes EOF distinguishable from data bytes
+        const int c = fgetc(IndexStream);
+        if (c == EOF) {
+            if (ferror(IndexStream)) {
+                std::cerr << "FASTA LoadIndexData() error : could not read from index file"
+                          << std::endl;
+                HasIndex = false;
+                return false;
+            }
+            break;
+        }
+        if (c == '\n') break;
+        ungetc(c, IndexStream);
+
+        // read line from index file
+        if (fgets(buffer, 1024, IndexStream) == 0) {
+            std::cerr << "FASTA LoadIndexData() error : could not read from index file"
+                      << std::endl;
+            HasIndex = false;
+            return false;
+        }
+
+        // retrieve fasta index data from line (fresh stream per line, so no error
+        // state carries over from one entry to the next)
+        std::istringstream indexBuffer(buffer);
+        FastaIndexData data;
+        indexBuffer >> data.Name >> data.Length >> data.Offset >> data.LineLength >>
+            data.ByteLength;
+
+        // reject malformed lines instead of storing an entry with unset fields
+        if (indexBuffer.fail()) {
+            std::cerr << "FASTA LoadIndexData() error : could not parse index entry"
+                      << std::endl;
+            Index.clear();
+            HasIndex = false;
+            return false;
+        }
+
+        // store index entry
+        Index.push_back(data);
+    }
+
+    return true;
+}
+
+// Opens the FASTA file 'filename' and, when 'indexFilename' is not empty, the index
+// file, whose entries are then loaded.
+//
+// Anything opened by an earlier call is closed first. Returns true only if every
+// requested step succeeded. If the FASTA file opens but the index cannot be opened
+// or loaded, false is returned while the FASTA file itself stays open.
+bool Fasta::FastaPrivate::Open(const std::string& filename, const std::string& indexFilename)
+{
+
+    // release streams and index data from a previous Open() so nothing is leaked
+    // and no stale state survives a failed reopen
+    if (IsOpen) {
+        fclose(Stream);
+        Stream = 0;
+        IsOpen = false;
+    }
+    if (IsIndexOpen) {
+        fclose(IndexStream);
+        IndexStream = 0;
+        IsIndexOpen = false;
+    }
+    HasIndex = false;
+    Index.clear();
+
+    // open FASTA filename
+    Stream = fopen(filename.c_str(), "rb");
+    if (!Stream) {
+        std::cerr << "FASTA error: Could not open " << filename << " for reading" << std::endl;
+        return false;
+    }
+    IsOpen = true;
+
+    // no index requested: the FASTA file alone is enough
+    if (indexFilename.empty()) return true;
+
+    // open index file
+    IndexStream = fopen(indexFilename.c_str(), "rb");
+    if (!IndexStream) {
+        std::cerr << "FASTA error : Could not open " << indexFilename << " for reading."
+                  << std::endl;
+        return false;
+    }
+    IsIndexOpen = true;
+
+    // attempt to load index data; its outcome is the overall success status
+    HasIndex = LoadIndexData();
+    return HasIndex;
+}
+
+// Seeks the FASTA stream back to offset 0.
+// Returns false if the file is not open or the seek fails.
+bool Fasta::FastaPrivate::Rewind()
+{
+    return IsOpen && (fseek64(Stream, 0, SEEK_SET) == 0);
+}
+
+// Writes every in-memory index entry to the index stream as one tab-separated line:
+// name, length, offset, line length, byte length.
+// Returns false if the index stream is not open or any line could not be written;
+// the remaining entries are still attempted after a failed write.
+bool Fasta::FastaPrivate::WriteIndexData()
+{
+
+    // skip if no index file available
+    if (!IsIndexOpen) return false;
+
+    bool allWritten = true;
+    for (std::size_t i = 0; i < Index.size(); ++i) {
+        const FastaIndexData& entry = Index[i];
+
+        // format one entry per line
+        std::ostringstream line;
+        line << entry.Name << '\t' << entry.Length << '\t' << entry.Offset << '\t'
+             << entry.LineLength << '\t' << entry.ByteLength << '\n';
+
+        // write line to file, remembering any failure
+        if (fputs(line.str().c_str(), IndexStream) < 0) allWritten = false;
+    }
+
+    // return success status
+    return allWritten;
+}
+
+// --------------------------------
+// Fasta implementation
+
+// Creates the implementation object that all other methods forward to.
+Fasta::Fasta()
+    : d(new FastaPrivate)
+{}
+
+// Destroys the implementation object (whose destructor closes any open files).
+Fasta::~Fasta()
+{
+    delete d;
+    d = 0;
+}
+
+// Forwards to FastaPrivate::Close().
+bool Fasta::Close()
+{
+    return d->Close();
+}
+
+// Forwards to FastaPrivate::CreateIndex().
+bool Fasta::CreateIndex(const std::string& indexFilename)
+{
+    return d->CreateIndex(indexFilename);
+}
+
+// Forwards to FastaPrivate::GetBase().
+bool Fasta::GetBase(const int& refId, const int& position, char& base)
+{
+    return d->GetBase(refId, position, base);
+}
+
+// Forwards to FastaPrivate::GetSequence().
+bool Fasta::GetSequence(const int& refId, const int& start, const int& stop, std::string& sequence)
+{
+    return d->GetSequence(refId, start, stop, sequence);
+}
+
+// Forwards to FastaPrivate::Open().
+bool Fasta::Open(const std::string& filename, const std::string& indexFilename)
+{
+    return d->Open(filename, indexFilename);
+}
diff --git a/src/utils/bamtools_fasta.h b/src/utils/bamtools_fasta.h
new file mode 100644
index 0000000..3b30623
--- /dev/null
+++ b/src/utils/bamtools_fasta.h
@@ -0,0 +1,48 @@
+// ***************************************************************************
+// bamtools_fasta.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011
+// ---------------------------------------------------------------------------
+// Provides FASTA reading/indexing functionality.
+// ***************************************************************************
+
+#ifndef BAMTOOLS_FASTA_H
+#define BAMTOOLS_FASTA_H
+
+#include <string>
+#include "utils/utils_global.h"
+
+namespace BamTools {
+
+// Provides FASTA reading/indexing functionality; every method forwards to a
+// private implementation object.
+//
+// NOTE(review): no copy constructor or assignment operator is declared, while the
+// destructor deletes 'd' -- copying a Fasta object would delete 'd' twice.
+class UTILS_EXPORT Fasta
+{
+
+    // ctor & dtor
+public:
+    Fasta();
+    ~Fasta();
+
+    // file-handling methods
+public:
+    bool Close();
+    // the index filename is optional; an empty string means no index file is opened
+    bool Open(const std::string& filename, const std::string& indexFilename = std::string());
+
+    // sequence access methods
+public:
+    bool GetBase(const int& refId, const int& position, char& base);
+    bool GetSequence(const int& refId, const int& start, const int& stop, std::string& sequence);
+
+    // index-handling methods
+public:
+    bool CreateIndex(const std::string& indexFilename);
+
+    // internal implementation
+private:
+    struct FastaPrivate;
+    FastaPrivate* d;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_FASTA_H
diff --git a/src/utils/bamtools_filter_engine.h b/src/utils/bamtools_filter_engine.h
new file mode 100644
index 0000000..ed303a0
--- /dev/null
+++ b/src/utils/bamtools_filter_engine.h
@@ -0,0 +1,575 @@
+// ***************************************************************************
+// bamtools_filter_engine.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 3 May 2013
+// ---------------------------------------------------------------------------
+// Provides a generic filter engine based on filter-sets of properties,
+// with possible "rules" (compound logical expressions) to create more complex
+// queries on a data set.
+//
+// FilterEngine consists, most importantly, of :
+//
+// a list of possible properties (each tagged whether it has been 'enabled' as a filter)
+// a map of filterName => propertySet
+// queue for compound rule expression (i.e. "(filter1 AND filter2) OR !filter3" )
+//
+// Each propertySet is a list of properties enabled for this particular filter object
+//
+// Implemented as a map of propertyNames to propertyFilterValue
+// ( "property1" => pfv1
+// "property2" => pfv2
+// "property4" => pfv4
+// etc. )
+//
+// Any properties that are 'possible', via FilterEngine::addProperty(), but not enabled
+// via FilterEngine::setProperty() (in our example, say "property3"), evaluate to true
+// for any query. Meaning that if a property is not set on this filter, we don't care
+ // about it here, so it passes through OK.
+//
+// A propertyFilterValue contains a value and comparison type
+//
+// ( pfv1: Value = 50, Type = GREATER_THAN_EQUAL
+// pfv2: Value = "foo", Type = STARTS_WITH
+// pfv4: Value = "bar", Type = CONTAINS
+// etc. )
+//
+// This allows for more complex queries (than simple isEqual?) against a variety of data types.
+//
+// ***************************************************************************
+
+#ifndef BAMTOOLS_FILTER_ENGINE_H
+#define BAMTOOLS_FILTER_ENGINE_H
+
+#include "utils/bamtools_filter_properties.h"
+#include "utils/bamtools_filter_ruleparser.h"
+#include "utils/bamtools_utilities.h"
+#include "utils/utils_global.h"
+
+#include <algorithm>
+#include <iostream>
+#include <map>
+#include <queue>
+#include <sstream>
+#include <stack>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace BamTools {
+
+ // Logical operators that may combine filter sets in a rule expression.
+ // Wrapped in a struct so the enumerators are scoped (FilterCompareType::AND, ...).
+ struct UTILS_EXPORT FilterCompareType
+ {
+     enum Type
+     {
+         AND = 0,
+         NOT = 1,
+         OR = 2
+     };
+ };
+
+// -----------------------------------------------------------
+// FilterEngine
+
+ // Generic filter engine, parameterized on the type that performs the actual
+ // comparison of a query object against one PropertyFilter (see m_checker).
+ // Filters are named property sets; a rule expression over the filter names
+ // ('&', '|', '!', parentheses) decides how their results are combined.
+ template <typename FilterChecker>
+ class UTILS_EXPORT FilterEngine
+ {
+
+ // ctor & dtor
+ public:
+ // starts with no rule queue, OR as the default compare type, and the
+ // one-character operator strings "&", "|" and "!"
+ FilterEngine()
+ : m_isRuleQueueGenerated(false)
+ , m_defaultCompareType(FilterCompareType::OR)
+ , AND_OPERATOR(1, '&')
+ , OR_OPERATOR(1, '|')
+ , NOT_OPERATOR(1, '!')
+ {}
+
+ ~FilterEngine() {}
+
+ // 'filter set' methods
+ public:
+ // creates a new filter set, returns true if created, false if error or already exists
+ bool addFilter(const std::string& filterName);
+
+ // return list of current filter names
+ const std::vector<std::string> filterNames();
+
+ // 'property' methods
+ public:
+ // add a new known property name to engine
+ // returns false if the name is already known
+ bool addProperty(const std::string& propertyName);
+
+ // sets property filter (value, type) for propertyName, on a particular filter set
+ // setProperty("filter1", "mapQuality", 50, GREATER_THAN_EQUAL)
+ // returns false if filterName does not name an existing filter set
+ template <typename T>
+ bool setProperty(
+ const std::string& filterName, const std::string& propertyName, const T& value,
+ const PropertyFilterValue::ValueCompareType& type = PropertyFilterValue::EXACT);
+
+ // returns list of all properties known by FilterEngine ( any created using addProperty() )
+ const std::vector<std::string> allPropertyNames();
+
+ // returns list of property names that are 'enabled' ( only those touched by setProperty() )
+ const std::vector<std::string> enabledPropertyNames();
+
+ // 'rule' methods
+ public:
+ // sets comparison operator between filters if no rule string given
+ // default is to do an OR on each filter
+ // ( only AND and OR are accepted; any other value is ignored )
+ void setDefaultCompareType(const FilterCompareType::Type& type = FilterCompareType::OR);
+
+ // sets rule string for building expression queue
+ // if empty, a default rule is built by joining all filter names with the
+ // default compare operator
+ void setRule(const std::string& ruleString = std::string());
+
+ // token parsing (for property filter generation)
+ public:
+ // splits a user token such as ">=50" or "*SRR" into a value and a compare type
+ template <typename T>
+ static bool parseToken(const std::string& token, T& value,
+ PropertyFilterValue::ValueCompareType& type);
+
+ // query evaluation
+ public:
+ // returns the result of evaluating the rule expression against query
+ template <typename T>
+ bool check(const T& query);
+
+ // internal rule-handling methods
+ private:
+ void buildDefaultRuleString();
+ void buildRuleQueue();
+ template <typename T>
+ bool evaluateFilterRules(const T& query);
+
+ // data members
+ private:
+ // all 'filter sets'
+ FilterMap m_filters;
+
+ // all known properties ( kept sorted by name )
+ std::vector<Property> m_properties;
+
+ // infix expression of filter-set comparison rules
+ std::string m_ruleString;
+
+ // postfix expression of tokens (filterNames) and operators (as strings)
+ // built from m_ruleString; if m_ruleString is empty, a default expression
+ // is generated first using m_defaultCompareType
+ std::queue<std::string> m_ruleQueue;
+
+ // flag to test if the rule expression queue has been generated
+ bool m_isRuleQueueGenerated;
+
+ // 'default' comparison operator between filters if no rule string given
+ // changing it triggers a rebuild of m_ruleQueue ( see setDefaultCompareType() )
+ FilterCompareType::Type m_defaultCompareType;
+
+ // client-specified checking type ( provides method: bool check(PropertyFilter, T object) )
+ FilterChecker m_checker;
+
+ // token-parsing constants ( qualifier characters recognized by parseToken() )
+ static const int NOT_CHAR = (int)'!';
+ static const int EQUAL_CHAR = (int)'=';
+ static const int GREATER_THAN_CHAR = (int)'>';
+ static const int LESS_THAN_CHAR = (int)'<';
+ static const int WILDCARD_CHAR = (int)'*';
+
+ // filter evaluation constants ( operator tokens as they appear in m_ruleQueue )
+ const std::string AND_OPERATOR;
+ const std::string OR_OPERATOR;
+ const std::string NOT_OPERATOR;
+ };
+
+ // Registers an empty filter set under 'filterName'.
+ // Returns true when a new set was inserted, false when the name is already in use.
+ template <typename FilterChecker>
+ inline bool FilterEngine<FilterChecker>::addFilter(const std::string& filterName)
+ {
+     const FilterMap::value_type newEntry(filterName, PropertyFilter());
+     const std::pair<FilterMap::iterator, bool> outcome = m_filters.insert(newEntry);
+     return outcome.second;
+ }
+
+ // Adds 'propertyName' to the list of known (not yet enabled) properties.
+ // Returns false if a property with that name is already known, true otherwise.
+ //
+ // m_properties is kept sorted by name, so the position is located with a
+ // binary search and the new entry is inserted in place. This avoids copying
+ // every property name into a temporary list and re-sorting the whole vector
+ // on each call.
+ template <typename FilterChecker>
+ inline bool FilterEngine<FilterChecker>::addProperty(const std::string& propertyName)
+ {
+     const Property candidate(propertyName);
+
+     // first element that is not ordered before the candidate
+     std::vector<Property>::iterator insertPos =
+         std::lower_bound(m_properties.begin(), m_properties.end(), candidate);
+
+     // same name already present
+     if (insertPos != m_properties.end() && !(candidate < *insertPos)) return false;
+
+     m_properties.insert(insertPos, candidate);
+     return true;
+ }
+
+ // Collects the name of every property the engine knows about
+ // (enabled or not), in the order they are stored.
+ template <typename FilterChecker>
+ inline const std::vector<std::string> FilterEngine<FilterChecker>::allPropertyNames()
+ {
+     const std::vector<Property>::size_type propertyCount = m_properties.size();
+
+     std::vector<std::string> result;
+     result.reserve(propertyCount);
+     for (std::vector<Property>::size_type i = 0; i < propertyCount; ++i)
+         result.push_back(m_properties[i].Name);
+
+     return result;
+ }
+
+ // Builds the default rule string: all filter names joined by " & " when
+ // m_defaultCompareType is AND, by " | " otherwise. The result is stored in
+ // m_ruleString.
+ // Used when the user did NOT supply an explicit rule string.
+ // With no filters present the rule string is simply left empty.
+ template <typename FilterChecker>
+ inline void FilterEngine<FilterChecker>::buildDefaultRuleString()
+ {
+     // nothing to join; also avoids dereferencing begin() of an empty map
+     if (m_filters.empty()) {
+         m_ruleString.clear();
+         return;
+     }
+
+     const char* const separator =
+         (m_defaultCompareType == FilterCompareType::AND) ? " & " : " | ";
+
+     // first filter name, then " <op> name" for each remaining filter
+     std::stringstream ruleStream;
+     FilterMap::const_iterator mapIter = m_filters.begin();
+     const FilterMap::const_iterator mapEnd = m_filters.end();
+     ruleStream << (*mapIter).first;
+     for (++mapIter; mapIter != mapEnd; ++mapIter)
+         ruleStream << separator << (*mapIter).first;
+
+     m_ruleString = ruleStream.str();
+ }
+
+ // (Re)generates the postfix rule queue from m_ruleString.
+ // If no rule string is set, one is synthesized first by buildDefaultRuleString().
+ // Does nothing when no filters have been added.
+ // NOTE(review): a synthesized default stays in m_ruleString afterwards, so a later
+ // setDefaultCompareType() appears to re-parse the old default - confirm intent.
+ template <typename FilterChecker>
+ inline void FilterEngine<FilterChecker>::buildRuleQueue()
+ {
+     if (m_filters.empty()) return;
+
+     // discard whatever expression was queued before
+     m_ruleQueue = std::queue<std::string>();
+
+     // fall back to the default expression when the user gave none
+     if (m_ruleString.empty()) buildDefaultRuleString();
+
+     // infix -> postfix
+     RuleParser parser(m_ruleString);
+     parser.parse();
+     m_ruleQueue = parser.results();
+
+     // only counts as generated if parsing produced something
+     const bool hasRules = !m_ruleQueue.empty();
+     m_isRuleQueueGenerated = hasRules;
+ }
+
+ // Public entry point for testing one query object: reports the outcome of
+ // evaluating the rule expression against it.
+ template <class FilterChecker>
+ template <typename T>
+ bool FilterEngine<FilterChecker>::check(const T& query)
+ {
+     const bool passes = evaluateFilterRules(query);
+     return passes;
+ }
+
+ // Collects the names of the properties whose IsEnabled flag is set
+ // (i.e. those that setProperty() has touched), in stored order.
+ template <typename FilterChecker>
+ inline const std::vector<std::string> FilterEngine<FilterChecker>::enabledPropertyNames()
+ {
+     const std::vector<Property>::size_type propertyCount = m_properties.size();
+
+     std::vector<std::string> result;
+     result.reserve(propertyCount);
+     for (std::vector<Property>::size_type i = 0; i < propertyCount; ++i) {
+         const Property& current = m_properties[i];
+         if (!current.IsEnabled) continue;
+         result.push_back(current.Name);
+     }
+
+     return result;
+ }
+
+ // Evaluates the postfix rule queue against 'query'.
+ // Filter names are operands: each is looked up in m_filters and tested through
+ // m_checker.check(), and the result is pushed on a stack. "!" negates the top of
+ // the stack; "&" and "|" combine the two topmost results into one.
+ // The single value left on the stack is returned.
+ template <class FilterChecker>
+ template <typename T>
+ bool FilterEngine<FilterChecker>::evaluateFilterRules(const T& query)
+ {
+     // lazily build the queue on first use
+     if (!m_isRuleQueueGenerated) buildRuleQueue();
+
+     // consume a copy, so m_ruleQueue stays intact for the next query
+     std::queue<std::string> pending = m_ruleQueue;
+     std::stack<bool> operands;
+
+     for (; !pending.empty(); pending.pop()) {
+         const std::string& symbol = pending.front();
+
+         // unary NOT: flip the most recent result
+         if (symbol == this->NOT_OPERATOR) {
+             BAMTOOLS_ASSERT_MESSAGE(!operands.empty(),
+                                     "Empty result stack - cannot apply operator: !");
+             const bool negated = !operands.top();
+             operands.top() = negated;
+             continue;
+         }
+
+         // binary AND: fold the top result into the one beneath it
+         if (symbol == this->AND_OPERATOR) {
+             BAMTOOLS_ASSERT_MESSAGE(operands.size() >= 2,
+                                     "Not enough operands - cannot apply operator: &");
+             const bool rhs = operands.top();
+             operands.pop();
+             operands.top() = (operands.top() && rhs);
+             continue;
+         }
+
+         // binary OR: fold the top result into the one beneath it
+         if (symbol == this->OR_OPERATOR) {
+             BAMTOOLS_ASSERT_MESSAGE(operands.size() >= 2,
+                                     "Not enough operands - cannot apply operator: |");
+             const bool rhs = operands.top();
+             operands.pop();
+             operands.top() = (operands.top() || rhs);
+             continue;
+         }
+
+         // anything else names a filter set: test the query against it
+         const FilterMap::const_iterator filterPos = m_filters.find(symbol);
+         BAMTOOLS_ASSERT_MESSAGE((filterPos != m_filters.end()),
+                                 "Filter mentioned in rule, not found in FilterEngine");
+         operands.push(m_checker.check(filterPos->second, query));
+     }
+
+     // a well-formed expression leaves exactly one value behind
+     BAMTOOLS_ASSERT_MESSAGE(
+         operands.size() == 1,
+         "Result stack should only have one value remaining - cannot return result");
+     return operands.top();
+ }
+
+ // Collects the names of all registered filter sets, in map (sorted key) order.
+ template <typename FilterChecker>
+ inline const std::vector<std::string> FilterEngine<FilterChecker>::filterNames()
+ {
+     std::vector<std::string> result;
+     result.reserve(m_filters.size());
+
+     FilterMap::const_iterator cursor = m_filters.begin();
+     while (cursor != m_filters.end()) {
+         result.push_back(cursor->first);
+         ++cursor;
+     }
+
+     return result;
+ }
+
+ // parse a filterValue token string that may contain comparison qualifiers (">50", "*SRR", etc.)
+ //
+ // Qualifiers are only recognized when the token is longer than one character:
+ //   !x  -> NOT            >x   -> GREATER_THAN    >=x -> GREATER_THAN_EQUAL
+ //   <x  -> LESS_THAN      <=x  -> LESS_THAN_EQUAL
+ //   *x* -> CONTAINS       *x   -> ENDS_WITH       x*  -> STARTS_WITH
+ //   anything else, and every single-character token -> EXACT
+ //
+ // token : raw text to interpret
+ // value : receives the text without qualifiers, converted to T via operator>>
+ // type  : receives the comparison type implied by the qualifier
+ //
+ // Returns false if the token is empty, if a two-character qualifier leaves no
+ // operand, if the remaining text cannot be converted to T, or if a string-only
+ // comparison (CONTAINS / ENDS_WITH / STARTS_WITH) is requested for a non-string T.
+ // Returns true otherwise.
+ template <class FilterChecker>
+ template <typename T>
+ bool FilterEngine<FilterChecker>::parseToken(const std::string& token, T& value,
+                                              PropertyFilterValue::ValueCompareType& type)
+ {
+
+     // skip if token is empty
+     if (token.empty()) return false;
+
+     // will store token after special chars are removed
+     std::string strippedToken;
+
+     // a single character cannot carry a qualifier: take it literally
+     if (token.length() == 1) {
+         strippedToken = token;
+         type = PropertyFilterValue::EXACT;
+     }
+
+     // more than one character, check for special chars
+     else {
+         const int firstChar = (int)token.at(0);
+         switch (firstChar) {
+
+             case (FilterEngine<FilterChecker>::NOT_CHAR):
+                 strippedToken = token.substr(1);
+                 type = PropertyFilterValue::NOT;
+                 break;
+
+             case (FilterEngine<FilterChecker>::GREATER_THAN_CHAR):
+
+                 // check for '>=' case
+                 if (token.at(1) == FilterEngine<FilterChecker>::EQUAL_CHAR) {
+                     // ">=" alone has no operand
+                     if (token.length() == 2) return false;
+                     strippedToken = token.substr(2);
+                     type = PropertyFilterValue::GREATER_THAN_EQUAL;
+                 }
+
+                 // otherwise only '>'
+                 else {
+                     strippedToken = token.substr(1);
+                     type = PropertyFilterValue::GREATER_THAN;
+                 }
+
+                 break;
+
+             case (FilterEngine<FilterChecker>::LESS_THAN_CHAR):
+
+                 // check for '<=' case
+                 if (token.at(1) == FilterEngine<FilterChecker>::EQUAL_CHAR) {
+                     // "<=" alone has no operand
+                     if (token.length() == 2) return false;
+                     strippedToken = token.substr(2);
+                     type = PropertyFilterValue::LESS_THAN_EQUAL;
+                 }
+
+                 // otherwise only '<'
+                 else {
+                     strippedToken = token.substr(1);
+                     type = PropertyFilterValue::LESS_THAN;
+                 }
+
+                 break;
+
+             case (FilterEngine<FilterChecker>::WILDCARD_CHAR):
+
+                 // check for *str* case (CONTAINS)
+                 if (token.at(token.length() - 1) == FilterEngine<FilterChecker>::WILDCARD_CHAR) {
+                     // "**" has nothing between the wildcards
+                     if (token.length() == 2) return false;
+                     strippedToken = token.substr(1, token.length() - 2);
+                     type = PropertyFilterValue::CONTAINS;
+                 }
+
+                 // otherwise *str case (ENDS_WITH)
+                 else {
+                     strippedToken = token.substr(1);
+                     type = PropertyFilterValue::ENDS_WITH;
+                 }
+
+                 break;
+
+             default:
+                 // check for str* case (STARTS_WITH)
+                 if (token.at(token.length() - 1) == FilterEngine<FilterChecker>::WILDCARD_CHAR) {
+                     // NOTE(review): this rejects two-character tokens such as "a*",
+                     // although they carry a one-character prefix - confirm intent.
+                     if (token.length() == 2) return false;
+                     strippedToken = token.substr(0, token.length() - 1);
+                     type = PropertyFilterValue::STARTS_WITH;
+                 }
+
+                 // otherwise EXACT
+                 else {
+                     strippedToken = token;
+                     type = PropertyFilterValue::EXACT;
+                 }
+
+                 break;
+         }
+     }
+
+     // convert stripped token to value
+     // ("true"/"false" are read as booleans when T is bool)
+     std::stringstream stream(strippedToken);
+     if (strippedToken == "true" || strippedToken == "false")
+         stream >> std::boolalpha >> value;
+     else
+         stream >> value;
+
+     // text that could not be converted (e.g. "abc" for a numeric T) must not be
+     // reported as success, or the filter would silently use a meaningless value
+     if (stream.fail()) return false;
+
+     // check for valid CompareType on type T
+     Variant variantCheck = value;
+
+     // if T is not string AND CompareType is for string values, return false
+     if (!variantCheck.is_type<std::string>()) {
+         if (type == PropertyFilterValue::CONTAINS || type == PropertyFilterValue::ENDS_WITH ||
+             type == PropertyFilterValue::STARTS_WITH)
+
+             return false;
+     }
+
+     // return success
+     return true;
+ }
+
+ // Chooses the operator used to join filters when no rule string is given.
+ // Only AND and OR are accepted; other values are ignored. A change of the
+ // current setting triggers a rebuild of the rule queue.
+ template <typename FilterChecker>
+ inline void FilterEngine<FilterChecker>::setDefaultCompareType(const FilterCompareType::Type& type)
+ {
+     const bool isSupported =
+         (type == FilterCompareType::AND || type == FilterCompareType::OR);
+     if (!isSupported) return;
+
+     // nothing to do when the setting is unchanged
+     if (m_defaultCompareType == type) return;
+
+     m_defaultCompareType = type;
+     buildRuleQueue();
+ }
+
+ // Stores (value, type) as the filter for 'propertyName' inside the filter set
+ // 'filterName', e.g. setProperty("filter1", "mapQuality", 50, GREATER_THAN_EQUAL).
+ // An existing entry for the property is overwritten. The property is also marked
+ // as enabled in the engine's list of known properties (and added there if missing).
+ // Returns false if the filter set does not exist or the entry could not be
+ // inserted, true otherwise.
+ template <class FilterChecker>
+ template <typename T>
+ bool FilterEngine<FilterChecker>::setProperty(const std::string& filterName,
+                                               const std::string& propertyName, const T& value,
+                                               const PropertyFilterValue::ValueCompareType& type)
+ {
+     // the filter set must already exist
+     FilterMap::iterator filterPos = m_filters.find(filterName);
+     if (filterPos == m_filters.end()) return false;
+
+     // update the property entry in place, or create it
+     PropertyMap& properties = filterPos->second.Properties;
+     PropertyMap::iterator valuePos = properties.find(propertyName);
+     if (valuePos != properties.end()) {
+         valuePos->second.Value = value;
+         valuePos->second.Type = type;
+     } else {
+         const bool inserted =
+             properties.insert(std::make_pair(propertyName, PropertyFilterValue(value, type)))
+                 .second;
+         if (!inserted) return false;
+     }
+
+     // flag the property as enabled; unknown names are appended and the
+     // list is re-sorted
+     std::vector<Property>::iterator knownPos =
+         std::find(m_properties.begin(), m_properties.end(), propertyName);
+     if (knownPos != m_properties.end()) {
+         knownPos->IsEnabled = true;
+     } else {
+         m_properties.push_back(Property(propertyName, true));
+         std::sort(m_properties.begin(), m_properties.end());
+     }
+
+     return true;
+ }
+
+ // Replaces the rule string and rebuilds the rule queue.
+ // A rule identical to the current one is ignored.
+ template <typename FilterChecker>
+ inline void FilterEngine<FilterChecker>::setRule(const std::string& ruleString)
+ {
+     if (m_ruleString == ruleString) return;
+
+     m_ruleString = ruleString;
+     buildRuleQueue();
+ }
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_FILTER_ENGINE_H
diff --git a/src/utils/bamtools_filter_properties.h b/src/utils/bamtools_filter_properties.h
new file mode 100644
index 0000000..550b08f
--- /dev/null
+++ b/src/utils/bamtools_filter_properties.h
@@ -0,0 +1,234 @@
+// ***************************************************************************
+// bamtools_filter_properties.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011
+// ---------------------------------------------------------------------------
+// Provides support data structures & methods for FilterEngine
+//
+// The FilterEngine consists, most importantly, of :
+//
+// a list of possible properties (each tagged whether it has been 'enabled' as a filter)
+// a map of filterName => propertySet
+// queue for compound rule expression (i.e. "(filter1 AND filter2) OR !filter3" )
+//
+// Each propertySet is a list of properties enabled for this particular filter object
+//
+// Implemented as a map of propertyNames to propertyFilterValue
+// ( "property1" => pfv1
+// "property2" => pfv2
+// "property4" => pfv4
+// etc. )
+//
+// Any properties that are 'possible', via FilterEngine::addProperty(), but not enabled
+// via FilterEngine::setProperty() (in our example, say "property3"), evaluate to true
+// for any query. Meaning that if a property is not set on this filter, we don't care
+ // about it here, so it passes through OK.
+//
+// A propertyFilterValue contains a value and comparison type
+//
+// ( pfv1: Value = 50, Type = GREATER_THAN_EQUAL
+// pfv2: Value = "foo", Type = STARTS_WITH
+// pfv4: Value = "bar", Type = CONTAINS
+// etc. )
+//
+// This allows for more complex queries (than simple isEqual?) against a variety of data types.
+//
+// ***************************************************************************
+
+#ifndef BAMTOOLS_FILTER_PROPERTIES_H
+#define BAMTOOLS_FILTER_PROPERTIES_H
+
+#include <iostream>
+#include <map>
+#include <string>
+#include "utils/bamtools_utilities.h"
+#include "utils/bamtools_variant.h"
+#include "utils/utils_global.h"
+
+namespace BamTools {
+
+// ----------------------------------------------------------
+// PropertyFilterValue
+
+ // One filter criterion: a value (held in a Variant) plus the kind of
+ // comparison to perform between a query and that value.
+ struct UTILS_EXPORT PropertyFilterValue
+ {
+
+ // define valid ValueCompareTypes
+ // ( CONTAINS, ENDS_WITH and STARTS_WITH are only handled by the std::string
+ // overload of check() )
+ enum ValueCompareType
+ {
+ CONTAINS = 0,
+ ENDS_WITH,
+ EXACT,
+ GREATER_THAN,
+ GREATER_THAN_EQUAL,
+ LESS_THAN,
+ LESS_THAN_EQUAL,
+ NOT,
+ STARTS_WITH
+ };
+
+ // ctor
+ // defaults: a default-constructed Variant and an EXACT comparison
+ PropertyFilterValue(const Variant& value = Variant(),
+ const ValueCompareType& type = PropertyFilterValue::EXACT)
+ : Value(value)
+ , Type(type)
+ {}
+
+ // filter check methods
+ // return true if query satisfies the comparison against Value
+ template <typename T>
+ bool check(const T& query) const;
+ bool check(const std::string& query) const;
+
+ // data members
+ Variant Value;
+ ValueCompareType Type;
+ };
+
+ // Compares a non-string query against the stored filter value using the
+ // stored comparison type.
+ // Returns false (after printing a message to stderr) when the stored value is
+ // not of type T or is a string; returns false as well for comparison types
+ // that have no numeric meaning.
+ template <typename T>
+ bool PropertyFilterValue::check(const T& query) const
+ {
+     // the stored value must hold exactly the query's type
+     const bool sameType = Value.is_type<T>();
+     if (!sameType) {
+         std::cerr << "Cannot compare different types!" << std::endl;
+         return false;
+     }
+
+     // strings are handled by the dedicated overload
+     const bool holdsString = Value.is_type<std::string>();
+     if (holdsString) {
+         std::cerr << "Cannot compare different types - query is a string!" << std::endl;
+         return false;
+     }
+
+     // relational comparison selected by Type
+     if (Type == PropertyFilterValue::EXACT) return (query == Value.get<T>());
+     if (Type == PropertyFilterValue::GREATER_THAN) return (query > Value.get<T>());
+     if (Type == PropertyFilterValue::GREATER_THAN_EQUAL) return (query >= Value.get<T>());
+     if (Type == PropertyFilterValue::LESS_THAN) return (query < Value.get<T>());
+     if (Type == PropertyFilterValue::LESS_THAN_EQUAL) return (query <= Value.get<T>());
+     if (Type == PropertyFilterValue::NOT) return (query != Value.get<T>());
+
+     // string-only comparison types end up here
+     BAMTOOLS_ASSERT_UNREACHABLE;
+     return false;
+ }
+
+ // Compares a string query against the stored filter value using the stored
+ // comparison type (relational types use std::string's lexicographic operators).
+ // Returns false (after printing a message to stderr) when the stored value is
+ // not a string.
+ inline bool PropertyFilterValue::check(const std::string& query) const
+ {
+
+     // ensure filter value & query are same type
+     if (!Value.is_type<std::string>()) {
+         std::cerr << "Cannot compare different types!" << std::endl;
+         return false;
+     }
+
+     // localize string version of our filter value
+     const std::string& valueString = Value.get<std::string>();
+     const std::string::size_type valueLength = valueString.length();
+
+     // string matching based on our filter type
+     switch (Type) {
+         case (PropertyFilterValue::CONTAINS):
+             return (query.find(valueString) != std::string::npos);
+         case (PropertyFilterValue::ENDS_WITH):
+             // Compare the tail of the query directly. Searching with find() only
+             // sees the FIRST occurrence ("abab" vs "ab" would fail), and the
+             // unsigned length difference wraps around when the value is longer
+             // than the query.
+             return (query.length() >= valueLength) &&
+                    (query.compare(query.length() - valueLength, valueLength, valueString) == 0);
+         case (PropertyFilterValue::EXACT):
+             return (query == valueString);
+         case (PropertyFilterValue::GREATER_THAN):
+             return (query > valueString);
+         case (PropertyFilterValue::GREATER_THAN_EQUAL):
+             return (query >= valueString);
+         case (PropertyFilterValue::LESS_THAN):
+             return (query < valueString);
+         case (PropertyFilterValue::LESS_THAN_EQUAL):
+             return (query <= valueString);
+         case (PropertyFilterValue::NOT):
+             return (query != valueString);
+         case (PropertyFilterValue::STARTS_WITH):
+             // compare only the leading characters instead of scanning the whole query
+             return (query.compare(0, valueLength, valueString) == 0);
+         default:
+             BAMTOOLS_ASSERT_UNREACHABLE;
+     }
+     return false;
+ }
+
+ // Returns the enumerator's name as text ("CONTAINS", "EXACT", ...);
+ // an empty string for a value outside the enumeration.
+ inline const std::string toString(const PropertyFilterValue::ValueCompareType& type)
+ {
+     if (type == PropertyFilterValue::CONTAINS) return std::string("CONTAINS");
+     if (type == PropertyFilterValue::ENDS_WITH) return std::string("ENDS_WITH");
+     if (type == PropertyFilterValue::EXACT) return std::string("EXACT");
+     if (type == PropertyFilterValue::GREATER_THAN) return std::string("GREATER_THAN");
+     if (type == PropertyFilterValue::GREATER_THAN_EQUAL) return std::string("GREATER_THAN_EQUAL");
+     if (type == PropertyFilterValue::LESS_THAN) return std::string("LESS_THAN");
+     if (type == PropertyFilterValue::LESS_THAN_EQUAL) return std::string("LESS_THAN_EQUAL");
+     if (type == PropertyFilterValue::NOT) return std::string("NOT");
+     if (type == PropertyFilterValue::STARTS_WITH) return std::string("STARTS_WITH");
+
+     // not a known enumerator
+     BAMTOOLS_ASSERT_UNREACHABLE;
+     return std::string();
+ }
+
+ // property name => property filter value
+ // ('name' => ('SSR', STARTS_WITH), 'mapQuality' => (50, GREATER_THAN_EQUAL), etc...)
+ // ( std::map: entries are unique per name and iterate in sorted name order )
+ typedef std::map<std::string, PropertyFilterValue> PropertyMap;
+
+ // ----------------------------------------------------------
+ // PropertyFilter
+
+ // one 'filter set': the property criteria that were set on a single named filter
+ struct UTILS_EXPORT PropertyFilter
+ {
+ // data members
+ PropertyMap Properties;
+ };
+
+ // filter name => properties
+ // ('filter1' => properties1, 'filter2' => properties2, etc...)
+ // ( std::map: entries are unique per name and iterate in sorted name order )
+ typedef std::map<std::string, PropertyFilter> FilterMap;
+
+// ----------------------------------------------------------
+// Property
+
+ // used to store properties known to engine & keep track of enabled state
+ struct UTILS_EXPORT Property
+ {
+ std::string Name;
+ bool IsEnabled;
+ // not 'explicit': FilterEngine::setProperty() searches the property list with a
+ // plain std::string, which relies on the implicit std::string -> Property conversion
+ Property(const std::string& name, bool isEnabled = false)
+ : Name(name)
+ , IsEnabled(isEnabled)
+ {}
+ };
+
+ // Orders properties by name only; the enabled flag is ignored.
+ inline bool operator<(const Property& lhs, const Property& rhs)
+ {
+     const bool namePrecedes = (lhs.Name.compare(rhs.Name) < 0);
+     return namePrecedes;
+ }
+ // Two properties are equal when their names match; the enabled flag is ignored.
+ inline bool operator==(const Property& lhs, const Property& rhs)
+ {
+     const bool sameName = (lhs.Name.compare(rhs.Name) == 0);
+     return sameName;
+ }
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_FILTER_PROPERTIES_H
diff --git a/src/utils/bamtools_filter_ruleparser.h b/src/utils/bamtools_filter_ruleparser.h
new file mode 100644
index 0000000..15a6ada
--- /dev/null
+++ b/src/utils/bamtools_filter_ruleparser.h
@@ -0,0 +1,337 @@
+// ***************************************************************************
+// bamtools_filter_ruleparser.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011
+// ---------------------------------------------------------------------------
+// Provides a compound rule parser for FilterEngine.
+// ***************************************************************************
+
+#ifndef BAMTOOLS_FILTER_RULEPARSER_H
+#define BAMTOOLS_FILTER_RULEPARSER_H
+
+#include <queue>
+#include <stack>
+#include <string>
+#include "utils/bamtools_utilities.h"
+
+namespace BamTools {
+
+// -------------------------------------------
+// char constants
+
+const char LEFT_PARENTHESIS_CHAR = '(';
+const char RIGHT_PARENTHESIS_CHAR = ')';
+const char AND_OPERATOR_CHAR = '&';
+const char OR_OPERATOR_CHAR = '|';
+const char NOT_OPERATOR_CHAR = '!';
+const char SPACE_CHAR = ' ';
+
+// -------------------------------------------
+// RuleToken implementation
+
+ // One lexical unit of a rule string: its category plus its text.
+ // Plain aggregate - Type is NOT initialized by default; RuleParser::readToken()
+ // assigns it whenever it produces a non-empty Value.
+ struct RuleToken
+ {
+
+ // enums
+ enum RuleTokenType
+ {
+ OPERAND = 0,
+ AND_OPERATOR,
+ OR_OPERATOR,
+ NOT_OPERATOR,
+ LEFT_PARENTHESIS,
+ RIGHT_PARENTHESIS
+ };
+
+ // data members
+ RuleTokenType Type; // category of the token
+ std::string Value; // token text ( filter name, or the operator / parenthesis character )
+ };
+
+ // Binding strength of a token on the operator stack:
+ // NOT (3) > AND (2) > OR (1) > either parenthesis (0).
+ // Operands have no priority; -1 is returned for them.
+ inline int priority(const RuleToken& token)
+ {
+     const RuleToken::RuleTokenType kind = token.Type;
+
+     if (kind == RuleToken::NOT_OPERATOR) return 3;
+     if (kind == RuleToken::AND_OPERATOR) return 2;
+     if (kind == RuleToken::OR_OPERATOR) return 1;
+     if (kind == RuleToken::LEFT_PARENTHESIS || kind == RuleToken::RIGHT_PARENTHESIS) return 0;
+
+     BAMTOOLS_ASSERT_UNREACHABLE;
+     return -1;
+ }
+
+ // True for NOT and for a left parenthesis; every other token is treated
+ // as left-associative.
+ inline bool isRightAssociative(const RuleToken& token)
+ {
+     if (token.Type == RuleToken::NOT_OPERATOR) return true;
+     return (token.Type == RuleToken::LEFT_PARENTHESIS);
+ }
+
+ // Exact complement of isRightAssociative().
+ inline bool isLeftAssociative(const RuleToken& token)
+ {
+     const bool rightAssociative = isRightAssociative(token);
+     return !rightAssociative;
+ }
+
+ // True when the token is an opening parenthesis.
+ inline bool isLeftParenthesis(const RuleToken& token)
+ {
+     const bool opensGroup = (RuleToken::LEFT_PARENTHESIS == token.Type);
+     return opensGroup;
+ }
+
+ // True when the token is a closing parenthesis.
+ inline bool isRightParenthesis(const RuleToken& token)
+ {
+     const bool closesGroup = (RuleToken::RIGHT_PARENTHESIS == token.Type);
+     return closesGroup;
+ }
+
+ // True when the token is an operand (a filter name) rather than an
+ // operator or parenthesis.
+ inline bool isOperand(const RuleToken& token)
+ {
+     const bool namesFilter = (RuleToken::OPERAND == token.Type);
+     return namesFilter;
+ }
+
+ // True for the three logical operators (AND, OR, NOT); parentheses and
+ // operands are not operators.
+ inline bool isOperator(const RuleToken& token)
+ {
+     switch (token.Type) {
+         case (RuleToken::AND_OPERATOR):
+         case (RuleToken::OR_OPERATOR):
+         case (RuleToken::NOT_OPERATOR):
+             return true;
+         default:
+             return false;
+     }
+ }
+
+// -------------------------------------------
+// RuleParser implementation
+
+ // Converts an infix rule string (filter names combined with '&', '|', '!' and
+ // parentheses) into a postfix queue of token strings; see parse() / results().
+ //
+ // NOTE: m_begin / m_current / m_end point into this object's own copy of the
+ // rule string. A copied RuleParser would keep pointing into the original's
+ // buffer, so instances should not be copied.
+ class RuleParser
+ {
+
+     // ctor & dtor
+ public:
+     // keeps a private copy of ruleString and sets up the read range over it
+     RuleParser(const std::string& ruleString)
+         : m_ruleString(ruleString)
+         , m_begin(m_ruleString.c_str())
+         , m_current(m_begin)
+         , m_end(m_begin + m_ruleString.length())
+     {
+         // let ignoreQuotes() adjust the range, then start reading at its beginning
+         // (m_current was previously left uninitialized until parse() ran)
+         ignoreQuotes();
+         m_current = m_begin;
+     }
+
+     ~RuleParser() {}
+
+     // public interface
+ public:
+     // tokenizes the rule string and fills the postfix queue
+     void parse();
+
+     // returns a copy of the postfix queue produced by the last parse()
+     std::queue<std::string> results() const
+     {
+         return m_ruleQueue;
+     }
+
+     // internal methods
+ private:
+     char getNextChar();
+     void ignoreQuotes();
+     bool readToken(RuleToken& token);
+     void skipSpaces();
+
+     // data members
+ private:
+     std::string m_ruleString;  // owned copy of the rule text
+     const char* m_begin;       // first character to parse
+     const char* m_current;     // read cursor
+     const char* m_end;         // one past the last character to parse
+
+     std::queue<std::string> m_ruleQueue;     // postfix output
+     std::stack<RuleToken> m_operatorStack;   // pending operators / parentheses
+ };
+
+ // Returns the character under the read cursor and advances the cursor;
+ // returns 0 (without advancing) once the end of the range is reached.
+ inline char RuleParser::getNextChar()
+ {
+     const bool exhausted = (m_current == m_end);
+     if (exhausted) return 0;
+
+     const char next = *m_current;
+     ++m_current;
+     return next;
+ }
+
+ // Narrows [m_begin, m_end) so that one leading and one trailing double quote
+ // are excluded from parsing.
+ inline void RuleParser::ignoreQuotes()
+ {
+     // leading quote
+     if (m_begin != m_end && *m_begin == '\"') ++m_begin;
+
+     // trailing quote: m_end is one PAST the last character (it points at the
+     // terminating NUL), so the character to inspect is the one before it
+     if (m_begin != m_end && *(m_end - 1) == '\"') --m_end;
+ }
+
+ // Converts the infix rule string into postfix order, leaving the result in
+ // m_ruleQueue. Operands go straight to the output queue; operators and left
+ // parentheses wait on m_operatorStack until priority / associativity (see
+ // priority(), isLeftAssociative()) or a closing parenthesis releases them.
+ // Mismatched parentheses are reported through BAMTOOLS_ASSERT_MESSAGE.
+ inline void RuleParser::parse()
+ {
+
+ // clear out any prior data
+ while (!m_ruleQueue.empty())
+ m_ruleQueue.pop();
+
+ // skip if no rule to parse
+ if (m_ruleString.empty()) return;
+
+ // start at beginning of ruleString
+ m_current = m_begin;
+
+ // iterate through tokens in rule string
+ RuleToken token;
+ while (readToken(token)) {
+
+ // readToken() can succeed without producing any text; stop in that case
+ if (token.Value.empty()) break;
+
+ // if token is an operand
+ if (isOperand(token)) m_ruleQueue.push(token.Value);
+
+ // if token is an operator
+ else if (isOperator(token)) {
+
+ // pop any operators at top of stack with higher priority
+ // ( equal priority also pops when the incoming operator is left-associative )
+ while (!m_operatorStack.empty()) {
+ const RuleToken& opToken = m_operatorStack.top();
+ if ((isLeftAssociative(token) && (priority(token) <= priority(opToken))) ||
+ (isRightAssociative(token) && (priority(token) < priority(opToken)))) {
+ m_ruleQueue.push(opToken.Value);
+ m_operatorStack.pop();
+ } else
+ break;
+ }
+
+ // push current operator token onto stack
+ m_operatorStack.push(token);
+ }
+
+ // if token is left parenthesis
+ else if (isLeftParenthesis(token))
+ m_operatorStack.push(token);
+
+ // if token is right parenthesis
+ else if (isRightParenthesis(token)) {
+
+ bool foundLeftParenthesis = false;
+
+ // push operators into rule queue until left parenthesis found
+ // ( the left parenthesis itself is popped but not emitted )
+ while (!m_operatorStack.empty() && !foundLeftParenthesis) {
+ const RuleToken& opToken = m_operatorStack.top();
+ if (!isLeftParenthesis(opToken))
+ m_ruleQueue.push(opToken.Value);
+ else
+ foundLeftParenthesis = true;
+ m_operatorStack.pop();
+ }
+
+ // no left parenthesis found, error
+ BAMTOOLS_ASSERT_MESSAGE(foundLeftParenthesis,
+ "ERROR: Mismatched parenthesis in rule string.1");
+ }
+
+ // error: unknown operand
+ else
+ BAMTOOLS_ASSERT_UNREACHABLE;
+ }
+
+ // while there are still operators on stack
+ // ( flush them to the output; a leftover parenthesis means the rule was unbalanced )
+ while (!m_operatorStack.empty()) {
+ const RuleToken& token = m_operatorStack.top();
+ BAMTOOLS_ASSERT_MESSAGE((!isLeftParenthesis(token) && !isRightParenthesis(token)),
+ "ERROR: Mismatched parenthesis in rule string.2");
+ m_ruleQueue.push(token.Value);
+ m_operatorStack.pop();
+ }
+ }
+
+ // Reads the next token from the rule string into 'token'.
+ //
+ // Leading whitespace is skipped. A token is either a single punctuation
+ // character - '(' ')' '&' '|' '!' - or an operand: a run of other characters
+ // that ends at a space, at a punctuation character, or at the end of input.
+ // A punctuation character that ends an operand is pushed back so that the next
+ // call returns it as its own token.
+ //
+ // Returns false when only whitespace (or nothing) is left to read, true otherwise.
+ // token.Value is cleared first, and token.Type is only assigned when at least one
+ // character is stored in token.Value.
+ inline bool RuleParser::readToken(RuleToken& token)
+ {
+
+     // skip any preceding whitespace
+     skipSpaces();
+     if (m_current == m_end) return false;
+
+     // clear out prior token value
+     token.Value.clear();
+
+     // read chars while still in token
+     char c = 1;
+     bool keepReading = true;
+     bool inOperandString = false;
+     while (keepReading && (c != 0)) {
+
+         // get next char
+         c = getNextChar();
+         switch (c) {
+
+             // current char is '('
+             case (LEFT_PARENTHESIS_CHAR):
+                 // inside an operand: finish the operand, re-read '(' next time
+                 // (previously '(' was appended to the operand and retyped it)
+                 if (inOperandString)
+                     --m_current;
+                 else {
+                     token.Type = RuleToken::LEFT_PARENTHESIS;
+                     token.Value.append(1, LEFT_PARENTHESIS_CHAR);
+                 }
+                 keepReading = false;
+                 break;
+
+             // current char is ')'
+             case (RIGHT_PARENTHESIS_CHAR):
+                 if (inOperandString)
+                     --m_current;
+                 else {
+                     token.Type = RuleToken::RIGHT_PARENTHESIS;
+                     token.Value.append(1, RIGHT_PARENTHESIS_CHAR);
+                 }
+                 keepReading = false;
+                 break;
+
+             // current char is '&'
+             case (AND_OPERATOR_CHAR):
+                 if (inOperandString)
+                     --m_current;
+                 else {
+                     token.Type = RuleToken::AND_OPERATOR;
+                     token.Value.append(1, AND_OPERATOR_CHAR);
+                 }
+                 keepReading = false;
+                 break;
+
+             // current char is '|'
+             case (OR_OPERATOR_CHAR):
+                 if (inOperandString)
+                     --m_current;
+                 else {
+                     token.Type = RuleToken::OR_OPERATOR;
+                     token.Value.append(1, OR_OPERATOR_CHAR);
+                 }
+                 keepReading = false;
+                 break;
+
+             // current char is '!'
+             case (NOT_OPERATOR_CHAR):
+                 // inside an operand: finish the operand, re-read '!' next time
+                 // (previously '!' was appended to the operand and retyped it)
+                 if (inOperandString)
+                     --m_current;
+                 else {
+                     token.Type = RuleToken::NOT_OPERATOR;
+                     token.Value.append(1, NOT_OPERATOR_CHAR);
+                 }
+                 keepReading = false;
+                 break;
+
+             // current char is ' ' : ends the current token and is consumed
+             case (SPACE_CHAR):
+                 keepReading = false;
+                 break;
+
+             // current char is a true value token
+             default:
+                 // c == 0 means end of input; the loop condition then stops reading
+                 if (c != 0) {
+                     token.Type = RuleToken::OPERAND;
+                     token.Value.append(1, c);
+                     inOperandString = true;
+                     keepReading = true;
+                 }
+         }
+     }
+
+     return true;
+ }
+
+ // Advances the read cursor past any run of blanks (space, tab, carriage
+ // return, newline), stopping at the first other character or at the end.
+ inline void RuleParser::skipSpaces()
+ {
+     for (; m_current != m_end; ++m_current) {
+         const char c = *m_current;
+         const bool isBlank = (c == ' ' || c == '\t' || c == '\r' || c == '\n');
+         if (!isBlank) break;
+     }
+ }
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_FILTER_RULEPARSER_H
diff --git a/src/utils/bamtools_options.cpp b/src/utils/bamtools_options.cpp
new file mode 100644
index 0000000..115581b
--- /dev/null
+++ b/src/utils/bamtools_options.cpp
@@ -0,0 +1,305 @@
+// ***************************************************************************
+// bamtools_options.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011
+// ---------------------------------------------------------------------------
+// Parses command line arguments and creates a help menu
+// ---------------------------------------------------------------------------
+// Modified from:
+// The Mosaik suite's command line parser class: COptions
+// (c) 2006 - 2009 Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
+// Re-licensed under MIT License with author's permission.
+//
+// * Modified slightly to fit BamTools, otherwise code is same.
+// * (BamTools namespace, added stdin/stdout) (DB)
+// ***************************************************************************
+
+#include "utils/bamtools_options.h"
+using namespace BamTools;
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iomanip>
+#include <sstream>
+
+// Storage for the static data members declared in class Options. Every data member
+// of Options is static, so these definitions hold the parser state shared by all callers.
+std::string Options::m_programName; // the program name
+std::string Options::m_description; // the main description
+std::string Options::m_exampleArguments; // the example arguments
+std::vector<OptionGroup> Options::m_optionGroups; // stores the option groups
+std::map<std::string, OptionValue> Options::m_optionsMap; // stores the options in a map
+const std::string Options::m_stdin = "stdin"; // string representation of stdin
+const std::string Options::m_stdout = "stdout"; // string representation of stdout
+
+// Registers a flag-style option, i.e. one that takes no value and only records
+// whether it appeared on the command line.
+//   argument          - switch text; also the lookup key in m_optionsMap
+//   optionDescription - text printed for this switch in the help menu
+//   foundArgument     - caller-owned flag; its address is kept so Parse() can set it
+//   group             - help-menu group listing this option (dereferenced unconditionally)
+void Options::AddOption(const std::string& argument, const std::string& optionDescription,
+                        bool& foundArgument, OptionGroup* group)
+{
+    // entry shown in the help menu
+    Option helpEntry;
+    helpEntry.StoreValue = false;
+    helpEntry.Description = optionDescription;
+    helpEntry.Argument = argument;
+    group->Options.push_back(helpEntry);
+
+    // entry consulted while parsing the command line
+    OptionValue parseEntry;
+    parseEntry.StoreValue = false;
+    parseEntry.pFoundArgument = &foundArgument;
+    m_optionsMap[argument] = parseEntry;
+}
+
+// Appends a new, empty option group named 'groupName' to m_optionGroups and
+// returns a pointer to the stored element.
+// N.B. - the pointer refers to an element of a std::vector, so a later call that
+//        makes the vector grow may invalidate pointers returned by earlier calls.
+OptionGroup* Options::CreateOptionGroup(const std::string& groupName)
+{
+    m_optionGroups.push_back(OptionGroup());
+    OptionGroup& created = m_optionGroups.back();
+    created.Name = groupName;
+    return &created;
+}
+
+// Removes one word-wrapped row from the front of 'remaining' and returns it.
+// The row is broken at the last blank found at or before index 'limit' (the blank
+// itself is dropped). If there is no such blank, the row is hard-wrapped at 'limit'
+// characters, so the caller always makes progress and never scans before index 0.
+// Precondition: remaining.size() > limit and limit > 0.
+static std::string TakeWrappedRow(std::string& remaining, const std::string::size_type limit)
+{
+    const std::string::size_type cut = remaining.rfind(' ', limit);
+    const bool hardWrap = (cut == std::string::npos);
+
+    const std::string row = remaining.substr(0, hardWrap ? limit : cut);
+    remaining = remaining.substr(hardWrap ? limit : cut + 1);
+    return row;
+}
+
+// displays the help menu
+//
+// Prints the program description, the usage line and then, for every option group,
+// each option with its (word-wrapped) description and default value, if any.
+// N.B. - this function never returns: it always ends with std::exit(EXIT_FAILURE),
+//        and also exits early if a default value has an unsupported type.
+void Options::DisplayHelp()
+{
+
+    // initialize
+    char argumentBuffer[ARGUMENT_LENGTH + 1];
+    std::ostringstream sb;
+
+    // blank padding that lines continuation rows up under the description column
+    const std::string indent(MAX_LINE_LENGTH - DESC_LENGTH, ' ');
+
+    // display the menu
+    printf("Description: %s.\n\n", m_description.c_str());
+    printf("Usage: ");
+    printf("%s", m_programName.c_str());
+    printf(" %s\n\n", m_exampleArguments.c_str());
+
+    std::vector<Option>::const_iterator optionIter;
+    std::vector<OptionGroup>::const_iterator groupIter;
+    for (groupIter = m_optionGroups.begin(); groupIter != m_optionGroups.end(); ++groupIter) {
+
+        printf("%s:\n", groupIter->Name.c_str());
+
+        for (optionIter = groupIter->Options.begin(); optionIter != groupIter->Options.end();
+             ++optionIter) {
+
+            // left column: the switch, plus a <value> placeholder if it takes one
+            if (optionIter->StoreValue)
+                snprintf(argumentBuffer, ARGUMENT_LENGTH + 1, " %s <%s>",
+                         optionIter->Argument.c_str(), optionIter->ValueDescription.c_str());
+            else
+                snprintf(argumentBuffer, ARGUMENT_LENGTH + 1, " %s", optionIter->Argument.c_str());
+
+            // snprintf is mapped to _snprintf on WIN32, which does not terminate the
+            // buffer when the text is truncated - terminate explicitly
+            argumentBuffer[ARGUMENT_LENGTH] = 0;
+            printf("%-35s ", argumentBuffer);
+
+            std::string description = optionIter->Description;
+
+            // handle default values: append " [<default>]" to the description
+            if (optionIter->HasDefaultValue) {
+
+                sb.str(std::string());
+                sb << description << " [";
+
+                if (optionIter->DefaultValue.is_type<unsigned int>()) {
+                    sb << (unsigned int)optionIter->DefaultValue;
+                } else if (optionIter->DefaultValue.is_type<unsigned char>()) {
+                    // widen so the value prints as a number, not as a character
+                    sb << (unsigned short)(unsigned char)optionIter->DefaultValue;
+                } else if (optionIter->DefaultValue.is_type<float>()) {
+                    sb << std::fixed << std::setprecision(2) << (float)optionIter->DefaultValue;
+                } else if (optionIter->DefaultValue.is_type<double>()) {
+                    sb << std::fixed << std::setprecision(4) << (double)optionIter->DefaultValue;
+                } else if (optionIter->DefaultValue.is_type<std::string>()) {
+                    const std::string stringValue = optionIter->DefaultValue;
+                    sb << stringValue;
+                } else {
+                    printf(
+                        "ERROR: Found an unsupported data type for argument %s when casting the "
+                        "default value.\n",
+                        optionIter->Argument.c_str());
+                    std::exit(EXIT_FAILURE);
+                }
+
+                sb << ']';
+                description = sb.str();
+            }
+
+            if (description.size() <= DESC_LENGTH_FIRST_ROW) {
+                printf("%s\n", description.c_str());
+            } else {
+
+                // handle the first row (shares its line with the argument column)
+                printf("%s\n", TakeWrappedRow(description, DESC_LENGTH_FIRST_ROW).c_str());
+
+                // handle subsequent rows
+                while (description.size() > DESC_LENGTH)
+                    printf("%s%s\n", indent.c_str(),
+                           TakeWrappedRow(description, DESC_LENGTH).c_str());
+
+                // handle last row
+                printf("%s%s\n", indent.c_str(), description.c_str());
+            }
+        }
+
+        printf("\n");
+    }
+
+    printf("Help:\n");
+    printf(" --help, -h shows this help text\n");
+    std::exit(EXIT_FAILURE);
+}
+
+// returns true if a strto*() conversion consumed 'text' completely, i.e. at least
+// one character was converted and 'parseEnd' stopped on the terminating NUL
+static bool ConsumedWholeValue(const std::string& text, const char* parseEnd)
+{
+    return (parseEnd != text.c_str()) && (*parseEnd == '\0');
+}
+
+// parses the command line
+//
+//   argc, argv - the arguments as received by main()
+//   offset     - arguments up to and including argv[offset] are not matched
+//                against the registered options (parsing starts at argv[offset + 1])
+//
+// For every registered option found, the caller's 'found' flag is set and, for
+// value options, the following argument is converted and stored through the
+// pointer registered with AddValueOption(). Numeric values must be consumed
+// completely by the conversion; otherwise an error is reported and nothing is
+// stored. If any problem is found (unknown argument, missing or malformed value,
+// missing required option) the errors are printed and the process exits with
+// EXIT_FAILURE. The help menu is shown (and the process exits) when there are no
+// arguments or when any argument is "-h", "--help" or "help".
+void Options::Parse(int argc, char* argv[], int offset)
+{
+
+    // initialize
+    std::map<std::string, OptionValue>::const_iterator ovMapIter;
+    const int LAST_INDEX = argc - 1;
+    std::ostringstream errorBuilder;
+    bool foundError = false;
+    char* end_ptr = NULL;
+    const std::string ERROR_SPACER(7, ' ');
+
+    // check if we should show the help menu (always shown when no arguments are given)
+    bool showHelpMenu = (argc <= 1);
+    for (int i = 1; i < argc; i++) {
+        const std::string argument = argv[i];
+        if ((argument == "-h") || (argument == "--help") || (argument == "help"))
+            showHelpMenu = true;
+    }
+
+    // N.B. - DisplayHelp() does not return, so argc > 1 holds below
+    if (showHelpMenu) DisplayHelp();
+
+    // check each argument
+    for (int i = offset + 1; i < argc; i++) {
+
+        const std::string argument = argv[i];
+        ovMapIter = m_optionsMap.find(argument);
+
+        if (ovMapIter == m_optionsMap.end()) {
+            errorBuilder << ERROR_SPACER << "An unrecognized argument was found: " << argument
+                         << std::endl;
+            foundError = true;
+            continue;
+        }
+
+        const OptionValue& ov = ovMapIter->second;
+        *ov.pFoundArgument = true;
+
+        // simple switches take no value
+        if (!ov.StoreValue) continue;
+
+        // the value is the next argument - unless there is none, or the next
+        // argument is really another command line option
+        const bool hasValue =
+            (i < LAST_INDEX) && (m_optionsMap.find(argv[i + 1]) == m_optionsMap.end());
+        if (!hasValue) {
+            errorBuilder << ERROR_SPACER << "The argument (" << argument
+                         << ") expects a value, but none was found." << std::endl;
+            foundError = true;
+            continue;
+        }
+
+        // grab the value (and step over it)
+        const std::string val = argv[++i];
+
+        // convert according to the type of the variable registered for this option;
+        // numeric values are stored only if the whole text was a valid number
+        bool isValidNumber = true;
+        if (ov.VariantValue.is_type<unsigned int>()) {
+            const unsigned int uint32 = (unsigned int)strtoul(val.c_str(), &end_ptr, 10);
+            isValidNumber = ConsumedWholeValue(val, end_ptr);
+            if (isValidNumber) *(unsigned int*)ov.pValue = uint32;
+        } else if (ov.VariantValue.is_type<unsigned char>()) {
+            const unsigned char uint8 = (unsigned char)strtoul(val.c_str(), &end_ptr, 10);
+            isValidNumber = ConsumedWholeValue(val, end_ptr);
+            if (isValidNumber) *(unsigned char*)ov.pValue = uint8;
+        } else if (ov.VariantValue.is_type<uint64_t>()) {
+            const uint64_t uint64 = strtoui64(val.c_str(), &end_ptr, 10);
+            isValidNumber = ConsumedWholeValue(val, end_ptr);
+            if (isValidNumber) *(uint64_t*)ov.pValue = uint64;
+        } else if (ov.VariantValue.is_type<double>()) {
+            const double d = strtod(val.c_str(), &end_ptr);
+            isValidNumber = ConsumedWholeValue(val, end_ptr);
+            if (isValidNumber) *(double*)ov.pValue = d;
+        } else if (ov.VariantValue.is_type<float>()) {
+            const float f = (float)strtod(val.c_str(), &end_ptr);
+            isValidNumber = ConsumedWholeValue(val, end_ptr);
+            if (isValidNumber) *(float*)ov.pValue = f;
+        } else if (ov.VariantValue.is_type<std::string>()) {
+            std::string* pStringValue = (std::string*)ov.pValue;
+            *pStringValue = val;
+        } else if (ov.VariantValue.is_type<std::vector<std::string> >()) {
+            // repeated use of the option accumulates values
+            std::vector<std::string>* pVectorValue = (std::vector<std::string>*)ov.pValue;
+            pVectorValue->push_back(val);
+        } else {
+            printf(
+                "ERROR: Found an unsupported data type for argument %s when "
+                "parsing the arguments.\n",
+                argument.c_str());
+            std::exit(EXIT_FAILURE);
+        }
+
+        if (!isValidNumber) {
+            errorBuilder << ERROR_SPACER << "The argument (" << argument
+                         << ") expects a numeric value, but found: " << val << std::endl;
+            foundError = true;
+        }
+    }
+
+    // check if we missed any required parameters
+    for (ovMapIter = m_optionsMap.begin(); ovMapIter != m_optionsMap.end(); ++ovMapIter) {
+        if (ovMapIter->second.IsRequired && !*ovMapIter->second.pFoundArgument) {
+            errorBuilder << ERROR_SPACER << ovMapIter->second.ValueTypeDescription
+                         << " was not specified. Please use the " << ovMapIter->first
+                         << " parameter." << std::endl;
+            foundError = true;
+        }
+    }
+
+    // print the errors if any were found
+    if (foundError) {
+        printf("ERROR: Some problems were encountered when parsing the command line options:\n");
+        printf("%s\n", errorBuilder.str().c_str());
+        printf("For a complete list of command line options, type \"%s help %s\"\n", argv[0],
+               argv[1]);
+        std::exit(EXIT_FAILURE);
+    }
+}
+
+// Stores the texts that DisplayHelp() prints in its header: the description
+// ("Description:" line) and the program name plus example arguments ("Usage:" line).
+void Options::SetProgramInfo(const std::string& programName, const std::string& description,
+                             const std::string& arguments)
+{
+    m_exampleArguments = arguments;
+    m_description = description;
+    m_programName = programName;
+}
+
+// Gives read-only access to the name this parser uses for standard input ("stdin").
+const std::string& Options::StandardIn()
+{
+    return Options::m_stdin;
+}
+
+// Gives read-only access to the name this parser uses for standard output ("stdout").
+const std::string& Options::StandardOut()
+{
+    return Options::m_stdout;
+}
diff --git a/src/utils/bamtools_options.h b/src/utils/bamtools_options.h
new file mode 100644
index 0000000..d47b7b8
--- /dev/null
+++ b/src/utils/bamtools_options.h
@@ -0,0 +1,200 @@
+// ***************************************************************************
+// bamtools_options.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011
+// ---------------------------------------------------------------------------
+// Parses command line arguments and creates a help menu
+// ---------------------------------------------------------------------------
+// Modified from:
+// The Mosaik suite's command line parser class: COptions
+// (c) 2006 - 2009 Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
+// Re-licensed under MIT License with author's permission.
+//
+// * Modified slightly to fit BamTools, otherwise code is same.
+// * (BamTools namespace, added stdin/stdout) (DB)
+// ***************************************************************************
+
+#ifndef BAMTOOLS_OPTIONS_H
+#define BAMTOOLS_OPTIONS_H
+
+#include "utils/bamtools_variant.h"
+#include "utils/utils_global.h"
+
+#include <map>
+#include <string>
+#include <vector>
+
+#ifndef WIN32
+#include <stdint.h>
+#endif
+
+namespace BamTools {
+
+// Help-menu layout, in characters (used by Options::DisplayHelp()):
+//   ARGUMENT_LENGTH       - capacity of the buffer holding the argument column text
+//   DESC_LENGTH_FIRST_ROW - wrap limit for the first description row
+//   DESC_LENGTH           - wrap limit for the following description rows
+//   MAX_LINE_LENGTH       - with DESC_LENGTH, gives the indent of continuation rows
+//                           (MAX_LINE_LENGTH - DESC_LENGTH blanks)
+#define ARGUMENT_LENGTH 35
+#define DESC_LENGTH_FIRST_ROW 30
+#define DESC_LENGTH 42
+#define MAX_LINE_LENGTH 78
+
+// Portability shims: on WIN32 map snprintf/strtoui64 to the underscore-prefixed
+// names and supply the 64-bit integer typedefs; elsewhere strtoui64 is strtoull.
+#ifdef WIN32
+#define snprintf _snprintf
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#define strtoui64 _strtoui64
+#else
+#define strtoui64 strtoull
+#endif
+
+// Help-menu record for one command line option; stored in OptionGroup::Options
+// and printed by Options::DisplayHelp().
+struct UTILS_EXPORT Option
+{
+
+ // data members
+ std::string Argument; // the switch text, printed in the argument column
+ std::string ValueDescription; // placeholder printed as <...> after the switch
+ std::string Description; // explanatory text, word-wrapped in the help menu
+ bool StoreValue; // true if the help menu shows a <value> placeholder
+ bool HasDefaultValue; // true if DefaultValue is appended as "[...]"
+ Variant DefaultValue; // default shown in the help menu
+
+ // constructor
+ // an option takes a value and has no default unless set otherwise
+ Option()
+ : StoreValue(true)
+ , HasDefaultValue(false)
+ {}
+};
+
+// Parser-side record for one command line option; stored in Options::m_optionsMap
+// under the option's switch text and consulted by Options::Parse().
+struct UTILS_EXPORT OptionValue
+{
+
+ // data members
+ bool* pFoundArgument; // caller's flag, set to true when the option is seen
+ void* pValue; // caller's variable that receives the parsed value
+ std::string ValueTypeDescription; // named in the "was not specified" error
+ bool UseVector; // not read by the code visible in this file section
+ bool StoreValue; // true if the next argument is this option's value
+ bool IsRequired; // true if a missing option is reported as an error
+ Variant VariantValue; // copy of the caller's variable; Parse() queries its type
+
+ // constructor
+ // null pointers; by default a non-required option that takes a value
+ OptionValue()
+ : pFoundArgument(NULL)
+ , pValue(NULL)
+ , UseVector(false)
+ , StoreValue(true)
+ , IsRequired(false)
+ {}
+};
+
+// A named section of the help menu together with the options listed under it.
+struct UTILS_EXPORT OptionGroup
+{
+ std::string Name; // heading printed above the group's options
+ std::vector<Option> Options; // options in the order they were added
+};
+
+// Command line parser and help-menu generator.
+// All members are static: options are registered with AddOption()/AddValueOption(),
+// then Parse() fills the caller-owned variables whose addresses were registered.
+class UTILS_EXPORT Options
+{
+
+ // add option/argument rules
+public:
+ // adds a simple option to the parser
+ // (a switch without a value; 'foundArgument' is set when it is present)
+ static void AddOption(const std::string& argument, const std::string& optionDescription,
+ bool& foundArgument, OptionGroup* group);
+
+ // adds a value option to the parser
+ // (the option is treated as required when 'valueTypeDescription' is non-empty)
+ template <typename T>
+ static void AddValueOption(const std::string& argument, const std::string& valueDescription,
+ const std::string& optionDescription,
+ const std::string& valueTypeDescription, bool& foundArgument, T& val,
+ OptionGroup* group);
+
+ // adds a value option to the parser (with a default value)
+ // ('defaultValue' is only displayed in the help menu)
+ template <typename T, typename D>
+ static void AddValueOption(const std::string& argument, const std::string& valueDescription,
+ const std::string& optionDescription,
+ const std::string& valueTypeDescription, bool& foundArgument, T& val,
+ OptionGroup* group, D& defaultValue);
+
+ // other API methods
+public:
+ // creates an option group
+ static OptionGroup* CreateOptionGroup(const std::string& groupName);
+ // displays the help menu
+ // (does not return: the implementation ends with std::exit)
+ static void DisplayHelp();
+ // parses the command line
+ // (matching starts at argv[offset + 1])
+ static void Parse(int argc, char* argv[], int offset = 0);
+ // sets the program info
+ static void SetProgramInfo(const std::string& programName, const std::string& description,
+ const std::string& arguments);
+ // returns string representation of stdin
+ static const std::string& StandardIn();
+ // returns string representation of stdout
+ static const std::string& StandardOut();
+
+ // static data members
+private:
+ // the program name
+ static std::string m_programName;
+ // the main description
+ static std::string m_description;
+ // the example arguments
+ static std::string m_exampleArguments;
+ // stores the option groups
+ static std::vector<OptionGroup> m_optionGroups;
+ // stores the options in a map
+ // (keyed by the switch text)
+ static std::map<std::string, OptionValue> m_optionsMap;
+ // string representation of stdin
+ static const std::string m_stdin;
+ // string representation of stdout
+ static const std::string m_stdout;
+};
+
+// Registers an option that takes a value (no default is shown in the help menu).
+//   argument             - switch text; also the lookup key in m_optionsMap
+//   valueDescription     - placeholder printed as <...> in the help menu
+//   optionDescription    - explanatory text for the help menu
+//   valueTypeDescription - used in the "was not specified" error; a non-empty
+//                          string also marks the option as required
+//   foundArgument        - caller-owned flag; its address is kept for Parse()
+//   val                  - caller-owned variable; its address is kept for Parse(),
+//                          and a copy is stored so Parse() can query its type
+//   group                - help-menu group listing this option (dereferenced unconditionally)
+template <typename T>
+void Options::AddValueOption(const std::string& argument, const std::string& valueDescription,
+                             const std::string& optionDescription,
+                             const std::string& valueTypeDescription, bool& foundArgument, T& val,
+                             OptionGroup* group)
+{
+    // entry shown in the help menu
+    Option helpEntry;
+    helpEntry.Description = optionDescription;
+    helpEntry.ValueDescription = valueDescription;
+    helpEntry.Argument = argument;
+    group->Options.push_back(helpEntry);
+
+    // entry consulted while parsing the command line
+    OptionValue parseEntry;
+    parseEntry.ValueTypeDescription = valueTypeDescription;
+    parseEntry.IsRequired = !valueTypeDescription.empty();
+    parseEntry.VariantValue = val;
+    parseEntry.pValue = (void*)&val;
+    parseEntry.pFoundArgument = &foundArgument;
+    m_optionsMap[argument] = parseEntry;
+}
+
+// Registers an option that takes a value and shows 'defaultValue' in the help menu.
+// Parameters are those of the overload without a default, plus:
+//   defaultValue - value displayed as "[...]" after the description; it is only
+//                  recorded for display and is not written to 'val' here
+template <typename T, typename D>
+void Options::AddValueOption(const std::string& argument, const std::string& valueDescription,
+                             const std::string& optionDescription,
+                             const std::string& valueTypeDescription, bool& foundArgument, T& val,
+                             OptionGroup* group, D& defaultValue)
+{
+    // register exactly as an option without a default ...
+    AddValueOption(argument, valueDescription, optionDescription, valueTypeDescription,
+                   foundArgument, val, group);
+
+    // ... then attach the default to the help-menu entry that was just appended
+    Option& helpEntry = group->Options.back();
+    helpEntry.DefaultValue = defaultValue;
+    helpEntry.HasDefaultValue = true;
+}
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_OPTIONS_H
diff --git a/src/utils/bamtools_pileup_engine.cpp b/src/utils/bamtools_pileup_engine.cpp
new file mode 100644
index 0000000..8874bb7
--- /dev/null
+++ b/src/utils/bamtools_pileup_engine.cpp
@@ -0,0 +1,355 @@
+// ***************************************************************************
+// bamtools_pileup_engine.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 9 March 2012 (DB)
+// ---------------------------------------------------------------------------
+// Provides pileup at position functionality for various tools.
+// ***************************************************************************
+
+#include "utils/bamtools_pileup_engine.h"
+using namespace BamTools;
+
+#include <cstddef>
+#include <iostream>
+
+// ---------------------------------------------
+// PileupEnginePrivate implementation
+
+// Private implementation of PileupEngine: keeps the alignments overlapping the
+// current (reference, position) marker and hands per-position pileup data to
+// the registered visitors.
+struct PileupEngine::PileupEnginePrivate
+{
+
+ // data members
+ int CurrentId; // reference ID of the position being piled up
+ int CurrentPosition; // position being piled up
+ std::vector<BamAlignment> CurrentAlignments; // alignments not yet discarded by ClearOldData()
+ PileupPosition CurrentPileupData; // rebuilt for every position, passed to visitors
+
+ bool IsFirstAlignment; // true until the first AddAlignment() call
+ std::vector<PileupVisitor*> Visitors; // raw pointers; not deleted by this struct
+
+ // ctor & dtor
+ PileupEnginePrivate()
+ : CurrentId(-1)
+ , CurrentPosition(-1)
+ , IsFirstAlignment(true)
+ {}
+ ~PileupEnginePrivate() {}
+
+ // 'public' methods
+ // AddAlignment() returns false if 'al' sorts before the current marker
+ bool AddAlignment(const BamAlignment& al);
+ // Flush() emits pileups until no stored alignments remain
+ void Flush();
+
+ // internal methods
+private:
+ void ApplyVisitors();
+ void ClearOldData();
+ void CreatePileupData();
+ void ParseAlignmentCigar(const BamAlignment& al);
+};
+
+// Feeds the next alignment into the engine.
+// Before 'al' is stored, pileups are emitted (via ApplyVisitors) for every position
+// that lies before it: up to al.Position on the same reference, or until the stored
+// alignments are used up when 'al' starts a new reference.
+// Returns false, after printing a message to std::cerr, if 'al' sorts before the
+// current marker (smaller reference ID, or smaller position on the same reference).
+bool PileupEngine::PileupEnginePrivate::AddAlignment(const BamAlignment& al)
+{
+
+    // very first alignment: just seed the markers and the working set
+    if (IsFirstAlignment) {
+        IsFirstAlignment = false;
+        CurrentId = al.RefID;
+        CurrentPosition = al.Position;
+        CurrentAlignments.clear();
+        CurrentAlignments.push_back(al);
+        return true;
+    }
+
+    // sorting error => ABORT
+    const bool sameReference = (al.RefID == CurrentId);
+    const bool outOfOrder =
+        sameReference ? (al.Position < CurrentPosition) : (al.RefID < CurrentId);
+    if (outOfOrder) {
+        std::cerr << "Pileup::Run() : Data not sorted correctly!" << std::endl;
+        return false;
+    }
+
+    if (sameReference) {
+
+        // emit pileup data until 'catching up' to the new alignment's position
+        // (no iterations when it starts at CurrentPosition)
+        for (; CurrentPosition < al.Position; ++CurrentPosition)
+            ApplyVisitors();
+
+        CurrentAlignments.push_back(al);
+        return true;
+    }
+
+    // moved forward onto next reference:
+    // emit any remaining pileup data from the previous reference
+    for (; !CurrentAlignments.empty(); ++CurrentPosition)
+        ApplyVisitors();
+
+    // store first entry on this new reference, update markers
+    CurrentAlignments.clear();
+    CurrentAlignments.push_back(al);
+    CurrentId = al.RefID;
+    CurrentPosition = al.Position;
+    return true;
+}
+
+// Builds the pileup data for the current position, then passes it to every
+// registered visitor in registration order.
+void PileupEngine::PileupEnginePrivate::ApplyVisitors()
+{
+
+    // refresh CurrentPileupData from the stored alignments
+    CreatePileupData();
+
+    // hand the same pileup data to each visitor
+    const std::size_t visitorCount = Visitors.size();
+    for (std::size_t v = 0; v < visitorCount; ++v)
+        Visitors[v]->Visit(CurrentPileupData);
+}
+
+// Drops every stored alignment whose GetEndPosition() value is <= CurrentPosition,
+// keeping the remaining alignments in their original order.
+// (The original author's note: an alignment with an end position of 100 does not
+// overlap a CurrentPosition of 100, and is therefore discarded.)
+void PileupEngine::PileupEnginePrivate::ClearOldData()
+{
+
+    // compact the survivors towards the front of the vector, in place
+    std::size_t kept = 0;
+    const std::size_t total = CurrentAlignments.size();
+    for (std::size_t scan = 0; scan < total; ++scan) {
+
+        const int endPosition = CurrentAlignments[scan].GetEndPosition();
+        if (endPosition > CurrentPosition) {
+
+            // still overlapping: move into the next free slot (skip self-assignment)
+            if (scan != kept) CurrentAlignments[kept] = CurrentAlignments[scan];
+            ++kept;
+        }
+    }
+
+    // cut off the stale tail
+    CurrentAlignments.resize(kept);
+}
+
+// Rebuilds CurrentPileupData for the current (CurrentId, CurrentPosition) marker
+// from the alignments that remain after ClearOldData().
+void PileupEngine::PileupEnginePrivate::CreatePileupData()
+{
+
+    // discard alignments that no longer reach the current position
+    ClearOldData();
+
+    // start from an empty pileup stamped with the current markers
+    CurrentPileupData.PileupAlignments.clear();
+    CurrentPileupData.Position = CurrentPosition;
+    CurrentPileupData.RefId = CurrentId;
+
+    // let each remaining alignment contribute its entry
+    const std::size_t alignmentCount = CurrentAlignments.size();
+    for (std::size_t a = 0; a < alignmentCount; ++a)
+        ParseAlignmentCigar(CurrentAlignments[a]);
+}
+
+// Emits pileups for consecutive positions until no stored alignments remain.
+void PileupEngine::PileupEnginePrivate::Flush()
+{
+    for (; !CurrentAlignments.empty(); ++CurrentPosition)
+        ApplyVisitors();
+}
+
+// returns true for the CIGAR ops that consume both reference and read bases:
+// 'M' (alignment match), '=' (sequence match) and 'X' (sequence mismatch)
+static bool IsMatchLikeOp(const char type)
+{
+    return (type == 'M' || type == '=' || type == 'X');
+}
+
+// returns true for the CIGAR ops treated as read-segment boundaries here:
+// soft clip ('S'), reference skip ('N') and hard clip ('H')
+static bool IsSegmentBoundaryOp(const char type)
+{
+    return (type == 'S' || type == 'N' || type == 'H');
+}
+
+// Walks the CIGAR of 'al' to work out how the alignment relates to CurrentPosition
+// and appends the resulting PileupAlignment to CurrentPileupData.
+// Unmapped alignments are skipped, and nothing is appended if CurrentPosition
+// falls inside a reference skip ('N') of the alignment.
+// Ops 'M', '=' and 'X' are all handled as matches, so alignments written with the
+// extended CIGAR ops get a valid PositionInAlignment.
+void PileupEngine::PileupEnginePrivate::ParseAlignmentCigar(const BamAlignment& al)
+{
+
+    // skip if unmapped
+    if (!al.IsMapped()) return;
+
+    // initialize local variables
+    int genomePosition = al.Position;  // reference position at the start of the current op
+    int positionInAlignment = 0;       // read offset at the start of the current op
+    bool isNewReadSegment = true;
+    bool saveAlignment = true;
+    PileupAlignment pileupAlignment(al);
+
+    // iterate over CIGAR operations
+    const int numCigarOps = static_cast<int>(al.CigarData.size());
+    for (int i = 0; i < numCigarOps; ++i) {
+        const CigarOp& op = al.CigarData.at(i);
+
+        // if op is MATCH (or SEQ_MATCH / MISMATCH)
+        if (IsMatchLikeOp(op.Type)) {
+
+            // if match op overlaps current position
+            if (genomePosition + (int)op.Length > CurrentPosition) {
+
+                // set pileup data
+                pileupAlignment.IsCurrentDeletion = false;
+                pileupAlignment.IsNextDeletion = false;
+                pileupAlignment.IsNextInsertion = false;
+                pileupAlignment.PositionInAlignment =
+                    positionInAlignment + (CurrentPosition - genomePosition);
+
+                // check for beginning of read segment
+                if (genomePosition == CurrentPosition && isNewReadSegment)
+                    pileupAlignment.IsSegmentBegin = true;
+
+                // if we're at the end of a match operation
+                if (genomePosition + (int)op.Length - 1 == CurrentPosition) {
+
+                    // if not last operation
+                    if (i < numCigarOps - 1) {
+
+                        // check next CIGAR op
+                        const CigarOp& nextOp = al.CigarData.at(i + 1);
+
+                        // if next CIGAR op is DELETION
+                        if (nextOp.Type == 'D') {
+                            pileupAlignment.IsNextDeletion = true;
+                            pileupAlignment.DeletionLength = nextOp.Length;
+                        }
+
+                        // if next CIGAR op is INSERTION
+                        else if (nextOp.Type == 'I') {
+                            pileupAlignment.IsNextInsertion = true;
+                            pileupAlignment.InsertionLength = nextOp.Length;
+                        }
+
+                        // if next CIGAR op is either DELETION or INSERTION
+                        if (nextOp.Type == 'D' || nextOp.Type == 'I') {
+
+                            // if there is a CIGAR op after the DEL/INS, the segment
+                            // ends here only if that op is clipping or ref_skip
+                            if (i < numCigarOps - 2) {
+                                const CigarOp& nextNextOp = al.CigarData.at(i + 2);
+                                if (IsSegmentBoundaryOp(nextNextOp.Type))
+                                    pileupAlignment.IsSegmentEnd = true;
+                            }
+
+                            // otherwise the DEL/INS is the last op
+                            else
+                                pileupAlignment.IsSegmentEnd = true;
+                        }
+
+                        // otherwise, segment ends if next CIGAR op is clipping or ref_skip
+                        else if (IsSegmentBoundaryOp(nextOp.Type))
+                            pileupAlignment.IsSegmentEnd = true;
+                    }
+
+                    // else this is last operation
+                    else
+                        pileupAlignment.IsSegmentEnd = true;
+                }
+            }
+
+            // increment markers
+            genomePosition += op.Length;
+            positionInAlignment += op.Length;
+        }
+
+        // if op is DELETION
+        else if (op.Type == 'D') {
+
+            // if deletion op overlaps current position
+            if (genomePosition + (int)op.Length > CurrentPosition) {
+
+                // set pileup data
+                // NOTE(review): IsNextInsertion is set to true inside a deletion, with no
+                // check of the following op - kept as found; confirm whether this is intended
+                pileupAlignment.IsCurrentDeletion = true;
+                pileupAlignment.IsNextDeletion = false;
+                pileupAlignment.IsNextInsertion = true;
+                pileupAlignment.PositionInAlignment =
+                    positionInAlignment + (CurrentPosition - genomePosition);
+            }
+
+            // increment marker
+            genomePosition += op.Length;
+        }
+
+        // if op is REF_SKIP
+        else if (op.Type == 'N') {
+            genomePosition += op.Length;
+        }
+
+        // if op is INSERTION or SOFT_CLIP
+        else if (op.Type == 'I' || op.Type == 'S') {
+            positionInAlignment += op.Length;
+        }
+
+        // check for beginning of new read segment
+        isNewReadSegment = IsSegmentBoundaryOp(op.Type);
+
+        // if we've moved beyond current position
+        if (genomePosition > CurrentPosition) {
+            if (op.Type == 'N') saveAlignment = false;  // ignore alignment if REF_SKIP
+            break;
+        }
+    }
+
+    // save pileup position if flag is true
+    if (saveAlignment) CurrentPileupData.PileupAlignments.push_back(pileupAlignment);
+}
+
+// ---------------------------------------------
+// PileupEngine implementation
+
+// Creates the engine together with its private implementation object, which
+// the destructor releases.
+PileupEngine::PileupEngine()
+    : d(new PileupEnginePrivate())
+{}
+
+// Releases the private implementation object and clears the pointer.
+// Registered visitors are not deleted here.
+PileupEngine::~PileupEngine()
+{
+    PileupEnginePrivate* const impl = d;
+    delete impl;
+    d = 0;
+}
+
+// Forwards 'al' to the private implementation; the result is false when the
+// implementation rejects the alignment as out of sort order.
+bool PileupEngine::AddAlignment(const BamAlignment& al)
+{
+    const bool accepted = d->AddAlignment(al);
+    return accepted;
+}
+// Registers 'visitor' to be called for every pileup position; only the pointer
+// is stored.
+void PileupEngine::AddVisitor(PileupVisitor* visitor)
+{
+    std::vector<PileupVisitor*>& registered = d->Visitors;
+    registered.push_back(visitor);
+}
+// Asks the private implementation to emit pileups for all alignments still stored.
+void PileupEngine::Flush()
+{
+    PileupEnginePrivate& impl = *d;
+    impl.Flush();
+}
diff --git a/src/utils/bamtools_pileup_engine.h b/src/utils/bamtools_pileup_engine.h
new file mode 100644
index 0000000..5533c01
--- /dev/null
+++ b/src/utils/bamtools_pileup_engine.h
@@ -0,0 +1,98 @@
+// ***************************************************************************
+// bamtools_pileup_engine.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011
+// ---------------------------------------------------------------------------
+// Provides pileup at position functionality for various tools.
+// ***************************************************************************
+
+#ifndef BAMTOOLS_PILEUP_ENGINE_H
+#define BAMTOOLS_PILEUP_ENGINE_H
+
+#include "utils/utils_global.h"
+
+#include <api/BamAlignment.h>
+#include <vector>
+
+namespace BamTools {
+
+// contains auxiliary data about a single BamAlignment
+// at current position considered
+// Pairs a copy of one BamAlignment with flags describing how that alignment
+// relates to the position currently being piled up.
+struct UTILS_EXPORT PileupAlignment
+{
+
+ // data members
+ BamAlignment Alignment; // copy of the alignment
+ int32_t PositionInAlignment; // read offset at the current position; -1 until set
+ bool IsCurrentDeletion; // current position lies inside a deletion op
+ bool IsNextDeletion; // a deletion op directly follows the current match op
+ bool IsNextInsertion; // an insertion op directly follows the current match op
+ int DeletionLength; // length of that following deletion
+ int InsertionLength; // length of that following insertion
+ bool IsSegmentBegin; // current position is the first base of a read segment
+ bool IsSegmentEnd; // current position is the last base of a read segment
+
+ // ctor
+ // copies 'al'; every flag starts false, lengths at 0, PositionInAlignment at -1
+ PileupAlignment(const BamAlignment& al)
+ : Alignment(al)
+ , PositionInAlignment(-1)
+ , IsCurrentDeletion(false)
+ , IsNextDeletion(false)
+ , IsNextInsertion(false)
+ , DeletionLength(0)
+ , InsertionLength(0)
+ , IsSegmentBegin(false)
+ , IsSegmentEnd(false)
+ {}
+};
+
+// contains all data at a position
+// The pileup at one reference position: where it is, plus one PileupAlignment
+// entry per alignment reported there. This is what PileupVisitor::Visit() receives.
+struct UTILS_EXPORT PileupPosition
+{
+
+ // data members
+ int RefId; // reference ID of the position
+ int Position; // position on that reference
+ std::vector<PileupAlignment> PileupAlignments; // entries for this position
+
+ // ctor
+ // all arguments are optional: defaults are reference 0, position 0, no alignments
+ PileupPosition(const int& refId = 0, const int& position = 0,
+ const std::vector<PileupAlignment>& alignments = std::vector<PileupAlignment>())
+ : RefId(refId)
+ , Position(position)
+ , PileupAlignments(alignments)
+ {}
+};
+
+// Abstract callback interface: derive from it and register an instance with
+// PileupEngine::AddVisitor() to receive the pileup data of each position.
+class UTILS_EXPORT PileupVisitor
+{
+
+public:
+ PileupVisitor() {}
+ virtual ~PileupVisitor() {}
+
+public:
+ // called by the engine with the pileup data of one position
+ virtual void Visit(const PileupPosition& pileupData) = 0;
+};
+
+// Turns a stream of sorted alignments into per-position pileups and reports each
+// position to the registered visitors.
+// Usage: register visitors with AddVisitor(), feed alignments in sorted order with
+// AddAlignment(), then call Flush() to report the positions still pending.
+class UTILS_EXPORT PileupEngine
+{
+
+public:
+    PileupEngine();
+    ~PileupEngine();
+
+public:
+    // adds the next alignment; returns false if it is out of sort order
+    bool AddAlignment(const BamAlignment& al);
+    // registers a visitor; only the pointer is stored, the visitor is not deleted
+    // by the engine
+    void AddVisitor(PileupVisitor* visitor);
+    // reports pileups for all alignments still held by the engine
+    void Flush();
+
+    // not copyable: 'd' is an owning raw pointer that the destructor deletes, so a
+    // compiler-generated copy would delete it twice (declared, intentionally undefined)
+private:
+    PileupEngine(const PileupEngine&);
+    PileupEngine& operator=(const PileupEngine&);
+
+private:
+    struct PileupEnginePrivate;
+    PileupEnginePrivate* d;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_PILEUP_ENGINE_H
diff --git a/src/utils/bamtools_utilities.cpp b/src/utils/bamtools_utilities.cpp
new file mode 100644
index 0000000..77a09b1
--- /dev/null
+++ b/src/utils/bamtools_utilities.cpp
@@ -0,0 +1,343 @@
+// ***************************************************************************
+// bamtools_utilities.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 8 October 2011
+// ---------------------------------------------------------------------------
+// Provides general utilities used by BamTools sub-tools.
+// ***************************************************************************
+
+#include <api/BamMultiReader.h>
+#include <api/BamReader.h>
+#include <utils/bamtools_utilities.h>
+using namespace BamTools;
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+
+namespace BamTools {
+
+// Complement table for upper-case IUPAC nucleotide codes, indexed by
+// (base - 'A'); it has exactly 26 entries, one per letter 'A'..'Z'.
+// An entry of 0 marks a letter that has no complement defined here.
+//
+// Pairs: A<->T, C<->G, R<->Y, K<->M, B<->V, D<->H; S, W and N (and X) map to
+// themselves; U is treated like T and therefore maps to A.
+// Fixes relative to the previous table: B and K were 0 although V->B and
+// M->K were present, and S/W were mapped onto each other although both are
+// self-complementary.
+const char REVCOMP_LOOKUP[] = {
+    // A    B    C    D   E  F   G    H   I  J   K   L   M
+    'T', 'V', 'G', 'H', 0, 0, 'C', 'D', 0, 0, 'M', 0, 'K',
+    // N   O  P  Q   R    S    T    U    V    W    X    Y   Z
+    'N', 0, 0, 0, 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 0};
+
+} // namespace BamTools
+
+// Reports whether 'pattern' occurs anywhere inside 'source'.
+bool Utilities::Contains(const std::string& source, const std::string& pattern)
+{
+    const std::size_t hit = source.find(pattern);
+    return hit != std::string::npos;
+}
+
+// Reports whether the character 'c' occurs anywhere inside 'source'.
+bool Utilities::Contains(const std::string& source, const char c)
+{
+    const std::size_t hit = source.find(c);
+    return hit != std::string::npos;
+}
+
+// Returns true if 'source' ends with 'pattern'.
+//
+// Only the tail of 'source' is compared. The previous find()-based test
+// looked at the FIRST occurrence of 'pattern', so "abcab" was not recognised
+// as ending with "ab", and a missing pattern exactly one character longer
+// than 'source' was reported as a match (npos compared equal to the wrapped
+// length difference).
+//
+// A pattern longer than 'source' never matches; an empty pattern always does.
+bool Utilities::EndsWith(const std::string& source, const std::string& pattern)
+{
+    const std::size_t sourceLength = source.length();
+    const std::size_t patternLength = pattern.length();
+
+    // guard first so the subtraction below cannot wrap around
+    if (patternLength > sourceLength) return false;
+
+    return source.compare(sourceLength - patternLength, patternLength, pattern) == 0;
+}
+
+// Returns true if the last character of 'source' is 'c'.
+//
+// Looks at the final character only. The previous find()-based test used the
+// FIRST occurrence of 'c' (so "aba" did not end with 'a') and returned true
+// for an empty 'source' whenever 'c' was absent.
+bool Utilities::EndsWith(const std::string& source, const char c)
+{
+    // an empty string has no last character
+    if (source.empty()) return false;
+    return source[source.length() - 1] == c;
+}
+
+// Checks for a file by trying to open it for reading: the result is true
+// when the input stream opens without its fail state being set.
+bool Utilities::FileExists(const std::string& filename)
+{
+    std::ifstream probe(filename.c_str(), std::ifstream::in);
+    if (probe.fail()) return false;
+    return true;
+}
+
+namespace {
+
+// Shared implementation behind both Utilities::ParseRegionString() overloads.
+// 'ReaderType' only needs GetReferenceData() and GetReferenceID(name), which
+// both BamReader and BamMultiReader are used with below.
+//
+// Accepted forms of 'regionString':
+//   chrom                        whole reference
+//   chrom:start                  from 'start' to the end of 'chrom'
+//   chrom:start..stop            range on a single reference
+//   chrom:start..chrom2:stop     range spanning two references
+//
+// Numbers are converted with std::atoi, so non-numeric text silently becomes 0.
+// Returns false (leaving 'region' untouched) for an empty string, an unknown
+// reference name, a start position >= the start reference length, or a stop
+// position > the stop reference length.
+template <typename ReaderType>
+bool ParseRegionStringImpl(const std::string& regionString, const ReaderType& reader,
+                           BamRegion& region)
+{
+    // -------------------------------
+    // parse region string
+
+    // check first for empty string
+    if (regionString.empty()) return false;
+
+    // chrom names & numeric positions; a stopPos of -1 means "no stop
+    // position requested" and is replaced by the reference length below
+    std::string startChrom;
+    std::string stopChrom;
+    int startPos = 0;
+    int stopPos = -1;
+
+    // non-empty string, look for a colon
+    const std::size_t firstColon = regionString.find(':');
+
+    if (firstColon == std::string::npos) {
+
+        // no colon found: the whole string is a reference name and the
+        // entire reference is requested (start 0, stop left unspecified)
+        startChrom = regionString;
+        stopChrom = regionString;
+
+    } else {
+
+        // colon found, so at least a start position was requested
+        startChrom = regionString.substr(0, firstColon);
+
+        // look for ".." after the colon
+        const std::size_t rangeDots = regionString.find("..", firstColon + 1);
+
+        if (rangeDots == std::string::npos) {
+
+            // no dots: start position only, stop left unspecified
+            startPos = std::atoi(regionString.substr(firstColon + 1).c_str());
+            stopChrom = startChrom;
+
+        } else {
+
+            // startPos sits between the first colon and the ".."
+            startPos = std::atoi(
+                regionString.substr(firstColon + 1, rangeDots - firstColon - 1).c_str());
+
+            // look for a second colon after the ".."
+            const std::size_t secondColon = regionString.find(':', rangeDots + 1);
+
+            if (secondColon == std::string::npos) {
+
+                // "standard" chrom:start..stop format (single reference)
+                stopChrom = startChrom;
+                stopPos = std::atoi(regionString.substr(rangeDots + 2).c_str());
+
+            } else {
+
+                // range requested across 2 references
+                stopChrom =
+                    regionString.substr(rangeDots + 2, secondColon - (rangeDots + 2));
+                stopPos = std::atoi(regionString.substr(secondColon + 1).c_str());
+            }
+        }
+    }
+
+    // -------------------------------
+    // validate reference IDs & genomic positions
+
+    const RefVector references = reader.GetReferenceData();
+
+    // if startRefID not found, return false
+    const int startRefID = reader.GetReferenceID(startChrom);
+    if (startRefID == -1) return false;
+
+    // startPos cannot be greater than or equal to reference length
+    const RefData& startReference = references.at(startRefID);
+    if (startPos >= startReference.RefLength) return false;
+
+    // if stopRefID not found, return false
+    const int stopRefID = reader.GetReferenceID(stopChrom);
+    if (stopRefID == -1) return false;
+
+    // stopPos cannot be larger than reference length
+    const RefData& stopReference = references.at(stopRefID);
+    if (stopPos > stopReference.RefLength) return false;
+
+    // if no stop position was specified, use the reference end
+    if (stopPos == -1) stopPos = stopReference.RefLength;
+
+    // -------------------------------
+    // set up Region struct & return
+
+    region.LeftRefID = startRefID;
+    region.LeftPosition = startPos;
+    region.RightRefID = stopRefID;
+    region.RightPosition = stopPos;
+    return true;
+}
+
+} // namespace
+
+// Parses a region string, does validation (valid ID's, positions), stores in Region struct
+// Returns success (true/false)
+//
+// A bare reference name now selects the whole reference (stop = reference
+// length), matching the BamMultiReader overload; previously this overload
+// stored a stop position of 0 for that case.
+bool Utilities::ParseRegionString(const std::string& regionString, const BamReader& reader,
+                                  BamRegion& region)
+{
+    return ParseRegionStringImpl(regionString, reader, region);
+}
+
+// Same as ParseRegionString() above, but accepts a BamMultiReader
+bool Utilities::ParseRegionString(const std::string& regionString, const BamMultiReader& reader,
+                                  BamRegion& region)
+{
+    return ParseRegionStringImpl(regionString, reader, region);
+}
+
+// Reverses the characters of 'sequence' in place.
+void Utilities::Reverse(std::string& sequence)
+{
+    std::reverse(sequence.begin(), sequence.end());
+}
+
+// Replaces 'sequence' with its reverse complement, in place.
+//
+// Each character is complemented through REVCOMP_LOOKUP (indexed by
+// character - 'A') and the string is then reversed. Characters that fall
+// outside the table (e.g. '=', '-', '*', digits, lower-case letters) or whose
+// table entry is 0 are left unchanged. Previously such characters indexed
+// outside the array or were overwritten with a NUL byte.
+void Utilities::ReverseComplement(std::string& sequence)
+{
+
+    const std::size_t tableSize = sizeof(REVCOMP_LOOKUP) / sizeof(REVCOMP_LOOKUP[0]);
+
+    // do complement, in-place
+    for (std::string::iterator it = sequence.begin(); it != sequence.end(); ++it) {
+
+        // go through unsigned char so bytes >= 0x80 cannot yield a negative index
+        const int index = static_cast<unsigned char>(*it) - 'A';
+        if (index < 0 || static_cast<std::size_t>(index) >= tableSize) continue;
+
+        const char complement = REVCOMP_LOOKUP[index];
+        if (complement != 0) *it = complement;
+    }
+
+    // reverse it
+    Reverse(sequence);
+}
+
+// Splits 'source' on the single character 'delim' by reading fields with
+// std::getline, so two adjacent delimiters produce an empty field.
+std::vector<std::string> Utilities::Split(const std::string& source, const char delim)
+{
+
+    std::vector<std::string> fields;
+    std::stringstream stream(source);
+
+    for (std::string field; std::getline(stream, field, delim);)
+        fields.push_back(field);
+
+    return fields;
+}
+
+// Splits 'source' into fields separated by ANY character found in 'delims'.
+//
+// Tokenisation follows strtok() rules, as before: leading, trailing and
+// repeated delimiters are skipped, so no empty fields are produced. With an
+// empty 'delims', a non-empty 'source' is returned as a single field.
+//
+// Implemented with std::string searches instead of strtok() on a heap copy:
+// strtok() keeps hidden static state (not reentrant), and the manually
+// allocated buffer leaked if push_back() threw.
+std::vector<std::string> Utilities::Split(const std::string& source, const std::string& delims)
+{
+
+    std::vector<std::string> fields;
+
+    // start of the first field = first character that is not a delimiter
+    std::size_t fieldBegin = source.find_first_not_of(delims);
+    while (fieldBegin != std::string::npos) {
+
+        // the field runs up to the next delimiter (or to the end of 'source';
+        // substr() clips an over-long count)
+        const std::size_t fieldEnd = source.find_first_of(delims, fieldBegin);
+        fields.push_back(source.substr(fieldBegin, fieldEnd - fieldBegin));
+
+        // skip the run of delimiters; yields npos once nothing is left
+        fieldBegin = source.find_first_not_of(delims, fieldEnd);
+    }
+
+    return fields;
+}
+
+// Reports whether the leading characters of 'source' equal 'pattern'.
+// An empty 'pattern' matches every 'source'.
+bool Utilities::StartsWith(const std::string& source, const std::string& pattern)
+{
+    // compare() clips the count to what 'source' holds, so a pattern longer
+    // than 'source' simply compares unequal
+    const int difference = source.compare(0, pattern.length(), pattern);
+    return difference == 0;
+}
+
+// Reports whether the first character of 'source' is 'c'.
+// An empty 'source' never matches.
+bool Utilities::StartsWith(const std::string& source, const char c)
+{
+    if (source.empty()) return false;
+    return source[0] == c;
+}
diff --git a/src/utils/bamtools_utilities.h b/src/utils/bamtools_utilities.h
new file mode 100644
index 0000000..c85c452
--- /dev/null
+++ b/src/utils/bamtools_utilities.h
@@ -0,0 +1,64 @@
+// ***************************************************************************
+// bamtools_utilities.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 7 October 2011
+// ---------------------------------------------------------------------------
+// Provides general utilities used by BamTools sub-tools.
+// ***************************************************************************
+
+#ifndef BAMTOOLS_UTILITIES_H
+#define BAMTOOLS_UTILITIES_H
+
+#include <api/BamAux.h>
+#include <utils/utils_global.h>
+#include <string>
+#include <vector>
+
+// BAMTOOLS_-prefixed aliases that forward to the BT_ASSERT_* macros.
+// The BT_ASSERT_* macros are not defined in this header; presumably they
+// arrive through the includes above - confirm where they are defined.
+#define BAMTOOLS_ASSERT_UNREACHABLE BT_ASSERT_UNREACHABLE
+#define BAMTOOLS_ASSERT_MESSAGE(condition, message) BT_ASSERT_X(condition, message)
+
+namespace BamTools {
+
+class BamReader;
+class BamMultiReader;
+
+// Collection of static helper functions (string tests, splitting, sequence
+// manipulation, region-string parsing) shared by the BamTools sub-tools.
+// Every member is static; the class carries no data members.
+class UTILS_EXPORT Utilities
+{
+
+public:
+ // returns true if 'source' contains 'pattern' or 'c' anywhere
+ static bool Contains(const std::string& source, const std::string& pattern);
+ static bool Contains(const std::string& source, const char c);
+
+ // returns true if 'source' ends with 'pattern' or 'c'
+ static bool EndsWith(const std::string& source, const std::string& pattern);
+ static bool EndsWith(const std::string& source, const char c);
+
+ // check if a file exists (tested by opening 'fname' for reading)
+ static bool FileExists(const std::string& fname);
+
+ // Parses a region string, uses reader to do validation (valid ID's, positions), stores in Region struct
+ // Accepted forms: "chrom", "chrom:start", "chrom:start..stop", "chrom:start..chrom2:stop"
+ // Returns success (true/false); 'region' is only written on success
+ static bool ParseRegionString(const std::string& regionString, const BamReader& reader,
+ BamRegion& region);
+ // Same as above, but accepts a BamMultiReader
+ static bool ParseRegionString(const std::string& regionString, const BamMultiReader& reader,
+ BamRegion& region);
+
+ // sequence utilities - both modify 'sequence' in place
+ // Reverse: reverses the character order
+ // ReverseComplement: complements each base via a lookup table, then reverses
+ static void Reverse(std::string& sequence);
+ static void ReverseComplement(std::string& sequence);
+
+ // split string on delimiter character (or string of allowed delimiters)
+ // - 'delim' overload: adjacent delimiters yield empty fields
+ // - 'delims' overload: any character of 'delims' separates fields, and runs
+ // of delimiters are skipped, so no empty fields are returned
+ static std::vector<std::string> Split(const std::string& source, const char delim);
+ static std::vector<std::string> Split(const std::string& source, const std::string& delims);
+
+ // returns true if 'source' starts with 'pattern' or 'c'
+ static bool StartsWith(const std::string& source, const std::string& pattern);
+ static bool StartsWith(const std::string& source, const char c);
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_UTILITIES_H
diff --git a/src/utils/bamtools_variant.h b/src/utils/bamtools_variant.h
new file mode 100644
index 0000000..50641bc
--- /dev/null
+++ b/src/utils/bamtools_variant.h
@@ -0,0 +1,146 @@
+// ***************************************************************************
+// bamtools_variant.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011
+// ---------------------------------------------------------------------------
+// Provides a template-based variant type
+// ---------------------------------------------------------------------------
+// Modified from:
+// variant_t - An Improved Variant Type Based on Member Templates
+// (c) 2000 Fernando Cacciola
+// Dr. Dobb's (http://www.ddj.com/cpp/184401293)
+//
+// * Modified to be in BamTools namespace, otherwise code is same. (DB)
+// ***************************************************************************
+
+#ifndef BAMTOOLS_VARIANT_H
+#define BAMTOOLS_VARIANT_H
+
+#include <cstddef>
+#include <stdexcept>
+#include <string>
+#include <typeinfo>
+#include "utils/utils_global.h"
+
+namespace BamTools {
+
+// Holds a single value of an arbitrary copyable type behind a
+// reference-counted holder.
+//
+// Copying a Variant shares the holder and bumps its count; the holder is
+// destroyed when the last Variant referring to it releases it. The count is
+// a plain std::size_t, so sharing one holder between threads is not
+// synchronised.
+//
+// Reading the value back requires naming the exact type it was stored with;
+// any other type (or reading an empty Variant) throws std::invalid_argument.
+class UTILS_EXPORT Variant
+{
+
+public:
+    // Creates an empty variant that holds no value.
+    Variant()
+        : data(NULL)
+    {}
+
+    // Shares the value held by 'other', if any.
+    Variant(const Variant& other)
+        : data(other.data)
+    {
+        if (data != NULL) data->AddRef();
+    }
+
+    // Drops this variant's reference; the holder deletes itself at zero.
+    ~Variant()
+    {
+        if (data != NULL) data->Release();
+    }
+
+    // NOTE: This code takes care of self-assignment.
+    // DO NOT CHANGE THE ORDER of the statements: the new holder is
+    // referenced BEFORE the old one is released, so assigning a variant to
+    // itself never lets the count reach zero in between.
+    Variant& operator=(const Variant& rhs)
+    {
+        if (rhs.data != NULL) rhs.data->AddRef();
+        if (data != NULL) data->Release();
+        data = rhs.data;
+        return *this;
+    }
+
+    // This member template constructor allows you to create a Variant
+    // from a value of any type; 'v' is copied into a new holder.
+    // It is deliberately not 'explicit', so values convert implicitly.
+    template <typename T>
+    Variant(T v)
+        : data(new Impl<T>(v))
+    {
+        data->AddRef();
+    }
+
+    // This generic conversion operator lets you retrieve the value held.
+    // To avoid template specialization conflicts, it returns an instance
+    // of type T, which will be a COPY of the value contained.
+    // Throws std::invalid_argument if T is not the stored type or the
+    // variant is empty.
+    template <typename T>
+    operator T() const
+    {
+        return CastFromBase<T>(data)->data;
+    }
+
+    // This form returns a REFERENCE and not a COPY, which will be
+    // significant in some cases. The reference stays valid only while a
+    // Variant sharing the holder is alive.
+    // Throws std::invalid_argument if T is not the stored type or the
+    // variant is empty.
+    template <typename T>
+    const T& get() const
+    {
+        return CastFromBase<T>(data)->data;
+    }
+
+    // Returns true if the variant currently holds a value stored as type T.
+    // An empty variant holds no type, so the answer is false (without the
+    // NULL check, typeid(*data) would throw std::bad_typeid).
+    template <typename T>
+    bool is_type() const
+    {
+        return data != NULL && typeid(*data) == typeid(Impl<T>);
+    }
+
+    // Same test with T deduced from an example value; the value itself is
+    // not inspected. (This used to compare the holder type Impl<X> against
+    // T itself, which could never be equal.)
+    template <typename T>
+    bool is_type(T /*v*/) const
+    {
+        return is_type<T>();
+    }
+
+private:
+    // Type-independent part of the holder: just the reference count.
+    struct ImplBase
+    {
+
+        ImplBase()
+            : refs(0)
+        {}
+        virtual ~ImplBase() {}
+
+        void AddRef()
+        {
+            ++refs;
+        }
+
+        // Deletes the holder when the last reference goes away; the object
+        // must not be touched after a Release() that reaches zero.
+        void Release()
+        {
+            --refs;
+            if (refs == 0) delete this;
+        }
+
+        std::size_t refs;
+    };
+
+    // Typed holder that owns the stored copy of the value.
+    template <typename T>
+    struct Impl : ImplBase
+    {
+        Impl(T v)
+            : data(v)
+        {}
+        ~Impl() {}
+        T data;
+    };
+
+    // The following method is static because it doesn't
+    // operate on Variant instances.
+    template <typename T>
+    static Impl<T>* CastFromBase(ImplBase* v)
+    {
+        // This downcast yields NULL if T is other than the T used with the
+        // constructor of the Variant, or if 'v' itself is NULL.
+        Impl<T>* p = dynamic_cast<Impl<T>*>(v);
+        if (p == NULL)
+            throw std::invalid_argument(typeid(T).name() + std::string(" is not a valid type"));
+        return p;
+    }
+
+    ImplBase* data;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_VARIANT_H
diff --git a/src/utils/utils_global.h b/src/utils/utils_global.h
new file mode 100644
index 0000000..6080ec4
--- /dev/null
+++ b/src/utils/utils_global.h
@@ -0,0 +1,21 @@
+// ***************************************************************************
+// utils_global.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides macros for exporting & importing BamTools-utils library symbols
+// ***************************************************************************
+
+#ifndef UTILS_GLOBAL_H
+#define UTILS_GLOBAL_H
+
+#include "shared/bamtools_global.h"
+
+// UTILS_EXPORT decorates BamTools-utils symbols: it expands to the export
+// macro when BAMTOOLS_UTILS_LIBRARY is defined and to the import macro
+// otherwise. Both target macros are not defined in this header.
+// NOTE(review): presumably BAMTOOLS_UTILS_LIBRARY is defined only while the
+// utils library itself is being built - confirm in the build files.
+#ifdef BAMTOOLS_UTILS_LIBRARY
+#define UTILS_EXPORT BAMTOOLS_LIBRARY_EXPORT
+#else
+#define UTILS_EXPORT BAMTOOLS_LIBRARY_IMPORT
+#endif
+
+#endif // UTILS_GLOBAL_H