summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.github/workflows/Build.yml8
-rw-r--r--README.md5
-rw-r--r--conf/Nanopore-ncm23-May2024.conf44
-rw-r--r--debian/changelog15
-rw-r--r--debian/control2
-rw-r--r--debian/patches/gcc-13-bis.patch37
-rw-r--r--debian/patches/gcc-13.patch77
-rw-r--r--debian/patches/series2
-rwxr-xr-xdebian/rules16
-rw-r--r--docs/CommandLineOptions.html229
-rw-r--r--docs/Configurations.html15
-rw-r--r--docs/MakeRelease.html4
-rw-r--r--docs/Mode3-0.12.0.html132
-rw-r--r--docs/Mode3Chain.pngbin0 -> 41713 bytes
-rw-r--r--docs/QuickStart.html18
-rw-r--r--docs/Running.html16
-rw-r--r--docs/Shasta-0.12.0.pdfbin0 -> 7696768 bytes
-rw-r--r--docs/SupportedPlatforms.html13
-rw-r--r--docs/index.html7
-rw-r--r--dynamicExecutable/CMakeLists.txt6
-rw-r--r--dynamicLibrary/CMakeLists.txt4
-rwxr-xr-xscripts/AlignPseudoPaths.py30
-rwxr-xr-xscripts/AnalyzeMode3Subgraph.py11
-rwxr-xr-xscripts/ComputeAlignments.py3
-rwxr-xr-xscripts/CreateConfigurationTable.py1
-rwxr-xr-xscripts/CreateMode3Detangler.py11
-rwxr-xr-xscripts/CreateMode3PathGraph.py12
-rwxr-xr-xscripts/FindAlignmentCandidatesLowHash1.py34
-rw-r--r--scripts/FlagPrimaryMarkerGraphEdges.py18
-rwxr-xr-xscripts/GenerateRandomHaplotypes.py103
-rwxr-xr-xscripts/InstallPrerequisites-Ubuntu.sh7
-rw-r--r--scripts/Mode3AssembleComponent.py39
-rw-r--r--[-rwxr-xr-x]scripts/Mode3Assembly.py44
-rwxr-xr-xscripts/RandomlySelectKmers.py16
-rwxr-xr-xscripts/ReverseTransitiveReduction.py21
-rwxr-xr-xscripts/SelectKmers2.py17
-rwxr-xr-xscripts/SelectKmers4.py17
-rwxr-xr-xscripts/SelectKmersBasedOnFrequency.py17
-rwxr-xr-xscripts/VertexCoverageStatisticsByKmerId.py23
-rwxr-xr-xscripts/WriteAlignmentDetails.py19
-rwxr-xr-xscripts/WriteMarkersFrequency.py7
-rw-r--r--scripts/testGlobalMsa.py33
-rw-r--r--src/Align4.cpp30
-rw-r--r--src/Align4.hpp25
-rw-r--r--src/Alignment.hpp4
-rw-r--r--src/Assembler.cpp69
-rw-r--r--src/Assembler.hpp492
-rw-r--r--src/AssemblerAlign.cpp103
-rw-r--r--src/AssemblerAlign1.cpp71
-rw-r--r--src/AssemblerAlign3.cpp48
-rw-r--r--src/AssemblerAlign4.cpp91
-rw-r--r--src/AssemblerAlign5.cpp737
-rw-r--r--src/AssemblerAnalyzePaths.cpp129
-rw-r--r--src/AssemblerAssemblyGraph.cpp85
-rw-r--r--src/AssemblerAssemblyGraph2.cpp1
-rw-r--r--src/AssemblerCreateReadGraphUsingPseudoPaths.cpp4
-rw-r--r--src/AssemblerDetangle.cpp1
-rw-r--r--src/AssemblerHttpServer-Alignments.cpp70
-rw-r--r--src/AssemblerHttpServer-AssemblyGraph.cpp1
-rw-r--r--src/AssemblerHttpServer-CompressedAssemblyGraph.cpp1
-rw-r--r--src/AssemblerHttpServer-MarkerGraph0.cpp (renamed from src/AssemblerHttpServer-MarkerGraph.cpp)118
-rw-r--r--src/AssemblerHttpServer-MarkerGraph1.cpp667
-rw-r--r--src/AssemblerHttpServer-Mode3.cpp996
-rw-r--r--src/AssemblerHttpServer-ReadGraph.cpp11
-rw-r--r--src/AssemblerHttpServer-Reads.cpp19
-rw-r--r--src/AssemblerHttpServer.cpp149
-rw-r--r--src/AssemblerLowHash.cpp52
-rw-r--r--src/AssemblerMarkerGraph.cpp559
-rw-r--r--src/AssemblerMarkerGraphEdges.cpp556
-rw-r--r--src/AssemblerMarkers.cpp530
-rw-r--r--src/AssemblerMode3.cpp254
-rw-r--r--src/AssemblerOptions.cpp321
-rw-r--r--src/AssemblerOptions.hpp127
-rw-r--r--src/AssemblerReadGraph.cpp6
-rw-r--r--src/AssemblerReads.cpp10
-rw-r--r--src/AssemblyGraph.cpp1
-rw-r--r--src/AssemblyGraph.hpp6
-rw-r--r--src/AssemblyGraph2.cpp55
-rw-r--r--src/AssemblyGraph2.hpp9
-rw-r--r--src/AssemblyPathGraph.cpp1
-rw-r--r--src/AssemblyPathGraph.hpp15
-rw-r--r--src/AssemblyPathGraph2.cpp1
-rw-r--r--src/AssemblyPathGraph2.hpp18
-rw-r--r--src/CompressedAssemblyGraph.cpp1
-rw-r--r--src/CompressedAssemblyGraph.hpp20
-rw-r--r--src/ConfigurationTable.cpp45
-rw-r--r--src/HashedKmerChecker.cpp118
-rw-r--r--src/HashedKmerChecker.hpp43
-rw-r--r--src/HttpServer.hpp9
-rw-r--r--src/Kmer.hpp25
-rw-r--r--src/KmerChecker.hpp23
-rw-r--r--src/KmerCheckerFactory.cpp118
-rw-r--r--src/KmerCheckerFactory.hpp39
-rw-r--r--src/KmerTable.cpp (renamed from src/AssemblerKmers.cpp)512
-rw-r--r--src/KmerTable.hpp215
-rw-r--r--src/LocalAssemblyGraph.cpp3
-rw-r--r--src/LocalAssemblyGraph.hpp16
-rw-r--r--src/LocalMarkerGraph0-Write.cpp (renamed from src/LocalMarkerGraph-Write.cpp)85
-rw-r--r--src/LocalMarkerGraph0.cpp (renamed from src/LocalMarkerGraph.cpp)80
-rw-r--r--src/LocalMarkerGraph0.hpp (renamed from src/LocalMarkerGraph.hpp)59
-rw-r--r--src/LocalMarkerGraph0RequestParameters.hpp (renamed from src/LocalMarkerGraphRequestParameters.hpp)4
-rw-r--r--src/LocalMarkerGraph1.cpp1067
-rw-r--r--src/LocalMarkerGraph1.hpp134
-rw-r--r--src/LocalReadGraph.cpp39
-rw-r--r--src/LocalReadGraph.hpp3
-rw-r--r--src/LowHash0.cpp144
-rw-r--r--src/LowHash0.hpp20
-rw-r--r--src/LowHash1.cpp685
-rw-r--r--src/LowHash1.hpp224
-rw-r--r--src/MappedMemoryOwner.hpp45
-rw-r--r--src/Marker.hpp43
-rw-r--r--src/MarkerFinder.cpp9
-rw-r--r--src/MarkerFinder.hpp5
-rw-r--r--src/MarkerGraph.cpp514
-rw-r--r--src/MarkerGraph.hpp139
-rw-r--r--src/MarkerGraphEdgePairInfo.hpp91
-rw-r--r--src/Mode3Assembler.cpp477
-rw-r--r--src/Mode3Assembler.hpp80
-rw-r--r--src/PythonModule.cpp109
-rw-r--r--src/ReadFlags.hpp8
-rw-r--r--src/Reads.cpp147
-rw-r--r--src/Reads.hpp5
-rw-r--r--src/ShortBaseSequence.cpp32
-rw-r--r--src/ShortBaseSequence.hpp31
-rw-r--r--src/approximateTopologicalSort.hpp1
-rw-r--r--src/assembleMarkerGraphPath.cpp5
-rw-r--r--src/assembleMarkerGraphPath.hpp2
-rw-r--r--src/bitReversal.hpp54
-rw-r--r--src/computeLayout.hpp5
-rw-r--r--src/copyNumber.hpp20
-rw-r--r--src/deduplicate.hpp99
-rw-r--r--src/enumeratePaths.cpp50
-rw-r--r--src/enumeratePaths.hpp146
-rw-r--r--src/findLinearChains.hpp10
-rw-r--r--src/globalMsa.cpp471
-rw-r--r--src/globalMsa.hpp62
-rw-r--r--src/html.cpp8
-rw-r--r--src/html.hpp2
-rw-r--r--src/invalid.hpp6
-rw-r--r--src/localTransitiveReduction.hpp115
-rw-r--r--src/longestPath.cpp23
-rw-r--r--src/longestPath.hpp114
-rw-r--r--src/markerAccessFunctions.cpp86
-rw-r--r--src/markerAccessFunctions.hpp53
-rw-r--r--src/mode3-AssemblyGraph.cpp8499
-rw-r--r--src/mode3-AssemblyGraph.hpp886
-rw-r--r--src/mode3-AssemblyPath.cpp1269
-rw-r--r--src/mode3-AssemblyPath.hpp206
-rw-r--r--src/mode3-Detangler.cpp415
-rw-r--r--src/mode3-Detangler.hpp153
-rw-r--r--src/mode3-JaccardGraph.cpp957
-rw-r--r--src/mode3-JaccardGraph.hpp256
-rw-r--r--src/mode3-LocalAssembly.cpp1997
-rw-r--r--src/mode3-LocalAssembly.hpp345
-rw-r--r--src/mode3-LocalAssemblyGraph.cpp1576
-rw-r--r--src/mode3-LocalAssemblyGraph.hpp186
-rw-r--r--src/mode3-PathGraph.cpp1393
-rw-r--r--src/mode3-PathGraph.hpp286
-rw-r--r--src/mode3-PhasedComponent.cpp31
-rw-r--r--src/mode3-PhasedComponent.hpp29
-rw-r--r--src/mode3-PhasingTable.cpp1258
-rw-r--r--src/mode3-PhasingTable.hpp250
-rw-r--r--src/mode3-PrimaryGraph.cpp548
-rw-r--r--src/mode3-PrimaryGraph.hpp148
-rw-r--r--src/mode3-SegmentPairInformation.hpp80
-rw-r--r--src/mode3.cpp3001
-rw-r--r--src/mode3.hpp710
-rw-r--r--src/removeReciprocalEdges.hpp34
-rw-r--r--src/seqan.hpp125
-rw-r--r--src/shastaTypes.hpp4
-rw-r--r--src/transitiveReduction.hpp156
-rw-r--r--srcMain/main.cpp226
-rw-r--r--staticExecutable/CMakeLists.txt6
-rw-r--r--staticLibrary/CMakeLists.txt2
174 files changed, 24260 insertions, 15146 deletions
diff --git a/.github/workflows/Build.yml b/.github/workflows/Build.yml
index 0506cc6..1014563 100644
--- a/.github/workflows/Build.yml
+++ b/.github/workflows/Build.yml
@@ -19,8 +19,8 @@ jobs:
tar -cvf shasta-docs.tar --transform='s/docs/shastaDocs/' docs
mkdir shasta-build
cd shasta-build
- # cmake .. -DBUILD_ID="Shasta unreleased test build newer than release 0.11.0 at commit "$GITHUB_SHA
- cmake .. -DBUILD_ID="Shasta Release 0.11.1"
+ # cmake .. -DBUILD_ID="Shasta unreleased test build newer than release 0.11.1 at commit "$GITHUB_SHA
+ cmake .. -DBUILD_ID="Shasta Release 0.12.0"
make -j 2 all
make install/strip
mv shasta-install shasta-Ubuntu-22.04
@@ -55,8 +55,8 @@ jobs:
lsb_release -a
mkdir shasta-build
cd shasta-build
- # cmake .. -DBUILD_DYNAMIC_EXECUTABLE=OFF -DBUILD_DYNAMIC_LIBRARY=OFF -DBUILD_ID="Shasta unreleased test build newer than release 0.11.0 at commit "$GITHUB_SHA
- cmake .. -DBUILD_DYNAMIC_EXECUTABLE=OFF -DBUILD_DYNAMIC_LIBRARY=OFF -DBUILD_ID="Shasta Release 0.11.1 minimal build"
+ # cmake .. -DBUILD_DYNAMIC_EXECUTABLE=OFF -DBUILD_DYNAMIC_LIBRARY=OFF -DBUILD_ID="Shasta unreleased test build newer than release 0.11.1 at commit "$GITHUB_SHA
+ cmake .. -DBUILD_DYNAMIC_EXECUTABLE=OFF -DBUILD_DYNAMIC_LIBRARY=OFF -DBUILD_ID="Shasta Release 0.12.0 minimal build"
make -j 2 all
make install/strip
mv shasta-install shasta-Ubuntu-22.04
diff --git a/README.md b/README.md
index e645f4c..3bbca68 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,11 @@
# Shasta long read assembler
De novo assembler for long reads, optimized for Oxford Nanopore (ONT) reads.
+
+🆕 [Mode 3 assembly: presentation of assembly results](https://paoloshasta.github.io/shasta/Shasta-0.12.0.pdf)
+
+🆕 [Mode 3 assembly: usage notes](https://paoloshasta.github.io/shasta/Mode3-0.12.0.html)
+
___
**Shasta development continues in this fork.**
diff --git a/conf/Nanopore-ncm23-May2024.conf b/conf/Nanopore-ncm23-May2024.conf
new file mode 100644
index 0000000..3569298
--- /dev/null
+++ b/conf/Nanopore-ncm23-May2024.conf
@@ -0,0 +1,44 @@
+# This assembly configuration is for nanopore reads generated using the
+# "Experimental extremely high-accuracy, ultra-long sequencing kit"
+# from the ONT December 2023 data release:
+# https://labs.epi2me.io/gm24385_ncm23_preview/
+
+# It uses Mode 3 assembly to create a phased assembly.
+# It was only tested for a human genome at coverage 40x to 60x,
+# but it should work at lower or higher coverage,
+# within reasonable limits, because it includes some
+# provisions for coverage adaptivity.
+
+[Reads]
+representation = 0
+minReadLength = 10000
+noCache = True
+palindromicReads.deltaThreshold = 300
+
+[Kmers]
+k = 30
+probability = 0.05
+
+[MinHash]
+minHashIterationCount = 50
+minBucketSize = 0
+maxBucketSize = 0
+minFrequency = 5
+
+[Align]
+alignMethod = 5
+sameChannelReadAlignment.suppressDeltaThreshold = 30
+minAlignedMarkerCount = 1000
+minAlignedFraction = 0.9
+maxSkip = 20
+maxDrift = 10
+maxTrim = 20
+
+[ReadGraph]
+maxAlignmentCount = 20
+strandSeparationMethod = 2
+
+[Assembly]
+mode = 3
+
+
diff --git a/debian/changelog b/debian/changelog
index a17c104..223e637 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,18 @@
+shasta (0.12.0-1) unstable; urgency=medium
+
+ * New upstream version 0.12.0
+ * gcc-13.patch: delete: applied upstream.
+ * gcc-13-bis.patch: delete: applied upstream.
+ * d/control: declare compliance to standards version 4.7.0.
+
+ -- Étienne Mollier <emollier@debian.org> Sun, 26 May 2024 18:05:55 +0200
+
+shasta (0.11.1-5) unstable; urgency=medium
+
+ * d/rules: meddle with RPATH only after dh_shlibdeps. (Closes: #1069370)
+
+ -- Étienne Mollier <emollier@debian.org> Wed, 08 May 2024 22:02:57 +0200
+
shasta (0.11.1-4) unstable; urgency=medium
* gcc-13-bis.patch: new: fix ftbfs with gcc 13.2.0. (Closes: #1059139)
diff --git a/debian/control b/debian/control
index fa214cb..e5779d8 100644
--- a/debian/control
+++ b/debian/control
@@ -24,7 +24,7 @@ Build-Depends: debhelper-compat (= 13),
libblas-dev,
liblapack-dev,
gfortran
-Standards-Version: 4.6.2
+Standards-Version: 4.7.0
Vcs-Browser: https://salsa.debian.org/med-team/shasta
Vcs-Git: https://salsa.debian.org/med-team/shasta.git
Homepage: https://github.com/chanzuckerberg/shasta
diff --git a/debian/patches/gcc-13-bis.patch b/debian/patches/gcc-13-bis.patch
deleted file mode 100644
index 30d8e9c..0000000
--- a/debian/patches/gcc-13-bis.patch
+++ /dev/null
@@ -1,37 +0,0 @@
-Description: fix a new wave of ftbfs with gcc-13.
-Author: Étienne Mollier <emollier@debian.org>
-Bug-Debian: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1059139
-Forwarded: https://github.com/paoloshasta/shasta/pull/20
-Last-Update: 2023-12-20
----
-This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- shasta.orig/src/CompactUndirectedGraph.hpp
-+++ shasta/src/CompactUndirectedGraph.hpp
-@@ -26,6 +26,7 @@
- #include <boost/graph/iteration_macros.hpp>
-
- // Standard library.
-+#include "algorithm.hpp"
- #include "array.hpp"
- #include "iostream.hpp"
- #include <limits>
---- shasta.orig/src/shortestPath.hpp
-+++ shasta/src/shortestPath.hpp
-@@ -32,6 +32,7 @@
- #include <boost/graph/iteration_macros.hpp>
-
- // Standard library.
-+#include "algorithm.hpp"
- #include "cstddef.hpp"
- #include "cstdint.hpp"
- #include <queue>
---- shasta.orig/src/mode3-PathGraph.cpp
-+++ shasta/src/mode3-PathGraph.cpp
-@@ -13,6 +13,7 @@
- #include <boost/icl/interval_set.hpp>
-
- // Standard library.
-+#include <bitset>
- #include "fstream.hpp"
- #include "iostream.hpp"
- #include <queue>
diff --git a/debian/patches/gcc-13.patch b/debian/patches/gcc-13.patch
deleted file mode 100644
index 89290b5..0000000
--- a/debian/patches/gcc-13.patch
+++ /dev/null
@@ -1,77 +0,0 @@
-Description: fix build failure with gcc 13.
-Author: Étienne Mollier <emollier@debian.org>
-Bug-Debian: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1042196
-Forwarded: https://github.com/paoloshasta/shasta/pull/15
-Last-Update: 2023-08-23
----
-This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- shasta.orig/src/Base.hpp
-+++ shasta/src/Base.hpp
-@@ -11,6 +11,7 @@
- #include "SHASTA_ASSERT.hpp"
-
- #include "array.hpp"
-+#include "cstdint.hpp"
- #include "iostream.hpp"
- #include "stdexcept.hpp"
- #include "string.hpp"
---- shasta.orig/src/PeakFinder.hpp
-+++ shasta/src/PeakFinder.hpp
-@@ -50,6 +50,7 @@
- ***********************************************************************************************************************/
-
-
-+#include "cstdint.hpp"
- #include "stdexcept.hpp"
- #include "iostream.hpp"
- #include "utility.hpp"
---- shasta.orig/src/PngImage.hpp
-+++ shasta/src/PngImage.hpp
-@@ -2,6 +2,7 @@
- #define SHASTA_PNG_IMAGE_HPP
-
- #include <png.h>
-+#include "cstdint.hpp"
- #include "string.hpp"
- #include "vector.hpp"
-
---- shasta.orig/src/dset64-gccAtomic.hpp
-+++ shasta/src/dset64-gccAtomic.hpp
-@@ -1,6 +1,7 @@
- #if !defined(__DSET64_GCC_ATOMIC_HPP)
- #define __DSET64_GCC_ATOMIC_HPP
-
-+#include <cstdint>
- #include <stdexcept>
-
- /**
---- shasta.orig/src/platformDependent.hpp
-+++ shasta/src/platformDependent.hpp
-@@ -1,6 +1,7 @@
- #ifndef SHASTA_PLATFORM_DEPENDENT_HPP
- #define SHASTA_PLATFORM_DEPENDENT_HPP
-
-+#include "cstdint.hpp"
- #include "string.hpp"
-
- namespace shasta {
---- shasta.orig/src/shortestPath.hpp
-+++ shasta/src/shortestPath.hpp
-@@ -33,6 +33,7 @@
-
- // Standard library.
- #include "cstddef.hpp"
-+#include "cstdint.hpp"
- #include <queue>
- #include "vector.hpp"
-
---- shasta.orig/src/span.hpp
-+++ shasta/src/span.hpp
-@@ -2,6 +2,7 @@
- #define SHASTA_SPAN_HPP
-
- #include "algorithm.hpp"
-+#include "cstdint.hpp"
- #include "iostream.hpp"
- #include "iterator.hpp"
- #include <span>
diff --git a/debian/patches/series b/debian/patches/series
deleted file mode 100644
index 9073569..0000000
--- a/debian/patches/series
+++ /dev/null
@@ -1,2 +0,0 @@
-gcc-13.patch
-gcc-13-bis.patch
diff --git a/debian/rules b/debian/rules
index 1dc623c..7734717 100755
--- a/debian/rules
+++ b/debian/rules
@@ -31,13 +31,6 @@ override_dh_install-arch:
dh_install -a
# Rename shastaDynamic to shasta for ease
mv debian/shasta/usr/bin/shastaDynamic debian/shasta/usr/bin/shasta
- # The library is in a more unusual place (nested within /usr/lib), so modify
- # the ELF
- chrpath -r /usr/lib/python3/dist-packages \
- debian/shasta/usr/bin/shasta
- patchelf \
- --replace-needed shasta.so $(MULTIARCH_SONAME) \
- debian/shasta/usr/bin/shasta
execute_after_dh_python3-arch:
patchelf \
@@ -57,3 +50,12 @@ override_dh_missing:
# Remove rest of files which have already been installed
rm -rf debian/tmp/${CURDIR}
dh_missing --list-missing
+
+execute_after_dh_shlibdeps:
+ # The library is in a more unusual place (nested within /usr/lib), so
+ # modify the ELF
+ chrpath -r /usr/lib/python3/dist-packages \
+ debian/shasta/usr/bin/shasta
+ patchelf \
+ --replace-needed shasta.so $(MULTIARCH_SONAME) \
+ debian/shasta/usr/bin/shasta
diff --git a/docs/CommandLineOptions.html b/docs/CommandLineOptions.html
index 29afe05..7ce8524 100644
--- a/docs/CommandLineOptions.html
+++ b/docs/CommandLineOptions.html
@@ -219,6 +219,33 @@ Implemented for Linux only (uses the <a href='http://man7.org/linux/man-pages/ma
Can help performance, but only use it if you know you will not
need to access the input files again soon.
+
+<tr id='Reads.handleDuplicates'>
+<td><code>--Reads.handleDuplicates</code><td class=centered><code>useOneCopy</code><td>
+Specifies how to handle reads with duplicate names (the name of a read is its
+id in an input fasta or fastq file).
+These can occasionally occur, typically due to glitches in the basecalling or subsequent
+pipelines before assembly starts.
+Can be one of the following:
+<ul>
+<li><code>useAllCopies</code>:
+All copies of reads with duplicate names are used in the assembly.
+This can cause artifacts in some cases.
+This was the Shasta behavior before this option was introduced.
+<li><code>useOneCopy</code>:
+For each set of reads with duplicate names, only one read is used in the assembly.
+This is the default.
+<li><code>useNone</code>:
+None of the reads with duplicate names are used in the assembly.
+<li><code>forbid</code>:
+If any reads with duplicate names are found, the assembly stops.
+</ul>
+In all cases, a message is written with the number of reads with duplicate names
+found, and the number of reads that were discarded for that reason.
+A file <code>DuplicateReads.csv</code>,
+listing details for all reads with duplicate names,
+is also written to the assembly directory.
+
<tr id='Reads.palindromicReads.skipFlagging'>
<td><code>--Reads.palindromicReads.skipFlagging</code><td class=centered><code>False</code><td>
Skip flagging palindromic reads. Oxford Nanopore reads should be flagged for better results.
@@ -263,21 +290,26 @@ Can be one of the following:
<li>1: Random selection, excluding k-mers that are globally overenriched,
as defined by their global frequency in input reads, and by
the value specified as <code>--Kmers.enrichmentThreshold</code>.
+Only supported when <code>--Kmers.k</code> is less than 16.
<li>2: Random selection, excluding k-mers that are overenriched
even in a single read,
as defined by
the value specified as <code>--Kmers.enrichmentThreshold</code>.
+Only supported when <code>--Kmers.k</code> is less than 16.
<li>3: Read from file. Use <code>--Kmers.file</code>
to specify the file.
+Only supported when <code>--Kmers.k</code> is less than 16.
<li>4: Random selection, excluding k-mers that appear
in two copies close to each other, even in a single read.
The two k-mer copies are considered close if they occur at a distance from each other less than
<code>--Kmers.distanceThreshold</code> RLE bases.
+Only supported when <code>--Kmers.k</code> is less than 16.
</ul>
<tr id='Kmers.k'>
<td><code>--Kmers.k</code><td class=centered><code>10</code><td>
Length of marker <i>k</i>-mers (in run-length representation).
+Can be up to 31 for Mode 0 assembly, 30 for Mode 2 assembly.
<a class=qm href='ComputationalMethods.html#Markers'/>
<tr id='Kmers.probability'>
@@ -306,7 +338,7 @@ Only used if <code>--Kmers.generationMethod</code> is 3.
<tr id='MinHash.version'>
<td><code>--MinHash.version</code><td class=centered><code>0</code><td>
The version of the MinHash/LowHash algorithm to be used.
-Can be 0 (default) or 1 (experimental).
+Must be 0 (default).
<tr id='MinHash.m'>
<td><code>--MinHash.m</code><td class=centered><code>4</code><td>
@@ -335,11 +367,15 @@ If <code>--MinHash.minHashIterationCount</code> is not 0, this is not used.
<tr id='MinHash.minBucketSize'>
<td><code>--MinHash.minBucketSize</code><td class=centered><code>0</code><td>
The minimum size for a bucket to be used by the MinHash/LowHash algoritm.
+If minBucketSize and maxBucketSize are both 0, they are adjusted automatically
+at each iteration using simple heuristics.
<a class=qm href='ComputationalMethods.html#FindingOverlappingReads'/>
<tr id='MinHash.maxBucketSize'>
<td><code>--MinHash.maxBucketSize</code><td class=centered><code>10</code><td>
The maximum size for a bucket to be used by the MinHash/LowHash algoritm.
+If minBucketSize and maxBucketSize are both 0, they are adjusted automatically
+at each iteration using simple heuristics.
<a class=qm href='ComputationalMethods.html#FindingOverlappingReads'/>
<tr id='MinHash.minFrequency'>
@@ -364,7 +400,7 @@ The alignment method to be used to compute marker alignments between reads:
<li>0 = Old Shasta alignment method. Use this to reproduce Shasta behavior before release 0.5.0.
<li>1 = SeqAn. This gives the best alignment results but it is slow and should only be used for testing.
<li>3 = Banded SeqAn.
-<li>4 = New Shasta alignment method (experimental).
+<li>4 and 5 = experimental.
</ul>
<a class=qm href='ComputationalMethods.html#OptimalAlignments'/>
@@ -471,6 +507,14 @@ Only used for alignment method 4 (experimental).
<td><code>--Align.align4.maxDistanceFromBoundary</code><td class=centered><code>100</code><td>
Only used for alignment method 4 (experimental).
+<tr id='Align.align5.driftRateTolerance'>
+<td><code>--Align.align5.driftRateTolerance</code><td class=centered><code>0.02</code><td>
+Maximum allowed drift rate for alignment method 5.
+
+<tr id='Align.align5.minBandExtend'>
+<td><code>--Align.align5.minBandExtend</code><td class=centered><code>10</code><td>
+Minimum band extension, in markers, for alignment method 5.
+
<tr id='ReadGraph.creationMethod'>
<td><code>--ReadGraph.creationMethod</code><td class=centered><code>0</code><td>
The method used to create the read graph (0 or 2).
@@ -667,14 +711,6 @@ with average edge coverage less than this value are removed, together with the
corresponding marker graph edges. A cross edge is defined as an edge v0->v1
with out-degree(v0)>1, in-degree(v1)>1.
-
-<tr id='MarkerGraph.reverseTransitiveReduction'>
-<td><code>--MarkerGraph.reverseTransitiveReduction</code><td class=centered><code>False</code><td>
-This is a
-<a href="#BooleanSwitches">Boolean switch</a>.
-If set, approximate reverse transitive reduction of the marker
-graph in the reverse direction is also performed.
-
<tr id='MarkerGraph.peakFinder.minAreaFraction'>
<td><code>--MarkerGraph.peakFinder.minAreaFraction</code><td class=centered><code>0.08</code><td>
Used in the automatic selection of
@@ -927,8 +963,181 @@ This is a
If set, output of the haploid representation of the assembly is suppressed. Mode 2 assembly only.
<a class=qm href='ComputationalMethods.html#Mode2Assembly'/>
+<tr id='Assembly.mode3.minPrimaryCoverage'>
+<td><code>--Assembly.mode3.minPrimaryCoverage</code><td class=centered><code>0</code><td>
+Minimum primary coverage.
+If <code>minPrimaryCoverage</code> and <code>maxPrimaryCoverage</code> are both 0,
+they are set automatically to appropriate values using a simple heuristic.
+Only used with <code>--Assembly.mode 3</code>.
+
+<tr id='Assembly.mode3.maxPrimaryCoverage'>
+<td><code>--Assembly.mode3.maxPrimaryCoverage</code><td class=centered><code>0</code><td>
+Maximum primary coverage.
+If <code>minPrimaryCoverage</code> and <code>maxPrimaryCoverage</code> are both 0,
+they are set automatically to appropriate values using a simple heuristic.
+Only used with <code>--Assembly.mode 3</code>.
+
+<tr id='Assembly.mode3.primaryGraph.maxLoss'>
+<td><code>--Assembly.mode3.primaryGraph.maxLoss</code>
+<td class=centered><code>0.1</code><td>
+Used for weak edge removal in the primary graph.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.primaryGraph.crossEdgesLowCoverageThreshold'>
+<td><code>--Assembly.mode3.primaryGraph.crossEdgesLowCoverageThreshold</code>
+<td class=centered><code>1</code><td>
+Low coverage threshold for cross edge removal in the primary graph.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.primaryGraph.crossEdgesHighCoverageThreshold'>
+<td><code>--Assembly.mode3.primaryGraph.crossEdgesHighCoverageThreshold</code>
+<td class=centered><code>3</code><td>
+High coverage threshold for cross edge removal in the primary graph.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.assemblyGraph.detangleToleranceLow'>
+<td><code>--Assembly.mode3.assemblyGraph.detangleToleranceLow</code>
+<td class=centered><code>0</code><td>
+Used for detangling of the assembly graph.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.assemblyGraph.detangleToleranceHigh'>
+<td><code>--Assembly.mode3.assemblyGraph.detangleToleranceHigh</code>
+<td class=centered><code>2</code><td>
+Used for detangling of the assembly graph.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.assemblyGraph.epsilon'>
+<td><code>--Assembly.mode3.assemblyGraph.epsilon</code>
+<td class=centered><code>0.1</code><td>
+&epsilon; value for the Bayesian model used for detangling the assembly graph.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.assemblyGraph.minLogP'>
+<td><code>--Assembly.mode3.assemblyGraph.minLogP</code>
+<td class=centered><code>20</code><td>
+<code>MinLogP</code> value (in dB) for the Bayesian model used for detangling the assembly graph.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.assemblyGraph.longBubbleThreshold'>
+<td><code>--Assembly.mode3.assemblyGraph.longBubbleThreshold</code>
+<td class=centered><code>5000</code><td>
+Long bubble threshold.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.assemblyGraph.phaseErrorThreshold'>
+<td><code>--Assembly.mode3.assemblyGraph.phaseErrorThreshold</code>
+<td class=centered><code>0.1</code><td>
+Phase error threshold for phasing.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.assemblyGraph.bubbleErrorThreshold'>
+<td><code>--Assembly.mode3.assemblyGraph.bubbleErrorThreshold</code>
+<td class=centered><code>0.03</code><td>
+Bubble error threshold for bubble cleanup.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.assemblyGraph.bubbleCleanupMaxOffset'>
+<td><code>--Assembly.mode3.assemblyGraph.bubbleCleanupMaxOffset</code>
+<td class=centered><code>1000</code><td>
+Maximum bubble offset for bubble cleanup.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.assemblyGraph.chainTerminalCommonThreshold'>
+<td><code>--Assembly.mode3.assemblyGraph.chainTerminalCommonThreshold</code>
+<td class=centered><code>3</code><td>
+Used for bubble cleanup.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.assemblyGraph.superbubbleLengthThreshold1'>
+<td><code>--Assembly.mode3.assemblyGraph.superbubbleLengthThreshold1</code>
+<td class=centered><code>30000</code><td>
+Length threshold used for superbubble cleanup.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.assemblyGraph.superbubbleLengthThreshold2'>
+<td><code>--Assembly.mode3.assemblyGraph.superbubbleLengthThreshold2</code>
+<td class=centered><code>10000</code><td>
+Low length threshold used for superbubble removal.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.assemblyGraph.superbubbleLengthThreshold3'>
+<td><code>--Assembly.mode3.assemblyGraph.superbubbleLengthThreshold3</code>
+<td class=centered><code>30000</code><td>
+High length threshold used for superbubble removal.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.assemblyGraph.superbubbleLengthThreshold4'>
+<td><code>--Assembly.mode3.assemblyGraph.superbubbleLengthThreshold4</code>
+<td class=centered><code>30000</code><td>
+Length threshold used for superbubble detangling.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.localAssembly.estimatedOffsetRatio'>
+<td><code>--Assembly.mode3.localAssembly.estimatedOffsetRatio</code>
+<td class=centered><code>1.1</code><td>
+For local assembly, the estimated offset between the left and right gets
+extended by this ratio to decide how much to extend reads that appear on one side only.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.localAssembly.vertexSamplingRate'>
+<td><code>--Assembly.mode3.localAssembly.vertexSamplingRate</code>
+<td class=centered><code>0.8</code><td>
+Vertex sampling rate for local assembly, used to set minVertexCoverage.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.localAssembly.matchScore'>
+<td><code>--Assembly.mode3.localAssembly.matchScore</code>
+<td class=centered><code>6</code><td>
+Match score for alignment computation in local assembly.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.localAssembly.mismatchScore'>
+<td><code>--Assembly.mode3.localAssembly.mismatchScore</code>
+<td class=centered><code>-1</code><td>
+Mismatch score for alignment computation in local assembly.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.localAssembly.gapScore'>
+<td><code>--Assembly.mode3.localAssembly.gapScore</code>
+<td class=centered><code>-1</code><td>
+Gap score for alignment computation in local assembly.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.localAssembly.maxSkipBases'>
+<td><code>--Assembly.mode3.localAssembly.maxSkipBases</code>
+<td class=centered><code>500</code><td>
+Number of bases (not markers) that can be skipped by an alignment in local assembly.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.localAssembly.maxDrift'>
+<td><code>--Assembly.mode3.localAssembly.maxDrift</code>
+<td class=centered><code>0.005</code><td>
+The maximum tolerated length drift of each read.
+Used to compute the band for banded alignments in local assembly.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.localAssembly.minHalfBand'>
+<td><code>--Assembly.mode3.localAssembly.minHalfBand</code>
+<td class=centered><code>100</code><td>
+Minimum half band, in markers, for alignment computations in local assembly.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.localAssembly.minScoreRatio'>
+<td><code>--Assembly.mode3.localAssembly.minScoreRatio</code>
+<td class=centered><code>0.7</code><td>
+Score threshold for discarding alignments for local assembly.
+Mode 3 assembly only.
+
+<tr id='Assembly.mode3.localAssembly.maxMsaLength'>
+<td><code>--Assembly.mode3.localAssembly.maxMsaLength</code>
+<td class=centered><code>5000</code><td>
+Maximum allowed length of a multiple sequence alignment computation for local assembly.
+Mode 3 assembly only.
+
</table>
+
<div class="goto-index"><a href="index.html">Table of contents</a></div>
</main>
diff --git a/docs/Configurations.html b/docs/Configurations.html
index 0571ab9..7b4f6f2 100644
--- a/docs/Configurations.html
+++ b/docs/Configurations.html
@@ -92,7 +92,7 @@ use Shasta command <code>listConfigurations</code> as follows:
shasta --command listConfigurations
</pre>
-At the time of writing (May 2022), this outputs the following
+At the time of writing (May 2024), this outputs the following
list of built-in configurations:
<pre>
@@ -120,6 +120,7 @@ Nanopore-R10-Fast-Nov2022
Nanopore-R10-Slow-Nov2022
Nanopore-Phased-R10-Fast-Nov2022
Nanopore-Phased-R10-Slow-Nov2022
+Nanopore-ncm23-May2024
</pre>
<p>
@@ -156,11 +157,21 @@ under the following conditions:
<td class=centered><code>Nanopore-R10-Fast-Nov2022</code>
<td class=centered><code>Nanopore-Phased-R10-Fast-Nov2022</code>
-<tr><th>R10, slow mode<th>Standard<th>Human genome with two flowcells
+<tr><th>R10, slow mode<br>(no longer in use)<th>Standard<th>Human genome with two flowcells
(about 45x)
<td class=centered><code>Nanopore-R10-Slow-Nov2022</code>
<td class=centered><code>Nanopore-Phased-R10-Slow-Nov2022</code>
+<tr>
+<th><a href='https://labs.epi2me.io/gm24385_ncm23_preview/'>
+ONT December 2023 Data release</a><br>
+(<i>"Experimental extremely high-accuracy, ultra-long
+sequencing kit"</i>)
+<th>Ultra-Long (UL)
+<th>Tested at 40x to 60x but may be functional outside this range
+<td>
+<td class=centered><code>Nanopore-ncm23-May2024</code>
+
</table>
diff --git a/docs/MakeRelease.html b/docs/MakeRelease.html
index 5432e61..1381293 100644
--- a/docs/MakeRelease.html
+++ b/docs/MakeRelease.html
@@ -38,7 +38,7 @@ Wait for that build to complete.
<li>Download the 3 artifacts. Unzip them and rename them to the following:
<ul>
<li><code>shasta-Linux-X.Y.Z</code>
-<li><code>shasta-Ubuntu-20.04-X.Y.Z.tar</code>
+<li><code>shasta-Ubuntu-22.04-X.Y.Z.tar</code>
<li><code>shasta-docs-X.Y.Z.tar</code>
</ul>
<li>Make sure <code>shasta-Linux-X.Y.Z</code> is executable.
@@ -51,7 +51,7 @@ cd shastaBuild
cmake ../shasta -DBUILD_ID="Shasta Release X.Y.Z for 64 bit ARM"
make install/strip -j
</code></li>
-<li>Download the <code>aarch64</code> Shasta binary (using <code>scp</code>) and
+<li>Download the <code>aarch64</code> Shasta binary and
rename it to <code>shasta-Linux-ARM-X.Y.Z</code>,
then make sure it is executable. </li>
</ul>
diff --git a/docs/Mode3-0.12.0.html b/docs/Mode3-0.12.0.html
new file mode 100644
index 0000000..67c4b33
--- /dev/null
+++ b/docs/Mode3-0.12.0.html
@@ -0,0 +1,132 @@
+<!DOCTYPE html>
+<html>
+
+<head>
+<link rel=stylesheet href=style.css />
+</head>
+
+<body>
+<main>
+<div class="goto-index"><a href="index.html">Table of contents</a></div>
+
+<h1>Shasta Mode 3 assembly</h1>
+<h2>Summary</h2>
+<ul>
+
+<li>Uses new computational techniques to extract phased sequence from the marker graph.
+
+<li>Preliminary version released with Shasta 0.12.0, despite known issues, to encourage experimentation.
+Please share your experiences by filing
+<a href='https://github.com/paoloshasta/shasta/issues'>issues on the Shasta GitHub repository</a>.
+
+<li>Initially only supported for the new high accuracy Oxford Nanopore reads from the
+<a href='https://labs.epi2me.io/gm24385_ncm23_preview/'>2023.12 data release</a>.
+It is possible that additional future releases will also support ONT R10 reads.
+
+<li>Despite the known issues, it produces useful phased assemblies.
+See <a href='Shasta-0.12.0.pdf'>this presentation</a> for an analysis of assembly results.
+
+<li>Released with minimal usage documentation (this page).
+A description of computational techniques is not yet available.
+
+<li>Invoke using <code>--config Nanopore-ncm23-May2024</code>.
+This assembly configuration was only tested on human genomes
+at coverage 40x to 60x, but may be functional at higher or lower coverage,
+within reasonable limits.
+It includes limited adaptivity to coverage.
+
+</ul>
+
+
+
+<h2>Output files</h2>
+
+<p>
+Shasta uses <a href='https://github.com/GFA-spec/GFA-spec'>GFA</a> terminology.
+A contiguous piece of assembled sequence is a <i>Segment</i>.
+<i>Links</i> define adjacency between segments.
+
+<table>
+<tr>
+<td><code>Assembly.gfa</code>
+<td>The assembly graph in GFA 1.0 format.
+All link records include a Cigar string defining an exact overlap of a small
+but variable number of bases between adjacent segments.
+
+<tr>
+<td><code>Assembly-NoSequence.gfa</code>
+<td>Identical to <code>Assembly.gfa</code>, but does not contain any sequence.
+Faster to download, manipulate, and visualize in
+<a href='https://github.com/asl/BandageNG'>Bandage</a>.
+
+<tr>
+<td><code>Assembly.fasta</code>
+<td>The sequences of all assembled segments, in FASTA format.
+
+<tr>
+<td><code>Assembly.csv</code>
+<td>Contains one line of information for each assembled segment.
+It can be loaded in Bandage and also provides custom coloring of segments.
+</table>
+
+
+
+<h2>Naming of assembled segments</h2>
+<p>
+Assembled segments are organized in bubble chains.
+A bubble chain is a linear sequence of bubbles of any ploidy
+without any incoming/outgoing connections to/from
+the middle of the bubble chain.
+Some of the bubbles have ploidy 1 (haploid) and usually correspond
+to low heterozygosity regions where haplotypes could not be separated.
+
+<p>
+Assembled segment names are of the form <code>a-b-c-d-Pn</code>,
+where:
+<ul>
+<li><code>a-b</code> identifies the bubble chain.
+<li><code>c</code> is the position of the bubble in the bubble chain.
+<li><code>d</code> identifies the haplotype in the bubble.
+<li><code>n</code> is the ploidy of the bubble.
+</ul>
+For example, the figure below illustrates segment naming for bubble chain
+<code>1-341</code>. Segment lengths are not to scale.
+This bubble chain consists of 7 bubbles, numbered from 0 to 6.
+Bubbles 0, 2, 4, and 6 are haploid.
+Bubbles 1, 3, and 5 are diploid.
+
+Segment <code>1-341-3-1-P2</code> is haplotype <code>1</code> of the diploid
+bubble at position <code>3</code> in bubble chain <code>1-341</code>.
+
+<img src='Mode3Chain.png'>
+
+<p>
+The assembly will contain trivial bubble chains consisting of a single haploid bubble,
+that is, a single assembled segment.
+These segments have similar naming, but <code>c</code>, <code>d</code>, and <code>n</code> are always
+<code>0</code>. For example, <code>1-136-0-0-P0</code>.
+
+<p>
+If <code>Assembly.csv</code> is loaded in Bandage, segments are displayed
+with custom colors as follows:
+<ul>
+<li>Segments of haploid bubbles of non-trivial bubble chains (names ending with <code>-P1</code>): red.
+<li>Segments of diploid bubbles of non-trivial bubble chains (names ending with <code>-P2</code>): green.
+<li>Segments of higher ploidy bubbles of non-trivial bubble chains
+(names ending with <code>-Pn</code>) with <code>n &gt; 2</code>: yellow.
+<li>Segments of trivial bubble chains consisting of a single haploid bubble
+(names ending with <code>-P0</code>):
+<ul>
+<li>If isolated (two free ends): blue.
+<li>If dangling (one free end): cyan.
+<li>All others: purple.
+</ul>
+</ul>
+
+
+<p>
+<div class="goto-index"><a href="index.html">Table of contents</a></div>
+</main>
+</body>
+</html>
+
diff --git a/docs/Mode3Chain.png b/docs/Mode3Chain.png
new file mode 100644
index 0000000..65c1282
--- /dev/null
+++ b/docs/Mode3Chain.png
Binary files differ
diff --git a/docs/QuickStart.html b/docs/QuickStart.html
index e11992f..e2eb2db 100644
--- a/docs/QuickStart.html
+++ b/docs/QuickStart.html
@@ -12,7 +12,7 @@
<h1>Quick start</h1>
Note that the Shasta executable has no dependencies and requires no installation
-or set up. This means that you can use it immediately afterdownloading it and setting its execute permission.
+or set up. This means that you can use it immediately after downloading it and setting its execute permission.
See below for more information.
@@ -21,26 +21,16 @@ See below for more information.
You can use the following commands to download the executable from the latest release and run an assembly:
<pre>
# Download the executable for the latest release.
-curl -O -L https://github.com/chanzuckerberg/shasta/releases/download/0.10.0/shasta-Linux-0.10.0
+curl -O -L https://github.com/paoloshasta/shasta/releases/download/0.11.1/shasta-Linux-0.11.1
# Grant execute permissions.
-chmod ugo+x shasta-Linux-0.10.0
+chmod ugo+x shasta-Linux-0.11.1
# Run an assembly.
-./shasta-Linux-0.10.0 --input input.fasta --config Nanopore-May2022
+./shasta-Linux-0.11.1 --input input.fasta --config Nanopore-May2022
</pre>
<p>
-<b>The above is valid for releases up to 0.10.0. Newer releases will appear in the <code>paoloshasta/shasta</code>
-repository instead, so the download command would be:</b>
-
-<pre>
-curl -O -L https://github.com/paoloshasta/shasta/releases/download/x.y.z/shasta-Linux-x.y.z
-</pre>
-
-(Replace <code>x.y.z</code> with the identifier for the release you want to use).
-
-<p>
You can specify multiple input FASTA files, if necessary.
On a typical laptop, this will run in minutes for a bacterial genome.
For a human size assembly, AWS instance type <code>x1.32xlarge</code>
diff --git a/docs/Running.html b/docs/Running.html
index 4d967bc..cbd9eef 100644
--- a/docs/Running.html
+++ b/docs/Running.html
@@ -72,6 +72,10 @@ including the output of this script would be helpful.
<h2 id=MemoryRequirements>Memory requirements</h2>
+<p><b><i>
+Note that in this section "performance" refers to assembly time only.
+</i></b>
+
<p>
For best performance, the Shasta assembler uses a single large
machine rather than a cluster of smaller machines,
@@ -113,6 +117,11 @@ a compute cost of around $20 per genome.
<h2 id=LowMemory>Running with less than optimal memory</h2>
+<p><b><i>
+Note that in this section "performance" refers to assembly time only.
+The memory options discussed here don't affect assembly results in any way.
+</i></b>
+
<p>
Shasta also supports a mode of operation
with data structures physically on disk
@@ -277,8 +286,11 @@ there is one core for every virtual processor.
<h2 id=MemoryModes>Memory modes</h2>
-<p>
-<i>(This section does not apply to macOS).</i>
+<p><b><i>
+Note that in this section "performance" refers to assembly time only.
+The memory options described here don't affect assembly results in any way.
+</i></b>
+
<p>
For performance, the Shasta executable operates in memory,
diff --git a/docs/Shasta-0.12.0.pdf b/docs/Shasta-0.12.0.pdf
new file mode 100644
index 0000000..bfe92c1
--- /dev/null
+++ b/docs/Shasta-0.12.0.pdf
Binary files differ
diff --git a/docs/SupportedPlatforms.html b/docs/SupportedPlatforms.html
index 4986182..53226b1 100644
--- a/docs/SupportedPlatforms.html
+++ b/docs/SupportedPlatforms.html
@@ -20,16 +20,7 @@ platforms:
<ul>
<li>
Most current 64-bit Linux distributions for the
-<code>x86_64</code> architecture, including the following
-on which it was actually tested:
-<ul>
-<li>Ubuntu 16.04 LTS
-<li>Ubuntu 18.04 LTS
-<li>Ubuntu 20.04 LTS
-</ul>
-
-<li>
-macOS, using the macOS specific version of the Shasta executable.
+<code>x86_64</code> architecture.
<li>
Windows, using the Linux version of the Shasta executable and
@@ -45,7 +36,7 @@ See <a href=Running.html>here</a> for more information.
<h2>Extended functionality</h2>
<p>
Extended Shasta functionality (http server, Python API)
-is only available on Ubuntu 16.04, Ubuntu 18.04 and Ubuntu 20.04 LTS.
+is only available on Ubuntu 22.04 LTS.
Porting to other Linux platforms is possible.
diff --git a/docs/index.html b/docs/index.html
index 7d89e77..bca4f8f 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -11,6 +11,12 @@
<h2>Shasta Documentation</h2>
<p>
+🆕&nbsp;<a href=Shasta-0.12.0.pdf>Mode 3 assembly: presentation of assembly results</a></li>
+
+<p>
+🆕&nbsp;<a href=Mode3-0.12.0.html>Mode 3 assembly: usage notes</a></li>
+
+<p>
If you are looking at this documentation on GitHub Pages
(<code>https://paoloshasta.github.io/shasta/</code>),
this documentation applies to the latest Shasta code on GitHub
@@ -42,6 +48,7 @@ in the <code>docs</code> directory.
<li><a href=Running.html#ScriptedApproaches>Scripting with Python</a></li>
<li><a href=Running.html#Errors>Dealing with errors</a></li>
</ul>
+<li><a href=Mode3-0.12.0.html>Mode 3 assembly: usage notes</a></li>
<li><a href=InspectingResults.html>Exploring assembly results</a></li>
<li><a href=Performance.html>Maximizing assembly performance</a></li>
<li><a href=Commands.html>Shasta commands</a></li>
diff --git a/dynamicExecutable/CMakeLists.txt b/dynamicExecutable/CMakeLists.txt
index 1afcbce..6a015c1 100644
--- a/dynamicExecutable/CMakeLists.txt
+++ b/dynamicExecutable/CMakeLists.txt
@@ -5,7 +5,7 @@ project(shastaDynamicExecutable)
add_definitions(-std=c++20)
# Compilation warnings.
-add_definitions(-Wall -Wconversion -Wno-unused-result)
+add_definitions(-Wall -Wconversion -Wno-unused-result -Wno-trigraphs -Wno-psabi)
# Optimization and debug options.
if(BUILD_DEBUG)
@@ -67,13 +67,13 @@ if(X86_64)
target_link_libraries(
shastaDynamicExecutable
shastaDynamicLibrary
- atomic boost_system boost_program_options boost_chrono spoa png z
+ atomic boost_system boost_program_options boost_chrono boost_serialization spoa png z
lapack blas gfortran quadmath pthread)
else(X86_64)
target_link_libraries(
shastaDynamicExecutable
shastaDynamicLibrary
- atomic boost_system boost_program_options boost_chrono spoa png z
+ atomic boost_system boost_program_options boost_chrono boost_serialization spoa png z
lapack blas gfortran pthread)
endif(X86_64)
diff --git a/dynamicLibrary/CMakeLists.txt b/dynamicLibrary/CMakeLists.txt
index 60d7269..e378e07 100644
--- a/dynamicLibrary/CMakeLists.txt
+++ b/dynamicLibrary/CMakeLists.txt
@@ -6,7 +6,7 @@ project(shastaDynamicLibrary)
add_definitions(-std=c++20)
# Compilation warnings.
-add_definitions(-Wall -Wconversion -Wno-unused-result -Wno-trigraphs)
+add_definitions(-Wall -Wconversion -Wno-unused-result -Wno-trigraphs -Wno-psabi)
# Optimization and debug options.
if(BUILD_DEBUG)
@@ -77,7 +77,7 @@ SET(CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} ${SHASTA_PYTHON_LIBRARIES}")
# Libraries to link with.
target_link_libraries(
shastaDynamicLibrary
- atomic png boost_program_options pthread z spoa lapack blas ${SHASTA_PYTHON_LIBRARIES})
+ atomic png boost_program_options boost_serialization pthread z spoa lapack blas ${SHASTA_PYTHON_LIBRARIES})
# Install the shared library into the bin directory.
install(TARGETS shastaDynamicLibrary DESTINATION shasta-install/bin)
diff --git a/scripts/AlignPseudoPaths.py b/scripts/AlignPseudoPaths.py
deleted file mode 100755
index 8a62712..0000000
--- a/scripts/AlignPseudoPaths.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/python3
-
-import shasta
-
-import argparse
-
-parser = argparse.ArgumentParser()
-parser.add_argument('readId0', type=int)
-parser.add_argument('strand0', type=int, choices=range(2))
-parser.add_argument('readId1', type=int)
-parser.add_argument('strand1', type=int, choices=range(2))
-arguments = parser.parse_args()
-
-
-a = shasta.Assembler()
-a.accessMarkers()
-a.accessReadGraph()
-a.accessMarkerGraphVertices()
-a.accessMarkerGraphEdges()
-a.accessAssemblyGraphVertices()
-a.accessAssemblyGraphEdges()
-a.accessAssemblyGraphEdgeLists()
-a.alignPseudoPaths(arguments.readId0, arguments.strand0, arguments.readId1, arguments.strand1)
-
-
-
-
-
-
-
diff --git a/scripts/AnalyzeMode3Subgraph.py b/scripts/AnalyzeMode3Subgraph.py
deleted file mode 100755
index 704470f..0000000
--- a/scripts/AnalyzeMode3Subgraph.py
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/python3
-
-import shasta
-
-segmentIds = [int(token) for token in input('Enter segment ids on one line: ').split()]
-
-a = shasta.Assembler()
-a.accessMode3AssemblyGraph()
-a.analyzeMode3Subgraph(segmentIds)
-
-
diff --git a/scripts/ComputeAlignments.py b/scripts/ComputeAlignments.py
index 887a195..8f83ae6 100755
--- a/scripts/ComputeAlignments.py
+++ b/scripts/ComputeAlignments.py
@@ -37,7 +37,10 @@ alignOptions.align4MinEntryCountPerCell = int(config['Align']['align4.minEntryCo
alignOptions.align4MaxDistanceFromBoundary = int(config['Align']['align4.maxDistanceFromBoundary'])
# Do the computation.
+shasta.openPerformanceLog('ComputeAlignments.log')
+a.computeMarkerKmerIds(0);
a.computeAlignments(alignOptions, 0)
+a.cleanupMarkerKmerIds();
diff --git a/scripts/CreateConfigurationTable.py b/scripts/CreateConfigurationTable.py
index a25d73c..f5eb144 100755
--- a/scripts/CreateConfigurationTable.py
+++ b/scripts/CreateConfigurationTable.py
@@ -51,6 +51,7 @@ configurations = [
'Nanopore-R10-Slow-Nov2022',
'Nanopore-Phased-R10-Fast-Nov2022',
'Nanopore-Phased-R10-Slow-Nov2022',
+ 'Nanopore-ncm23-May2024',
]
diff --git a/scripts/CreateMode3Detangler.py b/scripts/CreateMode3Detangler.py
deleted file mode 100755
index d558763..0000000
--- a/scripts/CreateMode3Detangler.py
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/python3
-
-import shasta
-
-a = shasta.Assembler()
-a.accessMarkers()
-a.accessMarkerGraphEdges()
-a.accessMode3AssemblyGraph()
-
-path = a.createMode3Detangler()
-
diff --git a/scripts/CreateMode3PathGraph.py b/scripts/CreateMode3PathGraph.py
deleted file mode 100755
index 06a6452..0000000
--- a/scripts/CreateMode3PathGraph.py
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/python3
-
-import shasta
-
-a = shasta.Assembler()
-a.accessMarkers()
-a.accessMarkerGraphEdges()
-a.accessMode3AssemblyGraph()
-
-path = a.createMode3PathGraph()
-
-
diff --git a/scripts/FindAlignmentCandidatesLowHash1.py b/scripts/FindAlignmentCandidatesLowHash1.py
deleted file mode 100755
index 87afa06..0000000
--- a/scripts/FindAlignmentCandidatesLowHash1.py
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/usr/bin/python3
-
-import shasta
-import GetConfig
-import sys
-
-helpMessage="""
-Invoke without arguments.
-"""
-
-# Check that there are no arguments.
-if not len(sys.argv)==1:
- print(helpMessage)
- exit(1)
-
-# Read the config file.
-config = GetConfig.getConfig()
-
-# Initialize the assembler and access what we need.
-a = shasta.Assembler()
-a.accessKmers()
-a.accessMarkers()
-
-# Do the computation.
-a.findAlignmentCandidatesLowHash1(
- m = int(config['MinHash']['m']),
- hashFraction = float(config['MinHash']['hashFraction']),
- minHashIterationCount = int(config['MinHash']['minHashIterationCount']),
- minBucketSize = int(config['MinHash']['minBucketSize']),
- maxBucketSize = int(config['MinHash']['maxBucketSize']),
- minFrequency = int(config['MinHash']['minFrequency']))
-# a.writeAlignmentCandidates()
-
-
diff --git a/scripts/FlagPrimaryMarkerGraphEdges.py b/scripts/FlagPrimaryMarkerGraphEdges.py
new file mode 100644
index 0000000..b9c9054
--- /dev/null
+++ b/scripts/FlagPrimaryMarkerGraphEdges.py
@@ -0,0 +1,18 @@
+#!/usr/bin/python3
+
+import shasta
+import GetConfig
+
+# Read the config file.
+config = GetConfig.getConfig()
+
+a = shasta.Assembler()
+a.accessMarkers()
+a.accessMarkerGraphVertices()
+a.accessMarkerGraphEdges(True)
+a.accessDisjointSetsHistogram()
+a.flagPrimaryMarkerGraphEdges(
+ int(config['Assembly']['mode3.minPrimaryCoverage']),
+ int(config['Assembly']['mode3.maxPrimaryCoverage']),
+ 0)
+
diff --git a/scripts/GenerateRandomHaplotypes.py b/scripts/GenerateRandomHaplotypes.py
new file mode 100755
index 0000000..a29160b
--- /dev/null
+++ b/scripts/GenerateRandomHaplotypes.py
@@ -0,0 +1,103 @@
+#!/usr/bin/python3
+
+helpMessage = """
+Generate "random" haplotypes for all bubble chains
+of a Shasta phased assembly.
+
+Each bubble chain generates two haplotypes obtained
+by concatenating UR and PR contigs in Assembly-Phased.fasta
+in the appropriate order.
+
+Because UR contigs are not phased relative to each other,
+this will generate switch errors.
+
+Run this while in the assembly directory.
+This uses as input PhasingRegions.csv and Assembly-Phased.fasta.
+It generates output files Assembly-Random-Haplotype0.fasta
+and Assembly-Random-Haplotype1.fasta
+
+This script has no dependencies other than python3
+and can be invoked directly without any installation required.
+
+"""
+
+# Import what we need.
+import argparse
+import csv
+
+# Make sure we have a --help option.
+parser = argparse.ArgumentParser(description=helpMessage)
+parser.parse_args()
+
+# Read the bubble chains file.
+csvFile = open('PhasingRegions.csv', 'r')
+reader = csv.DictReader(csvFile)
+
+bubbleChains = {}
+for row in reader:
+ bubbleChainId = int(row['Bubble chain id'])
+ if not bubbleChainId in bubbleChains:
+ bubbleChains[bubbleChainId] = []
+ bubbleChains[bubbleChainId].append(row)
+if not bubbleChains:
+ print("No bubble chains were found."
+ "Run this script from a Shasta phased assembly directory.")
+
+
+# Read the Assembly-Phased.fasta file.
+# Shasta writes each contig in a header line plus
+# a single line containing sequence.
+inputFastaFile = open('Assembly-Phased.fasta', 'r')
+inputContigs = {}
+while True:
+ header = inputFastaFile.readline()
+ if not header:
+ break;
+ if not header[0] == ">":
+ raise RuntimeError("Invalid FASTA header: " + header)
+ name = header[1:].split(" ")[0]
+ sequence = inputFastaFile.readline().rstrip("\n")
+
+    # We only want to keep it if the name begins with "UR." or "PR.".
+ if len(name) < 3:
+ continue;
+ prefix = name[0:3]
+ if not (prefix == "UR." or prefix == "PR."):
+ continue;
+
+ inputContigs[name] = sequence
+
+
+# Open the output files, one for each haplotype.
+outputFileNames = [("Assembly-Random-Haplotype%i.fasta" % haplotypeId) for haplotypeId in range(2)]
+outputFiles = [open(outputFileName, "w") for outputFileName in outputFileNames]
+
+
+# Loop over bubble chains.
+for bubbleChainId, bubbleChain in bubbleChains.items():
+ print("Working on bubble chain %i of %i" % (bubbleChainId, len(bubbleChains)))
+
+ # Check the bubble chain id.
+ for x in bubbleChain:
+ assert not x["Bubble chain id"] == bubbleChainId
+
+ # Generate the two haplotypes for this bubble chain.
+ for haplotypeId in range(2):
+ sequence = ""
+ for position in range(len(bubbleChain)):
+ row = bubbleChain[position]
+ if (row["Phased"]) == "No":
+ name = "UR.%i.%i" % (bubbleChainId, position)
+ else:
+ component = int(row["Component"])
+ name = "PR.%i.%i.%i.%i" % (bubbleChainId, position, component, haplotypeId)
+ assert name in inputContigs
+ sequence += inputContigs[name]
+ print("Bubble chain %i random haplotype %i has length %i" % (bubbleChainId, haplotypeId, len(sequence)))
+ outputFiles[haplotypeId].write(">BC.%i.%i %i\n%s\n" % (bubbleChainId, haplotypeId, len(sequence), sequence))
+
+print("Generation of random haplotypes is complete.")
+print("These haplotypes can contain a switch error at each phased region.")
+print("Output is in %s and %s" % (outputFileNames[0], outputFileNames[1]))
+
+
diff --git a/scripts/InstallPrerequisites-Ubuntu.sh b/scripts/InstallPrerequisites-Ubuntu.sh
index b50f7da..a1e59f7 100755
--- a/scripts/InstallPrerequisites-Ubuntu.sh
+++ b/scripts/InstallPrerequisites-Ubuntu.sh
@@ -99,12 +99,11 @@ tar -xvf 4.0.8.tar.gz
# To avoid these additional dependencies, we turn off the dispatcher feature for now.
# We could turn it back on if we see significant performance degradation in this area.
spoaBuildFlags="-Dspoa_generate_dispatch=ON"
-if [[ "$isArm" == true ]]; then
- spoaBuildFlags="-Dspoa_generate_dispatch=OFF -Dspoa_optimize_for_portability=OFF -Dspoa_optimize_for_native=OFF"
-fi
# Per the above comment, turn off the dispatcher feature for now.
spoaBuildFlags="-DCMAKE_BUILD_TYPE=Release -Dspoa_optimize_for_portability=ON"
-
+if [[ "$isArm" == true ]]; then
+ spoaBuildFlags="-DCMAKE_BUILD_TYPE=Release -Dspoa_build_tests=OFF"
+fi
# Build the shared library.
diff --git a/scripts/Mode3AssembleComponent.py b/scripts/Mode3AssembleComponent.py
new file mode 100644
index 0000000..6ee8336
--- /dev/null
+++ b/scripts/Mode3AssembleComponent.py
@@ -0,0 +1,39 @@
+#!/usr/bin/python3
+
+import shasta
+import argparse
+
+parser = argparse.ArgumentParser(description=
+ 'Load a mode3::AssemblyGraph representing a connected component of the primary graph and assemble it.')
+
+parser.add_argument('component', type=int,
+ help='The connected component to assemble.')
+
+parser.add_argument(
+ "--no-assemble-sequence",
+ dest="dontAssembleSequence",
+ action="store_true",
+)
+
+parser.add_argument(
+ "--debug",
+ dest="debug",
+ action="store_true",
+)
+
+arguments = parser.parse_args()
+
+
+
+options = shasta.AssemblerOptions('shasta.conf')
+a = shasta.Assembler()
+a.accessMarkers()
+a.accessMarkerGraphVertices()
+a.accessMarkerGraphEdges()
+a.accessMarkerGraphReverseComplementEdge()
+a.accessMarkerGraphConsensus()
+shasta.openPerformanceLog('Mode3AssembleComponent.log')
+fileName = 'AssemblyGraph-' + str(arguments.component) + '.data'
+a.mode3AssembleComponent(fileName, 0,
+ options.assemblyOptions.mode3Options, not arguments.dontAssembleSequence, arguments.debug)
+
diff --git a/scripts/Mode3Assembly.py b/scripts/Mode3Assembly.py
index e126b69..8b07425 100755..100644
--- a/scripts/Mode3Assembly.py
+++ b/scripts/Mode3Assembly.py
@@ -1,31 +1,45 @@
#!/usr/bin/python3
-"""
-
-This run the final portion of Mode 3 assembly.
-It assumes that the marker graph has already been created.
-
-"""
-
-import ast
import shasta
+import argparse
import GetConfig
+# Read the config file.
config = GetConfig.getConfig()
-shasta.openPerformanceLog('Mode3Assembly.log')
+# Parse the command line arguments.
+parser = argparse.ArgumentParser(description=
+ 'Run Mode 3 assembly starting from the marker graph.')
+
+parser.add_argument(
+ "--debug",
+ dest="debug",
+ action="store_true",
+)
+
+arguments = parser.parse_args()
+
+
+# Create the Assembler object and access what we need.
+options = shasta.AssemblerOptions('shasta.conf')
a = shasta.Assembler()
-a.setupConsensusCaller(config['Assembly']['consensusCaller'])
a.accessMarkers()
a.accessMarkerGraphVertices()
-a.accessMarkerGraphReverseComplementVertex()
-a.accessMarkerGraphEdges()
+a.accessMarkerGraphEdges(True)
a.accessMarkerGraphReverseComplementEdge()
a.accessMarkerGraphConsensus()
+a.accessDisjointSetsHistogram()
-a.mode3Assembly()
-
-
+# Open a performance log.
+shasta.openPerformanceLog('Mode3Assembly.log')
+# Flag primary marker graph edges.
+a.flagPrimaryMarkerGraphEdges(
+ int(config['Assembly']['mode3.minPrimaryCoverage']),
+ int(config['Assembly']['mode3.maxPrimaryCoverage']),
+ 0)
+# Run Mode 3 assembly.
+a.mode3Assembly(0, options.assemblyOptions.mode3Options, arguments.debug)
+
diff --git a/scripts/RandomlySelectKmers.py b/scripts/RandomlySelectKmers.py
deleted file mode 100755
index c788cd3..0000000
--- a/scripts/RandomlySelectKmers.py
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/python3
-
-import shasta
-import GetConfig
-
-# Read the config file.
-config = GetConfig.getConfig()
-
-# Initialize the assembler and access what we need.
-a = shasta.Assembler()
-
-# Generate the k-mers and write them out.
-a.randomlySelectKmers(
- k = int(config['Kmers']['k']),
- probability = float(config['Kmers']['probability']))
-a.writeKmers()
diff --git a/scripts/ReverseTransitiveReduction.py b/scripts/ReverseTransitiveReduction.py
deleted file mode 100755
index 4fad068..0000000
--- a/scripts/ReverseTransitiveReduction.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/python3
-
-import shasta
-import GetConfig
-
-# Read the config file.
-config = GetConfig.getConfig()
-
-
-# Initialize the assembler and access what we need.
-a = shasta.Assembler()
-a.accessMarkerGraphVertices()
-a.accessMarkerGraphEdges(accessEdgesReadWrite=True)
-a.accessMarkerGraphReverseComplementEdge()
-a.reverseTransitiveReduction(
- lowCoverageThreshold = int(config['MarkerGraph']['lowCoverageThreshold']),
- highCoverageThreshold = int(config['MarkerGraph']['highCoverageThreshold']),
- maxDistance = int(config['MarkerGraph']['maxDistance'])
- )
-
-
diff --git a/scripts/SelectKmers2.py b/scripts/SelectKmers2.py
deleted file mode 100755
index 08a43e3..0000000
--- a/scripts/SelectKmers2.py
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/python3
-
-import shasta
-import GetConfig
-
-# Read the config file.
-config = GetConfig.getConfig()
-
-# Initialize the assembler and access what we need.
-a = shasta.Assembler()
-
-# Generate the k-mers and write them out.
-a.selectKmers2(
- k = int(config['Kmers']['k']),
- markerDensity = float(config['Kmers']['probability']),
- enrichmentThreshold = float(config['Kmers']['enrichmentThreshold']))
-
diff --git a/scripts/SelectKmers4.py b/scripts/SelectKmers4.py
deleted file mode 100755
index 817095c..0000000
--- a/scripts/SelectKmers4.py
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/python3
-
-import shasta
-
-# Read the config file.
-import GetConfig
-config = GetConfig.getConfig()
-
-# Create the assembler.
-a = shasta.Assembler()
-
-# select k-mers.
-a.selectKmers4(
- k = int(config['Kmers']['k']),
- markerDensity = float(config['Kmers']['probability']),
- distanceThreshold = int(config['Kmers']['distanceThreshold']))
-
diff --git a/scripts/SelectKmersBasedOnFrequency.py b/scripts/SelectKmersBasedOnFrequency.py
deleted file mode 100755
index 70e152c..0000000
--- a/scripts/SelectKmersBasedOnFrequency.py
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/python3
-
-import shasta
-import GetConfig
-
-# Read the config file.
-config = GetConfig.getConfig()
-
-# Initialize the assembler and access what we need.
-a = shasta.Assembler()
-
-# Generate the k-mers and write them out.
-a.selectKmersBasedOnFrequency(
- k = int(config['Kmers']['k']),
- markerDensity = float(config['Kmers']['probability']),
- enrichmentThreshold = float(config['Kmers']['enrichmentThreshold']))
-
diff --git a/scripts/VertexCoverageStatisticsByKmerId.py b/scripts/VertexCoverageStatisticsByKmerId.py
deleted file mode 100755
index 70e04f2..0000000
--- a/scripts/VertexCoverageStatisticsByKmerId.py
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/usr/bin/python3
-
-import shasta
-import GetConfig
-
-config = GetConfig.getConfig()
-
-# To get meaningful results from this, use
-# the following options when running the assembly,
-# to make sure all vertices are generated:
-# --MarkerGraph.allowDuplicateMarkers
-# --MarkerGraph.minCoverage 1
-# --MarkerGraph.minCoverage 1000000000
-
-
-a = shasta.Assembler()
-a.accessKmers()
-a.accessMarkers()
-a.accessMarkerGraphVertices()
-
-a.vertexCoverageStatisticsByKmerId()
-
-
diff --git a/scripts/WriteAlignmentDetails.py b/scripts/WriteAlignmentDetails.py
deleted file mode 100755
index 435a55b..0000000
--- a/scripts/WriteAlignmentDetails.py
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/usr/bin/python3
-
-import shasta
-import argparse
-
-
-parser = argparse.ArgumentParser(description=
- 'Write CSVs with details for each alignment')
-
-arguments = parser.parse_args()
-
-a = shasta.Assembler()
-a.accessMarkers()
-a.accessAlignmentCandidates()
-a.accessCompressedAlignments()
-a.accessAlignmentData()
-a.writeAlignmentDetails()
-
-
diff --git a/scripts/WriteMarkersFrequency.py b/scripts/WriteMarkersFrequency.py
deleted file mode 100755
index 8a2ff26..0000000
--- a/scripts/WriteMarkersFrequency.py
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/usr/bin/python3
-
-import shasta
-
-a = shasta.Assembler()
-a.accessMarkers()
-a.writeMarkerFrequency()
diff --git a/scripts/testGlobalMsa.py b/scripts/testGlobalMsa.py
new file mode 100644
index 0000000..6504a39
--- /dev/null
+++ b/scripts/testGlobalMsa.py
@@ -0,0 +1,33 @@
+#!/usr/bin/python3
+
+import shasta
+
+sequences = [
+ ("AGGTCCGACAGCGCGCCCAGATCCAGCCACGCCACGGTCCGCCTCTCCCGCCGCCCTGGCCTGTCCTTAGCCCCAGGC", 1),
+ ("AGGTCCGACAGCGCGCCCATACTCCAGCCACGCCACCGGTCCGCCTCTCCCGCCGCCCTGGCCTGTCCTTAGCCCCAGGC", 1),
+ ("AGGTCCGACAGCGCGCCCAGATCCAGCCACGCCACCGGTCCGCCTCTCCCGCCCTGGCCTGTCCTTAGCCCCAGGC", 1),
+ ("AGGTCCGACAGCGCGCCCAGATCCAGCCACGCCACCGGTCCGCCTCTCCCGCCGCCCTGGCCTGTCCTTAGCCCCAGGC", 5),
+ ("AGGTCCGACAGCGCCCAGATCCAGCCACGCCACCGGTCCGCCTCTCCCGCCGCCCTGGCCTGTCCTTAGCCCCAGGC", 1),
+ ("AGGTCCGACAGCGCGCCAGATCCAGCCACGCCACCGGTCCGCCTCTCCCGCCGCCCTGGCCTGTCCTTAGCCCCAGGC", 1),
+ ("AGGTCCGACAGCGCGCCCAGATCCAGCCACGCCACCGGTCCGCCTCTTCCCGCCGCCCTGGCCTGTCCTTAGCCCCAGGC", 2),
+ ("AGGTCCGACAGCGCGCCCAGATCCAGCCACGCCACCGGTCCGCCGCTCGCCCGCCGCCCTGGCCTGTCCTTAGCCCCAGGC", 1),
+ ]
+
+expectedConsensus = "AGGTCCGACAGCGCGCCCAGATCCAGCCACGCCACCGGTCCGCCTCTCCCGCCGCCCTGGCCTGTCCTTAGCCCCAGGC"
+
+
+consensus = shasta.globalMsaPython(sequences, 30, 14)
+print(consensus)
+
+pureSpoaConsensus = shasta.globalMsaPython(sequences, 1000000000, 14)
+
+if consensus == expectedConsensus:
+ print("Consensus agrees with expected consensus.")
+else:
+ print("Consensus DOES NOT AGREE with expected consensus.")
+
+if consensus == pureSpoaConsensus:
+ print("Consensus agrees with pure spoa consensus.")
+else:
+ print("Consensus DOES NOT AGREE with pure spoa consensus.")
+
diff --git a/src/Align4.cpp b/src/Align4.cpp
index 48339d6..b16b6c7 100644
--- a/src/Align4.cpp
+++ b/src/Align4.cpp
@@ -28,15 +28,15 @@ using namespace Align4;
void shasta::Align4::align(
- const array<CompressedMarkers, 2>& compressedMarkers,
- const array<span< const pair<KmerId, uint32_t> >, 2> sortedMarkers,
+ const array< span<KmerId>, 2>& kmerIds,
+ const array<span< pair<KmerId, uint32_t> >, 2> sortedMarkers,
const Options& options,
MemoryMapped::ByteAllocator& byteAllocator,
Alignment& alignment,
AlignmentInfo& alignmentInfo,
bool debug)
{
- Align4::Aligner graph(compressedMarkers, sortedMarkers,
+ Align4::Aligner graph(kmerIds, sortedMarkers,
options, byteAllocator, alignment, alignmentInfo,
debug);
}
@@ -44,15 +44,15 @@ void shasta::Align4::align(
Aligner::Aligner(
- const array<CompressedMarkers, 2>& compressedMarkers,
- const array<span< const pair<KmerId, uint32_t> >, 2> sortedMarkers,
+ const array<span<KmerId>, 2>& kmerIds,
+ const array<span< pair<KmerId, uint32_t> >, 2> sortedMarkers,
const Options& options,
MemoryMapped::ByteAllocator& byteAllocator,
Alignment& alignment,
AlignmentInfo& alignmentInfo,
bool debug) :
- nx(uint32_t(compressedMarkers[0].size())),
- ny(uint32_t(compressedMarkers[1].size())),
+ nx(uint32_t(kmerIds[0].size())),
+ ny(uint32_t(kmerIds[1].size())),
deltaX(int32_t(options.deltaX)),
deltaY(int32_t(options.deltaY)),
byteAllocator(byteAllocator)
@@ -109,7 +109,7 @@ Aligner::Aligner(
}
vector< pair<Alignment, AlignmentInfo> > alignments;
computeBandedAlignments(
- compressedMarkers,
+ kmerIds,
options.minAlignedMarkerCount,
options.minAlignedFraction,
options.maxSkip,
@@ -192,7 +192,7 @@ SignedCoordinates Aligner::getxy(Coordinates XY) const
-void Aligner::createAlignmentMatrix(const array<span< const pair<KmerId, uint32_t> >, 2> sortedMarkers)
+void Aligner::createAlignmentMatrix(const array<span< pair<KmerId, uint32_t> >, 2> sortedMarkers)
{
alignmentMatrix.clear();
@@ -873,7 +873,7 @@ void Aligner::findActiveCellsConnectedComponents()
// active cells. Return the ones that match requirements on
// minAlignedMarkerCount, minAlignedFraction, maxSkip, maxDrift, maxTrim.
void Aligner::computeBandedAlignments(
- const array<CompressedMarkers, 2>& compressedMarkers,
+ const array<span<KmerId>, 2>& kmerIds,
uint64_t minAlignedMarkerCount,
double minAlignedFraction,
uint64_t maxSkip,
@@ -936,7 +936,7 @@ void Aligner::computeBandedAlignments(
// Compute an alignment with this band.
Alignment alignment;
AlignmentInfo alignmentInfo;
- computeBandedAlignment(compressedMarkers, bandMin, bandMax,
+ computeBandedAlignment(kmerIds, bandMin, bandMax,
alignment, alignmentInfo, debug);
// Skip it, if it does not satisfy the requirements on
@@ -991,7 +991,7 @@ void Aligner::computeBandedAlignments(
// Compute a banded alignment with a given band.
bool Aligner::computeBandedAlignment(
- const array<CompressedMarkers, 2>& compressedMarkers,
+ const array<span<KmerId>, 2>& kmerIds,
int32_t bandMin,
int32_t bandMax,
Alignment& alignment,
@@ -1014,8 +1014,8 @@ bool Aligner::computeBandedAlignment(
// Add 100 to kMerIds to prevent collision from the seqan gap value.
array<TSequence, 2> sequences;
for(uint64_t i=0; i<2; i++) {
- for(const CompressedMarker& marker: compressedMarkers[i]) {
- appendValue(sequences[i], marker.kmerId + 100);
+ for(const KmerId& kmerId: kmerIds[i]) {
+ appendValue(sequences[i], kmerId + 100);
}
}
@@ -1056,7 +1056,7 @@ bool Aligner::computeBandedAlignment(
i<alignmentLength and ordinal0<nx and ordinal1<ny; i++) {
if( align[i] != seqanGapValue and
align[i + alignmentLength] != seqanGapValue and
- compressedMarkers[0][ordinal0].kmerId == compressedMarkers[1][ordinal1].kmerId) {
+ kmerIds[0][ordinal0] == kmerIds[1][ordinal1]) {
alignment.ordinals.push_back(array<uint32_t, 2>{ordinal0, ordinal1});
}
if(align[i] != seqanGapValue) {
diff --git a/src/Align4.hpp b/src/Align4.hpp
index 892385a..1ba94dc 100644
--- a/src/Align4.hpp
+++ b/src/Align4.hpp
@@ -80,14 +80,12 @@ namespace shasta {
// we can end up with negative values.
using SignedCoordinates = pair<uint32_t, uint32_t>;
- // The markers of an oriented read.
- using CompressedMarkers = span<const CompressedMarker>;
-
- // Compute the alginment.
- // The sorted markers are pairs(KmerId, ordinal) sorted by KmnerId.
+ // Compute the alignment.
+ // The KmerIds are the KmerIds for the two reads, in position order.
+ // The sorted markers are pairs(KmerId, ordinal) sorted by KmerId.
void align(
- const array<CompressedMarkers, 2>&,
- const array<span< const pair<KmerId, uint32_t> >, 2> sortedMarkers,
+ const array< span<KmerId>, 2>& kmerIds,
+ const array<span<pair<KmerId, uint32_t> >, 2> sortedMarkers,
const Align4::Options&,
MemoryMapped::ByteAllocator&,
Alignment&,
@@ -135,10 +133,11 @@ class shasta::Align4::Aligner {
public:
// The constructor does all the work.
- // The sorted markers are pairs(KmerId, ordinal) sorted by KmnerId.
+ // The kmerIds are in position order.
+ // The sorted markers are pairs(KmerId, ordinal) sorted by KmerId.
Aligner(
- const array<CompressedMarkers, 2>& compressedMarkers,
- const array<span< const pair<KmerId, uint32_t> >, 2> sortedMarkers,
+ const array< span<KmerId>, 2>& kmerIds,
+ const array<span< pair<KmerId, uint32_t> >, 2> sortedMarkers,
const Options&,
MemoryMapped::ByteAllocator&,
Alignment&,
@@ -174,7 +173,7 @@ private:
using AlignmentMatrixEntryVector = vector<AlignmentMatrixEntry, AlignmentMatrixAllocator>; // For one iY
using AlignmentMatrix = vector<AlignmentMatrixEntryVector>; // Indexed by iY.
AlignmentMatrix alignmentMatrix;
- void createAlignmentMatrix(const array<span< const pair<KmerId, uint32_t> >, 2> sortedMarkers);
+ void createAlignmentMatrix(const array<span< pair<KmerId, uint32_t> >, 2> sortedMarkers);
void writeAlignmentMatrixCsv(const string& fileName) const;
void writeAlignmentMatrixPng(
const string& fileName,
@@ -280,7 +279,7 @@ private:
// active cells. Return the ones that match requirements on
// minAlignedMarkerCount, minAlignedFraction, maxSkip, maxDrift, maxTrim.
void computeBandedAlignments(
- const array<CompressedMarkers, 2>& compressedMarkers,
+ const array<span<KmerId>, 2>& kmerIds,
uint64_t minAlignedMarkerCount,
double minAlignedFraction,
uint64_t maxSkip,
@@ -292,7 +291,7 @@ private:
// Compute a banded alignment with a given band.
bool computeBandedAlignment(
- const array<CompressedMarkers, 2>& compressedMarkers,
+ const array<span<KmerId>, 2>& kmerIds,
int32_t bandMin,
int32_t bandMax,
Alignment&,
diff --git a/src/Alignment.hpp b/src/Alignment.hpp
index 6915ae6..13d72f5 100644
--- a/src/Alignment.hpp
+++ b/src/Alignment.hpp
@@ -193,6 +193,10 @@ public:
// Flag that is set if this alignment is used in the read graph.
uint8_t isInReadGraph : 1;
+ // Uniqueness metric (alignment method 5 only).
+ // See Assembler::alignOrientedReads5.
+ float uniquenessMetric = std::numeric_limits<float>::signaling_NaN();
+
void clearFlags()
{
isInReadGraph = 0;
diff --git a/src/Assembler.cpp b/src/Assembler.cpp
index 56f2b27..073e03e 100644
--- a/src/Assembler.cpp
+++ b/src/Assembler.cpp
@@ -1,7 +1,10 @@
#include "Assembler.hpp"
+#include "AssemblerOptions.hpp"
#include "buildId.hpp"
#include "Coverage.hpp"
+#include "KmerCheckerFactory.hpp"
#include "MedianConsensusCaller.hpp"
+#include "MurmurHash2.hpp"
#include "Reads.hpp"
#include "SimpleConsensusCaller.hpp"
#include "SimpleBayesianConsensusCaller.hpp"
@@ -13,14 +16,13 @@ template class MultithreadedObject<Assembler>;
// Constructor to be called one to create a new run.
Assembler::Assembler(
- const string& largeDataFileNamePrefix,
+ const string& largeDataFileNamePrefixArgument,
bool createNew,
uint64_t readRepresentation, // 0 = raw sequence, 1 = RLE sequence. Only used if createNew.
size_t largeDataPageSizeArgument) :
-
- MultithreadedObject(*this),
- largeDataFileNamePrefix(largeDataFileNamePrefix)
+ MultithreadedObject(*this)
{
+ largeDataFileNamePrefix = largeDataFileNamePrefixArgument;
if(createNew) {
@@ -171,3 +173,62 @@ void Assembler::storePeakMemoryUsage(uint64_t peakMemoryUsage) {
assemblerInfo->peakMemoryUsage = peakMemoryUsage;
}
+
+
+void Assembler::createKmerChecker(
+ const KmersOptions& kmersOptions,
+ uint64_t threadCount)
+{
+ if(threadCount == 0) {
+ threadCount = std::thread::hardware_concurrency();
+ }
+
+ assemblerInfo->k = kmersOptions.k;
+ assemblerInfo->kmerGenerationMethod = kmersOptions.generationMethod;
+
+ kmerChecker = KmerCheckerFactory::createNew(
+ kmersOptions,
+ threadCount,
+ getReads(),
+ *this);
+}
+
+
+
+void Assembler::accessKmerChecker()
+{
+ kmerChecker = KmerCheckerFactory::createFromBinaryData(
+ assemblerInfo->k,
+ assemblerInfo->kmerGenerationMethod,
+ getReads(),
+ *this);
+}
+
+
+
+// Hash a KmerId in such a way that it has the same hash as its reverse
+// complement. This is used by alignment method 3 to downsample markers.
+uint32_t Assembler::hashKmerId(KmerId kmerId) const
+{
+ const uint64_t k = assemblerInfo->k;
+
+ // Construct the k-mer and its reverse complement.
+ const Kmer kmer(kmerId, k);
+ const Kmer kmerRc = kmer.reverseComplement(k);
+
+ // Compute the id of the reverse complement k-mer.
+ const KmerId kmerIdRc = KmerId(kmerRc.id(k));
+
+ // Hash the sum of the two KmerIds.
+ // This guarantees that we return the same hash
+ // for a k-mer and its reverse complement.
+ const uint64_t sum = kmerId + kmerIdRc;
+
+ return MurmurHash2(&sum, sizeof(sum), 13477);
+}
+
+
+
+
+
+
diff --git a/src/Assembler.hpp b/src/Assembler.hpp
index d6e69e8..d160025 100644
--- a/src/Assembler.hpp
+++ b/src/Assembler.hpp
@@ -6,9 +6,12 @@
#include "AlignmentCandidates.hpp"
#include "AssemblyGraph2Statistics.hpp"
#include "HttpServer.hpp"
+#include "invalid.hpp"
#include "Kmer.hpp"
+#include "MappedMemoryOwner.hpp"
#include "Marker.hpp"
#include "MarkerGraph.hpp"
+#include "MarkerGraphEdgePairInfo.hpp"
#include "MemoryMappedObject.hpp"
#include "MultithreadedObject.hpp"
#include "ReadGraph.hpp"
@@ -25,7 +28,6 @@ namespace shasta {
class Assembler;
class AssemblerInfo;
- class AssemblyGraph;
class Alignment;
class AlignmentData;
class AlignmentGraph;
@@ -38,21 +40,28 @@ namespace shasta {
class ConsensusCaller;
class Histogram2;
class InducedAlignment;
+ class KmerChecker;
+ class KmersOptions;
class LocalAssemblyGraph;
class LocalAlignmentCandidateGraph;
class LocalAlignmentGraph;
- class LocalMarkerGraph;
+ class LocalMarkerGraph0;
class LocalReadGraph;
class LocalReadGraphTriangles;
- class LocalMarkerGraphRequestParameters;
+ class LocalMarkerGraph0RequestParameters;
class LongBaseSequences;
class MarkerConnectivityGraph;
class MarkerConnectivityGraphVertexMap;
class Mode2AssemblyOptions;
+ class Mode3AssemblyOptions;
+ class Mode3Assembler;
class OrientedReadPair;
class Reads;
class ReferenceOverlapMap;
+ namespace mode0 {
+ class AssemblyGraph;
+ }
namespace MemoryMapped {
class ByteAllocator;
@@ -70,10 +79,6 @@ namespace shasta {
class Options;
}
- namespace mode3 {
- class AssemblyGraph;
- }
-
extern template class MultithreadedObject<Assembler>;
}
@@ -102,6 +107,9 @@ public:
// The length of k-mers used to define markers.
size_t k;
+ // The method used to generate kmers (--Kmers.generationMethod).
+ uint64_t kmerGenerationMethod;
+
// The page size in use for this run.
size_t largeDataPageSize;
@@ -187,8 +195,9 @@ public:
class shasta::Assembler :
- public MultithreadedObject<Assembler>
- , public HttpServer {
+ public MultithreadedObject<Assembler>,
+ public MappedMemoryOwner,
+ public HttpServer {
public:
@@ -230,8 +239,6 @@ public:
void findMarkers(size_t threadCount);
void accessMarkers();
void writeMarkers(ReadId, Strand, const string& fileName);
- vector<KmerId> getMarkers(ReadId, Strand);
- void writeMarkerFrequency();
// Write the reads that overlap a given read.
void writeOverlappingReads(ReadId, Strand, const string& fileName);
@@ -270,7 +277,6 @@ public:
);
void accessAlignmentData();
void accessAlignmentDataReadWrite();
- void writeAlignmentDetails() const;
// Loop over all alignments in the read graph
@@ -322,9 +328,6 @@ public:
vector< tuple<ReadId, Strand, uint32_t> >
getGlobalMarkerGraphVertexMarkers(MarkerGraph::VertexId) const;
- // Compute marker graph vertex coverage statistics by KmerId.
- void vertexCoverageStatisticsByKmerId() const;
-
// Approximate transitive reduction of the marker graph.
@@ -358,45 +361,6 @@ public:
- // Approximate reverse transitive reduction of the marker graph.
- // The goal is to remove local back-edges.
- // This works similarly to transitive reduction,
- // but in the opposite direction.
- // This does the following:
- // - Edges with coverage greater than lowCoverageThreshold
- // and less then highCoverageThreshold are processed in
- // ordered of increasing coverage:
- // * For each such edge A->B, we look for a path of length
- // at most maxDistance starting at B and ending at A
- // that does not use edge A->B and also does not use any
- // edges already marked wasRemovedByTransitiveReduction.
- // * If such a path is found, the edge is marked
- // wasRemovedByTransitiveReduction.
- void reverseTransitiveReduction(
- size_t lowCoverageThreshold,
- size_t highCoverageThreshold,
- size_t maxDistance);
-
-
-
-private:
-
- // Data filled in by the constructor.
- string largeDataFileNamePrefix;
- size_t largeDataPageSize;
-
- // Function to construct names for binary objects.
- string largeDataName(const string& name) const
- {
- if(largeDataFileNamePrefix.empty()) {
- return ""; // Anonymous;
- } else {
- return largeDataFileNamePrefix + name;
- }
- }
-
-
-
// Various pieces of assembler information stored in shared memory.
// See class AssemblerInfo for more information.
public:
@@ -425,160 +389,44 @@ public:
void computeReadIdsSortedByName();
+ // Find duplicate reads, as determined by name (not sequence).
+ // This also sets the isDuplicate and discardDueToDuplicates read flags
+ // and summarizes what it found in Duplicates.csv.
+ void findDuplicateReads(const string& handleDuplicates);
-private:
-
-
-
- // Table of all k-mers of length k.
- // Among all 4^k k-mers of length k, we choose a subset
- // that we call "markers".
- // The value of k used is stored in assemblerInfo.
- // The k-mer table is a vector of 4^k pairs,
- // indexed by k-mer id as computed using Kmer::id(k).
- // The markers are selected at the beginning of an assembly
- // and never changed, and selected in such a way that,
- // if (and only if) a k-mer is a marker, its reverse complement
- // is also a marker. That is, for all permitted values of i, 0 <= i < 4^k:
- // kmerTable[i].isMarker == kmerTable[kmerTable[i].reverseComplementKmerId].isMarker
- MemoryMapped::Vector<KmerInfo> kmerTable;
- void checkKmersAreOpen() const;
-
-public:
- void accessKmers();
- void writeKmers(const string& fileName) const;
-
- // Select marker k-mers randomly.
- void randomlySelectKmers(
- size_t k, // k-mer length.
- double probability, // The probability that a k-mer is selected as a marker.
- int seed // For random number generator.
- );
-
-
-
- // Select marker k-mers randomly, but excluding
- // the ones that have high frequency in the reads.
- void selectKmersBasedOnFrequency(
-
- // k-mer length.
- size_t k,
-
- // The desired marker density
- double markerDensity,
-
- // Seed for random number generator.
- int seed,
-
- // Exclude k-mers enriched by more than this amount.
- // Enrichment is the ratio of k-mer frequency in reads
- // over what a random distribution would give.
- double enrichmentThreshold,
-
- size_t threadCount
- );
-
-
-
- // In this version, marker k-mers are selected randomly, but excluding
- // any k-mer that is over-enriched even in a single oriented read.
- void selectKmers2(
-
- // k-mer length.
- size_t k,
- // The desired marker density
- double markerDensity,
-
- // Seed for random number generator.
- int seed,
-
- // Exclude k-mers enriched by more than this amount,
- // even in a single oriented read.
- // Enrichment is the ratio of k-mer frequency in reads
- // over what a random distribution would give.
- double enrichmentThreshold,
-
- size_t threadCount
- );
private:
- class SelectKmers2Data {
- public:
-
- double enrichmentThreshold;
-
- // The number of times each k-mer appears in an oriented read.
- // Indexed by KmerId.
- MemoryMapped::Vector<uint64_t> globalFrequency;
-
- // The number of oriented reads that each k-mer is
- // over-enriched in by more than a factor enrichmentThreshold.
- // Indexed by KmerId.
- MemoryMapped::Vector<ReadId> overenrichedReadCount;
-
- };
- SelectKmers2Data selectKmers2Data;
- void selectKmers2ThreadFunction(size_t threadId);
-
-
-
- // In this version, marker k-mers are selected randomly, but excluding
- // k-mers that appear repeated at short distances in any oriented read.
- // More precisely, for each k-mer we compute the minimum distance
- // (in RLE bases) at which any two copies of that k-mer appear in any oriented read.
- // K-mers for which this minimum distance is less than distanceThreshold
- // are not used as markers. Marker k-mers are selected randomly among the
- // remaining k-mers, until the desired marker density is achieved.
-public:
- void selectKmers4(
-
- // k-mer length.
- uint64_t k,
- // The desired marker density
- double markerDensity,
- // Seed for random number generator.
- uint64_t seed,
- // Exclude k-mers that appear in any read in two copies,
- // with the two copies closer than this distance (in RLE bases).
- uint64_t distanceThreshold,
-
- size_t threadCount
- );
-private:
- void selectKmers4ThreadFunction(size_t threadId);
- class SelectKmers4Data {
+ // The KmerChecker can find out if a given KmerId is a marker.
+ shared_ptr<KmerChecker> kmerChecker;
public:
+ void createKmerChecker(
+ const KmersOptions& kmersOptions,
+ uint64_t threadCount);
+ void accessKmerChecker();
+
+ // This one should eventually go away, but there are several scripts
+ // that depend on it.
+ void accessKmers()
+ {
+ accessKmerChecker();
+ }
- // The number of times each k-mer appears in an oriented read.
- // Indexed by KmerId.
- MemoryMapped::Vector<uint64_t> globalFrequency;
-
- // The minimum distance at which two copies of each k-mer
- // appear in any oriented read.
- // Indexed by KmerId.
- MemoryMapped::Vector< pair<std::mutex, uint32_t> > minimumDistance;
-
- };
- SelectKmers4Data selectKmers4Data;
-
-
-
- // Read the k-mers from file.
-public:
- void readKmersFromFile(uint64_t k, const string& fileName);
private:
- void computeKmerFrequency(size_t threadId);
- void initializeKmerTable();
+ // Hash a KmerId in such a way that it has the same hash as its reverse
+ // complement. This is used by alignment method 3 to downsample markers.
+ uint32_t hashKmerId(KmerId) const;
// The markers on all oriented reads. Indexed by OrientedReadId::getValue().
+public:
MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t> markers;
+private:
void checkMarkersAreOpen() const;
// Get markers sorted by KmerId for a given OrientedReadId.
@@ -588,7 +436,9 @@ private:
// Given a marker by its OrientedReadId and ordinal,
// return the corresponding global marker id.
+public:
MarkerId getMarkerId(OrientedReadId, uint32_t ordinal) const;
+private:
MarkerId getReverseComplementMarkerId(OrientedReadId, uint32_t ordinal) const;
MarkerId getMarkerId(const MarkerDescriptor& m) const
{
@@ -606,20 +456,95 @@ private:
// an additional 4 bytes per marker.
public:
pair<OrientedReadId, uint32_t> findMarkerId(MarkerId) const;
-private:
+ // KmerIds for all markers. Indexed by OrientedReadId::getValue().
+ // Only stored during alignment computation, and then freed.
+ MemoryMapped::VectorOfVectors<KmerId, uint64_t> markerKmerIds;
+ void computeMarkerKmerIds(uint64_t threadCount);
+ void cleanupMarkerKmerIds();
+private:
+ void computeMarkerKmerIdsThreadFunction(size_t threadId);
+
// Pairs (KmerId, ordinal), sorted by KmerId, for each oriented read.
// Indexed by orientedReadId.getValue().
// Used by alignment method 4.
- MemoryMapped::VectorOfVectors< pair<KmerId, uint32_t>, uint64_t> sortedMarkers;
public:
+ MemoryMapped::VectorOfVectors< pair<KmerId, uint32_t>, uint64_t> sortedMarkers;
void computeSortedMarkers(uint64_t threadCount);
bool accessSortedMarkers();
private:
- void computeSortedMarkersThreadFunction1(size_t threadId);
- void computeSortedMarkersThreadFunction2(size_t threadId);
+ void computeSortedMarkersThreadFunction(size_t threadId);
+ // void computeSortedMarkersThreadFunction1(size_t threadId);
+ // void computeSortedMarkersThreadFunction2(size_t threadId);
+
+
+
+ // Low frequency markers for each oriented read.
+ // This stores, for each oriented read, the ordinals corresponding
+ // to markers with low frequency (up to maxMarkerFrequency), sorted by KmerId.
+ // Used by alignment method 5. It is only stored during alignment
+ // computation.
+public:
+ MemoryMapped::VectorOfVectors<uint32_t, uint64_t> lowFrequencyMarkers;
+ void computeLowFrequencyMarkers(uint64_t maxMarkerFrequency, uint64_t threadCount);
+ void computeLowFrequencyMarkers(
+ const span<const KmerId>&, // The marker k-mers for the oriented reads (sorted by ordinal)
+ uint64_t maxMarkerFrequency,
+ vector<uint32_t>&); // The ordinals of the low frequency markers, sorted by KmerId.
+private:
+ void computeLowFrequencyMarkersThreadFunctionPass1(uint64_t threadId);
+ void computeLowFrequencyMarkersThreadFunctionPass2(uint64_t threadId);
+ void computeLowFrequencyMarkersThreadFunctionPass12(uint64_t pass);
+ class ComputeLowFrequencyMarkersData {
+ public:
+ uint64_t maxMarkerFrequency;
+ };
+ ComputeLowFrequencyMarkersData computeLowFrequencyMarkersData;
+
+
+
+ // Low level functions to get marker Kmers/KmerIds of an oriented read.
+ // They are obtained from the reads and not from CompressedMarker::kmerId,
+ // which will soon go away.
+
+ // Get the marker Kmer for an oriented read and ordinal.
+ Kmer getOrientedReadMarkerKmer(OrientedReadId, uint32_t ordinal) const;
+ Kmer getOrientedReadMarkerKmerStrand0(ReadId, uint32_t ordinal) const;
+ Kmer getOrientedReadMarkerKmerStrand1(ReadId, uint32_t ordinal) const;
+
+ // Get the marker KmerId for an oriented read and ordinal.
+ KmerId getOrientedReadMarkerKmerId(OrientedReadId, uint32_t ordinal) const;
+
+ // Get all marker Kmers for an oriented read.
+ void getOrientedReadMarkerKmers(OrientedReadId, const span<Kmer>&) const;
+ void getOrientedReadMarkerKmersStrand0(ReadId, const span<Kmer>&) const;
+ void getOrientedReadMarkerKmersStrand1(ReadId, const span<Kmer>&) const;
+
+ // Get all marker KmerIds for an oriented read.
+ void getOrientedReadMarkerKmerIds(OrientedReadId, const span<KmerId>&) const;
+ void getOrientedReadMarkerKmerIdsStrand0(ReadId, const span<KmerId>&) const;
+ void getOrientedReadMarkerKmerIdsStrand1(ReadId, const span<KmerId>&) const;
+
+ // Get all MarkerWithOrdinals for an oriented read (includes position, KmerId, and ordinal).
+ void getOrientedReadMarkers(OrientedReadId, const span<MarkerWithOrdinal>&) const;
+ void getOrientedReadMarkersStrand0(ReadId, const span<MarkerWithOrdinal>&) const;
+ void getOrientedReadMarkersStrand1(ReadId, const span<MarkerWithOrdinal>&) const;
+
+ // Get all marker Kmers/KmerIds for a read in both orientations.
+ void getReadMarkerKmers(
+ ReadId,
+ const span<Kmer>& Kmers0,
+ const span<Kmer>& Kmers1) const;
+ void getReadMarkerKmerIds(
+ ReadId,
+ const span<KmerId>& kmerIds0,
+ const span<KmerId>& kmerIds1) const;
+
+ // Get the Kmer/KmerId for an oriented read at a given marker ordinal.
+ Kmer getOrientedReadMarkerKmer(OrientedReadId, uint64_t ordinal) const;
+ KmerId getOrientedReadMarkerKmerId(OrientedReadId, uint64_t ordinal) const;
@@ -698,16 +623,6 @@ public:
size_t minFrequency, // Minimum number of lowHash hits for a pair to become a candidate.
size_t threadCount
);
- void findAlignmentCandidatesLowHash1(
- size_t m, // Number of consecutive k-mers that define a feature.
- double hashFraction, // Low hash threshold.
- size_t minHashIterationCount,
- size_t log2MinHashBucketCount, // Base 2 log of number of buckets for lowHash.
- size_t minBucketSize, // The minimum size for a bucket to be used.
- size_t maxBucketSize, // The maximum size for a bucket to be used.
- size_t minFrequency, // Minimum number of lowHash hits for a pair to become a candidate.
- size_t threadCount
- );
void markAlignmentCandidatesAllPairs();
void accessAlignmentCandidates();
void accessAlignmentCandidateTable();
@@ -850,7 +765,7 @@ public:
uint64_t maxBand,
int64_t matchScore,
int64_t mismatchScore,
- int64_t gapScore) const;
+ int64_t gapScore);
// Align two reads using alignment method 4.
// If debug is true, detailed output to html is produced.
@@ -862,7 +777,7 @@ public:
MemoryMapped::ByteAllocator&,
Alignment&,
AlignmentInfo&,
- bool debug) const;
+ bool debug);
// Intermediate level version used by the http server.
void alignOrientedReads4(
@@ -883,7 +798,20 @@ public:
int64_t gapScore,
Alignment&,
AlignmentInfo&
- ) const;
+ );
+
+ // Alignment method 5.
+ void alignOrientedReads5(
+ OrientedReadId,
+ OrientedReadId,
+ int matchScore,
+ int mismatchScore,
+ int gapScore,
+ double driftRateTolerance,
+ uint64_t minBandExtend,
+ Alignment&,
+ AlignmentInfo&,
+ ostream& html);
private:
@@ -939,7 +867,9 @@ private:
// The good alignments we found.
// They are stored with readId0<readId1 and with strand0==0.
// The order in compressedAlignments matches that in alignmentData.
+public:
MemoryMapped::Vector<AlignmentData> alignmentData;
+private:
MemoryMapped::VectorOfVectors<char, uint64_t> compressedAlignments;
void checkAlignmentDataAreOpen() const;
@@ -1011,8 +941,8 @@ private:
// Read graph and related functions and data.
// For more information, see comments in ReadGraph.hpp.
- ReadGraph readGraph;
public:
+ ReadGraph readGraph;
void createReadGraph(
uint32_t maxAlignmentCount,
uint32_t maxTrim);
@@ -1291,6 +1221,7 @@ public:
void accessMarkerGraphVertices(bool readWriteAccess = false);
void accessMarkerGraphReverseComplementVertex(bool readWriteAccess = false);
void removeMarkerGraphVertices();
+ void accessDisjointSetsHistogram();
private:
void findMarkerGraphReverseComplementVerticesThreadFunction1(size_t threadId);
void findMarkerGraphReverseComplementVerticesThreadFunction2(size_t threadId);
@@ -1309,7 +1240,8 @@ private:
uint32_t maxSkip,
vector<MarkerGraphVertexId>&) const;
-
+ // Find the common KmerId for all the markers of a marker graph vertex.
+ KmerId getMarkerGraphVertexKmerId(MarkerGraphVertexId) const;
// Clean up marker graph vertices that have duplicate markers
// (more than one marker on the same oriented reads).
@@ -1461,6 +1393,38 @@ public:
+ // Analyze and compare the read compositions of two marker graph edges.
+ // This can only be done if the two edges have no duplicate OrientedReadIds
+ // in the markers. In that case, each OrientedReadId of an edge
+ // corresponds to one and only one markerInterval for each edge.
+ bool analyzeMarkerGraphEdgePair(
+ MarkerGraphEdgeId,
+ MarkerGraphEdgeId,
+ MarkerGraphEdgePairInfo&
+ ) const;
+ void writeHtmlMarkerGraphEdgePairInfo(
+ ostream& html,
+ MarkerGraphEdgeId,
+ MarkerGraphEdgeId,
+ const MarkerGraphEdgePairInfo&
+ ) const;
+
+ // Count the number of common oriented reads between two marker graph edges.
+ // This assumes, WITHOUT CHECKING, that each of the two edges has no duplicate
+ // oriented reads. This assumption is satisfied for primary marker graph edges
+ // in Mode 3 assembly.
+ uint64_t countCommonOrientedReadsUnsafe(MarkerGraphEdgeId, MarkerGraphEdgeId) const;
+
+ // Estimate the offset, in bases, between two marker graph edges.
+ // This assumes, WITHOUT CHECKING, that each of the two edges has no duplicate
+ // oriented reads. This assumption is satisfied for primary marker graph edges
+ // in Mode 3 assembly.
+ // If there are common oriented reads between the two edges, this uses
+ // countCommonOrientedReadsUnsafe.
+ // This can fail, in which case it returns invalid<uint64_t>.
+ uint64_t estimateBaseOffsetUnsafe(MarkerGraphEdgeId, MarkerGraphEdgeId) const;
+
+
// Function createMarkerGraphSecondaryEdges can be called after createMarkerGraphEdgesStrict
// to create a minimal amount of additional non-strict edges (secondary edges)
// sufficient to restore contiguity.
@@ -1563,9 +1527,11 @@ private:
// it belongs to, plus the ordinal of the marker in the oriented read.
// If the marker is not contained in any vertex, return
// MarkerGraph::invalidVertexId.
+public:
MarkerGraph::VertexId getGlobalMarkerGraphVertex(
OrientedReadId,
uint32_t ordinal) const;
+private:
// Get pairs (ordinal, marker graph vertex id) for all markers of an oriented read.
// The pairs are returned sorted by ordinal.
@@ -1711,7 +1677,7 @@ private:
bool useSuperBubbleEdges,
bool useLowCoverageCrossEdges,
bool useRemovedSecondaryEdges,
- LocalMarkerGraph&
+ LocalMarkerGraph0&
);
bool extractLocalMarkerGraph(
MarkerGraph::VertexId,
@@ -1724,7 +1690,7 @@ private:
bool useSuperBubbleEdges,
bool useLowCoverageCrossEdges,
bool useRemovedSecondaryEdges,
- LocalMarkerGraph&
+ LocalMarkerGraph0&
);
// Compute consensus sequence for a vertex of the marker graph.
@@ -1777,20 +1743,6 @@ private:
- // Get the RLE sequence implied by a MarkerInterval.
- // If the markers overlap, returns the number of
- // overlapping RLE bases in overlappingRleBaseCount
- // and empty rleSequence.
- // Otherwise, returns zero overlappingRleBaseCount
- // and the intervening sequence in rleSequence
- // (which can be empty if the two markers are exactly adjacent).
- void getMarkerIntervalRleSequence(
- const MarkerInterval&,
- uint64_t& overlappingRleBaseCount,
- vector<Base>& rleSequence) const;
-
-
-
// Use spoa to compute consensus sequence for an edge of the marker graph.
// This does not include the bases corresponding to the flanking markers.
void computeMarkerGraphEdgeConsensusSequenceUsingSpoa(
@@ -1840,7 +1792,7 @@ public:
// A directed vertex A->B is created if the last marker graph vertex
// of the edge chain corresponding to A coincides with the
// first marker graph vertex of the edge chain corresponding to B.
- shared_ptr<AssemblyGraph> assemblyGraphPointer;
+ shared_ptr<mode0::AssemblyGraph> assemblyGraphPointer;
void removeAssemblyGraph()
{
assemblyGraphPointer.reset();
@@ -2045,7 +1997,17 @@ public:
private:
+
+ // Assemble Mode 3 sequence for all marker graph edges.
+ // See the comments before MarkerGraph::edgeSequence for more information.
+ // For now this is done sequentially.
+public:
+ void assembleMarkerGraphEdgesMode3();
+
+
+
// Assemble sequence for an edge of the assembly graph.
+private:
void assembleAssemblyGraphEdge(
AssemblyGraphEdgeId,
bool storeCoverageData,
@@ -2085,24 +2047,9 @@ private:
- // Assemble the RLE sequence of a path of the marker graph, under the assumption
- // that, for each edge, all oriented reads have exactly the same sequence.
- // This will be the case if edges were created by Assembler::createMarkerGraphEdgesStrict.
-public:
- void assembleMarkerGraphPathRleStrict(
- span<const MarkerGraphEdgeId> path,
- vector<Base>& rleSequence
- ) const;
- // Same, but for an assembly graph edge.
- void assembleAssemblyGraphEdgeRleStrict(
- AssemblyGraphEdgeId,
- vector<Base>& rleSequence
- ) const;
-
-
-
// Write the assembly graph in GFA 1.0 format defined here:
// https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md
+public:
void writeGfa1(const string& fileName);
void writeGfa1BothStrands(const string& fileName);
void writeGfa1BothStrandsNoSequence(const string& fileName);
@@ -2151,11 +2098,6 @@ public:
- // Analyze pseudo-paths of oriented reads.
- void alignPseudoPaths(ReadId, Strand, ReadId, Strand);
-
-
-
// Data and functions used for the http server.
// This function puts the server into an endless loop
// of processing requests.
@@ -2247,12 +2189,14 @@ public:
// Functions and data used by the http server
// for display of the local marker graph.
- void exploreMarkerGraph(const vector<string>&, ostream&);
- void getLocalMarkerGraphRequestParameters(
+ void exploreMarkerGraph0(const vector<string>&, ostream&);
+ void exploreMarkerGraph1(const vector<string>&, ostream&);
+ void getLocalMarkerGraph0RequestParameters(
const vector<string>&,
- LocalMarkerGraphRequestParameters&) const;
+ LocalMarkerGraph0RequestParameters&) const;
void exploreMarkerGraphVertex(const vector<string>&, ostream&);
void exploreMarkerGraphEdge(const vector<string>&, ostream&);
+ void exploreMarkerGraphEdgePair(const vector<string>&, ostream&);
void exploreMarkerCoverage(const vector<string>&, ostream&);
void exploreMarkerGraphInducedAlignment(const vector<string>&, ostream&);
void followReadInMarkerGraph(const vector<string>&, ostream&);
@@ -2275,6 +2219,8 @@ public:
uint64_t align4DeltaY,
uint64_t align4MinEntryCountPerCell,
uint64_t align4MaxDistanceFromBoundary,
+ double align5DriftRateTolerance,
+ uint64_t align5MinBandExtend,
ostream& html
);
void writeColorPicker(ostream& html, string svgId);
@@ -2305,7 +2251,7 @@ public:
// Compute all alignments for a given read.
// This can be slow for large assemblies,
- // and therefore the computation in multithreaded.
+ // and therefore the computation is multithreaded.
void computeAllAlignments(const vector<string>&, ostream&);
void computeAllAlignmentsThreadFunction(size_t threadId);
class ComputeAllAlignmentsData {
@@ -2329,6 +2275,8 @@ public:
uint64_t align4DeltaY;
uint64_t align4MinEntryCountPerCell;
uint64_t align4MaxDistanceFromBoundary;
+ double align5DriftRateTolerance;
+ uint64_t align5MinBandExtend;
// The alignments found by each thread.
vector< vector< pair<OrientedReadId, AlignmentInfo> > > threadAlignments;
};
@@ -2374,16 +2322,6 @@ private:
void exploreAssemblyGraphEdgesSupport(const vector<string>&, ostream&);
- // Http server functionality specific to mode 3 assembly.
- void exploreMode3AssemblyGraph(const vector<string>&, ostream&);
- void exploreMode3AssemblyGraphSegment(const vector<string>&, ostream&);
- void exploreMode3AssemblyGraphSegmentPair(const vector<string>&, ostream&);
- void exploreMode3AssemblyGraphLink(const vector<string>&, ostream&);
- void exploreMode3MetaAlignment(const vector<string>&, ostream&);
- void exploreMode3AssemblyPath(const vector<string>&, ostream&);
- void exploreMode3LinkAssembly(const vector<string>&, ostream&);
-
-
// Set up the ConsensusCaller used to compute the "best"
// base and repeat count at each assembly position.
@@ -2415,16 +2353,30 @@ public:
bool debug);
+
// Mode 3 assembly.
- void mode3Assembly(
- size_t threadCount);
- shared_ptr<mode3::AssemblyGraph> assemblyGraph3Pointer;
- void accessMode3AssemblyGraph();
- void analyzeMode3Subgraph(const vector<uint64_t>& segmentIds);
- void createMode3PathGraph();
- void createMode3Detangler();
+ shared_ptr<Mode3Assembler> mode3Assembler;
+ void flagPrimaryMarkerGraphEdges(
+ uint64_t minPrimaryCoverage,
+ uint64_t maxPrimaryCoverage,
+ uint64_t threadCount);
+ // Assemble sequence between two primary edges.
+ void fillMode3AssemblyPathStep(const vector<string>&, ostream&);
+ // Top level function for Mode 3 assembly, starting from the MarkerGraph.
+ void mode3Assembly(
+ uint64_t threadCount,
+ const Mode3AssemblyOptions&,
+ bool debug
+ );
+
+ void mode3AssembleComponent(
+ const string& fileName,
+ uint64_t threadCount,
+ const Mode3AssemblyOptions&,
+ bool assembleSequence,
+ bool debug) const;
public:
void test();
diff --git a/src/AssemblerAlign.cpp b/src/AssemblerAlign.cpp
index 7f9a1bc..88990f7 100644
--- a/src/AssemblerAlign.cpp
+++ b/src/AssemblerAlign.cpp
@@ -214,13 +214,14 @@ void Assembler::computeAlignments(
size_t threadCount
)
{
+
const auto tBegin = steady_clock::now();
performanceLog << timestamp << "Begin computing alignments for ";
performanceLog << alignmentCandidates.candidates.size() << " alignment candidates." << endl;
// Check that we have what we need.
reads->checkReadsAreOpen();
- checkKmersAreOpen();
+ SHASTA_ASSERT(kmerChecker);
checkMarkersAreOpen();
checkAlignmentCandidatesAreOpen();
@@ -239,6 +240,12 @@ void Assembler::computeAlignments(
computeSortedMarkers(threadCount);
}
+ // For alignment method 5, compute low frequency markers.
+ if(alignOptions.alignMethod == 5) {
+ cout << timestamp << "Computing unique markers." << endl;
+ computeLowFrequencyMarkers(1, threadCount);
+ }
+
// Pick the batch size for computing alignments.
size_t batchSize = 10;
if(batchSize > alignmentCandidates.candidates.size()/threadCount) {
@@ -254,9 +261,11 @@ void Assembler::computeAlignments(
data.threadCompressedAlignments.resize(threadCount);
performanceLog << timestamp << "Alignment computation begins." << endl;
+ cout << timestamp << "Alignment computation begins." << endl;
setupLoadBalancing(alignmentCandidates.candidates.size(), batchSize);
runThreads(&Assembler::computeAlignmentsThreadFunction, threadCount);
performanceLog << timestamp << "Alignment computation completed." << endl;
+ cout << timestamp << "Alignment computation completed." << endl;
// Store the alignments found by each thread.
performanceLog << timestamp << "Storing the alignment found by each thread." << endl;
@@ -286,10 +295,13 @@ void Assembler::computeAlignments(
alignmentData.unreserve();
compressedAlignments.unreserve();
- // For alignment method 4, remove the sorted markers.
+ // Cleanup.
if(alignOptions.alignMethod == 4) {
sortedMarkers.remove();
}
+ if(alignOptions.alignMethod == 5) {
+ lowFrequencyMarkers.remove();
+ }
cout << "Found and stored " << alignmentData.size() << " good alignments." << endl;
performanceLog << timestamp << "Creating alignment table." << endl;
@@ -331,6 +343,8 @@ void Assembler::computeAlignmentsThreadFunction(size_t threadId)
const int bandExtend = data.alignOptions->bandExtend;
const int maxBand = data.alignOptions->maxBand;
const bool suppressContainments = data.alignOptions->suppressContainments;
+ const double align5DriftRateTolerance = data.alignOptions->align5DriftRateTolerance;
+ const uint64_t align5MinBandExtend = data.alignOptions->align5MinBandExtend;
// Align4-specific items.
@@ -365,13 +379,10 @@ void Assembler::computeAlignmentsThreadFunction(size_t threadId)
largeDataName("tmp-ThreadGlobalCompressedAlignments-" + to_string(threadId)),
largeDataPageSize);
+ const uint64_t messageFrequency = min(1000000UL, alignmentCandidates.candidates.size()/20);
+
uint64_t begin, end;
while(getNextBatch(begin, end)) {
- if((begin % 1000000) == 0){
- std::lock_guard<std::mutex> lock(mutex);
- performanceLog << timestamp << "Working on alignment " << begin;
- performanceLog << " of " << alignmentCandidates.candidates.size() << endl;
- }
for(size_t i=begin; i!=end; i++) {
const OrientedReadPair& candidate = alignmentCandidates.candidates[i];
@@ -381,6 +392,13 @@ void Assembler::computeAlignmentsThreadFunction(size_t threadId)
orientedReadIds[0] = OrientedReadId(candidate.readIds[0], 0);
orientedReadIds[1] = OrientedReadId(candidate.readIds[1], candidate.isSameStrand ? 0 : 1);
+ if((i % messageFrequency) == 0){
+ std::lock_guard<std::mutex> lock(mutex);
+ performanceLog << timestamp << "Working on alignment " << i;
+ performanceLog << " of " << alignmentCandidates.candidates.size();
+ // performanceLog << ": " << orientedReadIds[0] << " " << orientedReadIds[1];
+ performanceLog << endl;
+ }
// Compute the alignment.
@@ -413,6 +431,13 @@ void Assembler::computeAlignmentsThreadFunction(size_t threadId)
alignment, alignmentInfo,
false);
SHASTA_ASSERT(byteAllocator.isEmpty());
+ } else if(alignmentMethod == 5) {
+ ofstream nullStream;
+ alignOrientedReads5(orientedReadIds[0], orientedReadIds[1],
+ matchScore, mismatchScore, gapScore,
+ align5DriftRateTolerance, align5MinBandExtend,
+ alignment, alignmentInfo,
+ nullStream);
} else {
SHASTA_ASSERT(0);
}
@@ -1008,70 +1033,6 @@ uint32_t Assembler::countCommonMarkersWithOffsetIn(
}
-void Assembler::writeAlignmentDetails() const
-{
- string directoryName = "Alignments/";
- string header = "kmerId,ordinal0,ordinal1,rlePosition0,rlePosition1,";
-
- SHASTA_ASSERT(std::filesystem::create_directory(directoryName));
-
- for (uint32_t alignmentIndex=0; alignmentIndex<alignmentData.size(); alignmentIndex++){
- // Access the stored information we have about this alignment.
- AlignmentData alignmentDatum = alignmentData[alignmentIndex];
- span<const char> compressedAlignment = compressedAlignments[alignmentIndex];
-
- Alignment alignment;
- decompress(compressedAlignment, alignment);
- OrientedReadId orientedReadId0 = OrientedReadId(alignmentDatum.readIds[0], 0);
- OrientedReadId orientedReadId1 = OrientedReadId(alignmentDatum.readIds[1], alignmentDatum.isSameStrand ? 0 : 1);
-
- string name0 = string(reads->getReadName(orientedReadId0.getReadId()).begin(),
- reads->getReadName(orientedReadId0.getReadId()).end());
-
- string name1 = string(reads->getReadName(orientedReadId1.getReadId()).begin(),
- reads->getReadName(orientedReadId1.getReadId()).end());
-
- string filename = name0 + "_" + name1 + "_" + (alignmentDatum.isSameStrand ? "1" : "0") + ".csv";
-
- // Create a writeable a csv file
- ofstream csv(directoryName + filename);
- if (not (csv.is_open() and csv.good())){
- throw runtime_error("ERROR: file could not be written: " + directoryName + filename);
- }
-
- csv << header << '\n';
-
- // Access the markers for the two oriented reads.
- const auto markers0 = markers[orientedReadId0.getValue()];
- const auto markers1 = markers[orientedReadId1.getValue()];
-
- // Compute the raw position corresponding to each RLE position.
- const vector<uint32_t> rawPositions0 = reads->getRawPositions(orientedReadId0);
- const vector<uint32_t> rawPositions1 = reads->getRawPositions(orientedReadId1);
-
- // Loop over all markers.
- for(const auto& ordinals: alignment.ordinals) {
- const auto ordinal0 = ordinals[0];
- const auto ordinal1 = ordinals[1];
-
- const auto& marker0 = markers0[ordinal0];
- const auto& marker1 = markers1[ordinal1];
-
- const uint32_t rlePosition0 = marker0.position;
- const uint32_t rlePosition1 = marker1.position;
-
- const auto kmerId = marker0.kmerId;
- SHASTA_ASSERT(marker1.kmerId == kmerId);
-
- csv << kmerId << ','
- << ordinal0 << ','
- << ordinal1 << ','
- << rlePosition0 << ','
- << rlePosition1 << ',' << '\n';
- }
- }
-}
-
// Check if an alignment between two reads should be suppressed,
// bases on the setting of command line option
diff --git a/src/AssemblerAlign1.cpp b/src/AssemblerAlign1.cpp
index a407d0f..adeea60 100644
--- a/src/AssemblerAlign1.cpp
+++ b/src/AssemblerAlign1.cpp
@@ -72,52 +72,41 @@ void Assembler::alignOrientedReads1(
using TDepStringSet = StringSet<TSequence, Dependent<> >;
using TAlignGraph = Graph<Alignment<TDepStringSet> >;
+#if 0
// Access the markers of our oriented reads.
const span<CompressedMarker> markers0 =
markers[orientedReadId0.getValue()];
const span<CompressedMarker> markers1 =
markers[orientedReadId1.getValue()];
+#endif
-
-
- // Seqan uses the integer 45 to represent a gap
- // and I did not find a good way to control that.
- // So if KmerId 45 is a marker we replace it with the first KmerId
- // that does not represent a marker.
- // This is messy but I did not find a better solution.
- bool replacementIsNeeded = false;
- const KmerId seqanGapValue = 45;
- KmerId replacementValue = seqanGapValue;
- if(kmerTable[seqanGapValue].isMarker) {
- replacementIsNeeded = true;
- for(uint64_t i=0; i<kmerTable.size(); i++) {
- if(!kmerTable[i].isMarker) {
- replacementValue = KmerId(i);
- break;
- }
- }
- // cout << "Replacement value " << replacementValue << endl;
- SHASTA_ASSERT(replacementValue != seqanGapValue);
+ // Get the marker KmerIds for the two oriented reads.
+ array<span<KmerId>, 2> allMarkerKmerIds;
+ array<vector<KmerId>, 2> allMarkerKmerIdsVectors;
+ if(markerKmerIds.isOpen()) {
+ allMarkerKmerIds[0] = markerKmerIds[orientedReadId0.getValue()];
+ allMarkerKmerIds[1] = markerKmerIds[orientedReadId1.getValue()];
+ } else {
+ // This is slower and will happen if markerKmerIds is not available.
+ // Resize the vectors and make the spans point to the vectors.
+ // Then call getOrientedReadMarkerKmerIds to fill them in.
+ allMarkerKmerIdsVectors[0].resize(markers.size(orientedReadId0.getValue()));
+ allMarkerKmerIdsVectors[1].resize(markers.size(orientedReadId1.getValue()));
+ allMarkerKmerIds[0] = span<KmerId>(allMarkerKmerIdsVectors[0]);
+ allMarkerKmerIds[1] = span<KmerId>(allMarkerKmerIdsVectors[1]);
+ getOrientedReadMarkerKmerIds(orientedReadId0, allMarkerKmerIds[0]);
+ getOrientedReadMarkerKmerIds(orientedReadId1, allMarkerKmerIds[1]);
}
-
-
// Construct the sequences of KmerId's we want to align.
+ // SeqAn uses 45 to represent gaps, so we add 100 to the KmerIds passed to SeqAn.
TSequence seq0;
- for(const CompressedMarker marker: markers0) {
- if(replacementIsNeeded && marker.kmerId == seqanGapValue) {
- appendValue(seq0, replacementValue);
- } else {
- appendValue(seq0, marker.kmerId);
- }
+ for(const KmerId kmerId: allMarkerKmerIds[0]) {
+ appendValue(seq0, kmerId + 100);
}
TSequence seq1;
- for(const CompressedMarker marker: markers1) {
- if(replacementIsNeeded && marker.kmerId == seqanGapValue) {
- appendValue(seq1, replacementValue);
- } else {
- appendValue(seq1, marker.kmerId);
- }
+ for(const KmerId kmerId: allMarkerKmerIds[1]) {
+ appendValue(seq1, kmerId + 100);
}
// Store them in a SeqAn string set.
@@ -157,11 +146,15 @@ void Assembler::alignOrientedReads1(
alignment.clear();
uint32_t ordinal0 = 0;
uint32_t ordinal1 = 0;
+ const uint32_t seqanGapValue = 45;
for(int i=0;
- i<alignmentLength and ordinal0<markers0.size() and ordinal1<markers1.size(); i++) {
+ i<alignmentLength and
+ ordinal0<allMarkerKmerIds[0].size() and
+ ordinal1<allMarkerKmerIds[1].size();
+ i++) {
if( align[i] != seqanGapValue and
align[i + alignmentLength] != seqanGapValue and
- markers0[ordinal0].kmerId == markers1[ordinal1].kmerId) {
+ allMarkerKmerIds[0][ordinal0] == allMarkerKmerIds[1][ordinal1]) {
alignment.ordinals.push_back(array<uint32_t, 2>{ordinal0, ordinal1});
}
if(align[i] != seqanGapValue) {
@@ -173,7 +166,7 @@ void Assembler::alignOrientedReads1(
}
// Store the alignment info.
- alignmentInfo.create(alignment, uint32_t(markers0.size()), uint32_t(markers1.size()));
+ alignmentInfo.create(alignment, uint32_t(allMarkerKmerIds[0].size()), uint32_t(allMarkerKmerIds[1].size()));
// Debugging.
@@ -186,8 +179,8 @@ void Assembler::alignOrientedReads1(
alignment[0].resize(alignmentLength);
alignment[1].resize(alignmentLength);
for(int i=0; i<alignmentLength; i++) {
- alignment[0][i] = align[i];
- alignment[1][i] = align[i + alignmentLength];
+ alignment[0][i] = align[i] - 100;
+ alignment[1][i] = align[i + alignmentLength] - 100;
}
diff --git a/src/AssemblerAlign3.cpp b/src/AssemblerAlign3.cpp
index 9b54d27..435e25b 100644
--- a/src/AssemblerAlign3.cpp
+++ b/src/AssemblerAlign3.cpp
@@ -53,11 +53,25 @@ void Assembler::alignOrientedReads3(
using TDepStringSet = StringSet<TSequence, Dependent<> >;
using TAlignGraph = Graph<Alignment<TDepStringSet> >;
+ // Get the marker KmerIds for the two oriented reads.
+ array<span<KmerId>, 2> allMarkerKmerIds;
+ array<vector<KmerId>, 2> allMarkerKmerIdsVectors;
+ if(markerKmerIds.isOpen()) {
+ allMarkerKmerIds[0] = markerKmerIds[orientedReadId0.getValue()];
+ allMarkerKmerIds[1] = markerKmerIds[orientedReadId1.getValue()];
+ } else {
+ // This is slower and will happen if markerKmerIds is not available.
+ // Resize the vectors and make the spans point to the vectors.
+ // Then call getOrientedReadMarkerKmerIds to fill them in.
+ allMarkerKmerIdsVectors[0].resize(markers.size(orientedReadId0.getValue()));
+ allMarkerKmerIdsVectors[1].resize(markers.size(orientedReadId1.getValue()));
+ allMarkerKmerIds[0] = span<KmerId>(allMarkerKmerIdsVectors[0]);
+ allMarkerKmerIds[1] = span<KmerId>(allMarkerKmerIdsVectors[1]);
+ getOrientedReadMarkerKmerIds(orientedReadId0, allMarkerKmerIds[0]);
+ getOrientedReadMarkerKmerIds(orientedReadId1, allMarkerKmerIds[1]);
+ }
+
- // Get the markers for the two oriented reads.
- array<span<CompressedMarker>, 2> allMarkers;
- allMarkers[0] = markers[orientedReadId0.getValue()];
- allMarkers[1] = markers[orientedReadId1.getValue()];
// Vectors to contain downsampled markers.
// For each of the two reads we store vectors of
@@ -66,14 +80,14 @@ void Assembler::alignOrientedReads3(
array<TSequence, 2> downsampledSequences;
// Fill in downsampled markers.
- // SeqAn uses 45 to represent gaps, so we add 45 to the KmerIds passed to SeqAn.
+ // SeqAn uses 45 to represent gaps, so we add 100 to the KmerIds passed to SeqAn.
// This means that we can't do k=16.
const uint32_t hashThreshold =
uint32_t(downsamplingFactor * double(std::numeric_limits<uint32_t>::max()));
for(uint64_t i=0; i<2; i++) {
- for(uint32_t ordinal=0; ordinal<uint32_t(allMarkers[i].size()); ordinal++) {
- const KmerId kmerId = allMarkers[i][ordinal].kmerId;
- if(kmerTable[kmerId].hash < hashThreshold) {
+ for(uint32_t ordinal=0; ordinal<uint32_t(allMarkerKmerIds[i].size()); ordinal++) {
+ const KmerId kmerId = allMarkerKmerIds[i][ordinal];
+ if(hashKmerId(kmerId) < hashThreshold) {
downsampledMarkers[i].push_back(make_pair(ordinal, kmerId));
appendValue(downsampledSequences[i], kmerId + 100);
}
@@ -82,7 +96,7 @@ void Assembler::alignOrientedReads3(
if(debug) {
cout << "Aligning two oriented reads with " <<
- allMarkers[0].size() << " and " << allMarkers[1].size() << " markers." << endl;
+ allMarkerKmerIds[0].size() << " and " << allMarkerKmerIds[1].size() << " markers." << endl;
cout << "Downsampled markers for step 1 to " <<
downsampledMarkers[0].size() << " and " <<
downsampledMarkers[1].size() << " markers." << endl;
@@ -101,7 +115,7 @@ void Assembler::alignOrientedReads3(
// One of the downsampled sequences is empty. Return an empty alignment.
alignment.clear();
alignmentInfo.create(
- alignment, uint32_t(allMarkers[0].size()), uint32_t(allMarkers[1].size()));
+ alignment, uint32_t(allMarkerKmerIds[0].size()), uint32_t(allMarkerKmerIds[1].size()));
return;
}
@@ -186,7 +200,7 @@ void Assembler::alignOrientedReads3(
downsampledMarkers[0].size() + downsampledMarkers[1].size()) {
alignment.clear();
alignmentInfo.create(
- alignment, uint32_t(allMarkers[0].size()), uint32_t(allMarkers[1].size()));
+ alignment, uint32_t(allMarkerKmerIds[0].size()), uint32_t(allMarkerKmerIds[1].size()));
return;
}
@@ -234,7 +248,7 @@ void Assembler::alignOrientedReads3(
if((bandMax - bandMin) > maxBand) {
alignment.clear();
alignmentInfo.create(
- alignment, uint32_t(allMarkers[0].size()), uint32_t(allMarkers[1].size()));
+ alignment, uint32_t(allMarkerKmerIds[0].size()), uint32_t(allMarkerKmerIds[1].size()));
return;
}
@@ -243,8 +257,8 @@ void Assembler::alignOrientedReads3(
// Now, do a alignment using this band and all markers.
array<TSequence, 2> sequences;
for(uint64_t i=0; i<2; i++) {
- for(uint32_t ordinal=0; ordinal<uint32_t(allMarkers[i].size()); ordinal++) {
- const KmerId kmerId = allMarkers[i][ordinal].kmerId;
+ for(uint32_t ordinal=0; ordinal<uint32_t(allMarkerKmerIds[i].size()); ordinal++) {
+ const KmerId kmerId = allMarkerKmerIds[i][ordinal];
appendValue(sequences[i], kmerId + 100);
}
}
@@ -280,10 +294,10 @@ void Assembler::alignOrientedReads3(
uint32_t ordinal0 = 0;
uint32_t ordinal1 = 0;
for(int i=0;
- i<alignmentLength and ordinal0<allMarkers[0].size() and ordinal1<allMarkers[1].size(); i++) {
+ i<alignmentLength and ordinal0<allMarkerKmerIds[0].size() and ordinal1<allMarkerKmerIds[1].size(); i++) {
if( align[i] != seqanGapValue and
align[i + alignmentLength] != seqanGapValue and
- allMarkers[0][ordinal0].kmerId == allMarkers[1][ordinal1].kmerId) {
+ allMarkerKmerIds[0][ordinal0] == allMarkerKmerIds[1][ordinal1]) {
alignment.ordinals.push_back(array<uint32_t, 2>{ordinal0, ordinal1});
}
if(align[i] != seqanGapValue) {
@@ -308,7 +322,7 @@ void Assembler::alignOrientedReads3(
}
// Store the alignment info.
- alignmentInfo.create(alignment, uint32_t(allMarkers[0].size()), uint32_t(allMarkers[1].size()));
+ alignmentInfo.create(alignment, uint32_t(allMarkerKmerIds[0].size()), uint32_t(allMarkerKmerIds[1].size()));
}
diff --git a/src/AssemblerAlign4.cpp b/src/AssemblerAlign4.cpp
index 31823ee..666eb81 100644
--- a/src/AssemblerAlign4.cpp
+++ b/src/AssemblerAlign4.cpp
@@ -25,7 +25,7 @@ void Assembler::alignOrientedReads4(
uint64_t maxBand,
int64_t matchScore,
int64_t mismatchScore,
- int64_t gapScore) const
+ int64_t gapScore)
{
// Fill in the options.
Align4::Options options;
@@ -81,7 +81,7 @@ void Assembler::alignOrientedReads4(
int64_t gapScore,
Alignment& alignment,
AlignmentInfo& alignmentInfo
- ) const
+ )
{
// Fill in the options.
Align4::Options options;
@@ -122,18 +122,32 @@ void Assembler::alignOrientedReads4(
MemoryMapped::ByteAllocator& byteAllocator,
Alignment& alignment,
AlignmentInfo& alignmentInfo,
- bool debug) const
+ bool debug)
{
- // Access the markers for the two oriented reads.
- array<span<const CompressedMarker>, 2> orientedReadMarkers;
- orientedReadMarkers[0] = markers[orientedReadId0.getValue()];
- orientedReadMarkers[1] = markers[orientedReadId1.getValue()];
+
+ // Get the marker KmerIds for the two oriented reads.
+ array<span<KmerId>, 2> orientedReadKmerIds;
+ array<vector<KmerId>, 2> orientedReadKmerIdsVectors;
+ if(markerKmerIds.isOpen()) {
+ orientedReadKmerIds[0] = markerKmerIds[orientedReadId0.getValue()];
+ orientedReadKmerIds[1] = markerKmerIds[orientedReadId1.getValue()];
+ } else {
+ // This is slower and will happen if markerKmerIds is not available.
+ // Resize the vectors and make the spans point to the vectors.
+ // Then call getOrientedReadMarkerKmerIds to fill them in.
+ orientedReadKmerIdsVectors[0].resize(markers.size(orientedReadId0.getValue()));
+ orientedReadKmerIdsVectors[1].resize(markers.size(orientedReadId1.getValue()));
+ orientedReadKmerIds[0] = span<KmerId>(orientedReadKmerIdsVectors[0]);
+ orientedReadKmerIds[1] = span<KmerId>(orientedReadKmerIdsVectors[1]);
+ getOrientedReadMarkerKmerIds(orientedReadId0, orientedReadKmerIds[0]);
+ getOrientedReadMarkerKmerIds(orientedReadId1, orientedReadKmerIds[1]);
+ }
// Align4 needs markers sorted by KmerId.
// Use the ones from sortedMarkers if available, or else compute them.
- array<span< const pair<KmerId, uint32_t> >, 2> orientedReadSortedMarkersSpans;
+ array<span< pair<KmerId, uint32_t> >, 2> orientedReadSortedMarkersSpans;
array<vector< pair<KmerId, uint32_t> >, 2> orientedReadSortedMarkers;
if(sortedMarkers.isOpen()) {
@@ -154,33 +168,30 @@ void Assembler::alignOrientedReads4(
for(uint64_t i=0; i<2; i++) {
// Unsorted markers for this oriented read.
- const span<const CompressedMarker>& um = orientedReadMarkers[i];
+ const span<const KmerId>& km = orientedReadKmerIds[i];
// Sorted markers for this oriented read.
vector<pair<KmerId, uint32_t> >& sm = orientedReadSortedMarkers[i];
// Copy the unsorted markers.
- const uint64_t n = um.size();
+ const uint64_t n = km.size();
sm.resize(n);
for(uint64_t ordinal=0; ordinal<n; ordinal++) {
- const CompressedMarker& cm = um[ordinal];
- sm[ordinal] = make_pair(cm.kmerId, uint32_t(ordinal));
+ sm[ordinal] = make_pair(km[ordinal], uint32_t(ordinal));
}
// Sort them.
sort(sm.begin(), sm.end(), OrderPairsByFirstOnly<KmerId, uint32_t>());
// Make the span point to the data in the vector.
- const pair<KmerId, uint32_t> * const smBegin = &sm.front();
- orientedReadSortedMarkersSpans[i] =
- span< const pair<KmerId, uint32_t> >(smBegin, smBegin + n);
+ orientedReadSortedMarkersSpans[i] = sm;
}
}
// Compute the alignment.
- Align4::align(orientedReadMarkers, orientedReadSortedMarkersSpans,
+ Align4::align(orientedReadKmerIds, orientedReadSortedMarkersSpans,
options, byteAllocator, alignment, alignmentInfo, debug);
}
@@ -192,6 +203,8 @@ void Assembler::computeSortedMarkers(uint64_t threadCount)
// Check that we have what we need.
checkMarkersAreOpen();
const uint64_t orientedReadCount = markers.size();
+ SHASTA_ASSERT(markerKmerIds.isOpen());
+ SHASTA_ASSERT(markerKmerIds.size() == orientedReadCount);
// Adjust the numbers of threads, if necessary.
if(threadCount == 0) {
@@ -200,18 +213,48 @@ void Assembler::computeSortedMarkers(uint64_t threadCount)
// Do it.
sortedMarkers.createNew(largeDataName("SortedMarkers"), largeDataPageSize);
- sortedMarkers.beginPass1(orientedReadCount);
- const uint64_t batchSize = 10000;
- setupLoadBalancing(orientedReadCount, batchSize);
- runThreads(&Assembler::computeSortedMarkersThreadFunction1, threadCount);
- sortedMarkers.beginPass2();
- sortedMarkers.endPass2(false);
+ for(uint64_t i=0; i<orientedReadCount; i++) {
+ sortedMarkers.appendVector(markers[i].size());
+ }
+ const uint64_t batchSize = 100;
setupLoadBalancing(orientedReadCount, batchSize);
- runThreads(&Assembler::computeSortedMarkersThreadFunction2, threadCount);
+ runThreads(&Assembler::computeSortedMarkersThreadFunction, threadCount);
+}
+
+
+
+void Assembler::computeSortedMarkersThreadFunction(size_t threadId)
+{
+ // Loop over all batches assigned to this thread.
+ uint64_t begin, end;
+ while(getNextBatch(begin, end)) {
+
+ // Loop over oriented reads in this batch.
+ for(uint64_t i=begin; i!=end; i++) {
+
+ // Access the marker KmerIds and sorted markers for this oriented read.
+ const auto kmerIds = markerKmerIds[i];
+ const uint64_t markerCount = kmerIds.size();
+ const span< pair<KmerId, uint32_t> > sm = sortedMarkers[i];
+ SHASTA_ASSERT(sm.size() == markerCount);
+
+ // Copy the KmerId's and ordinals.
+ for(uint32_t ordinal=0; ordinal<markerCount; ordinal++) {
+ auto& p = sm[ordinal];
+ p.first = kmerIds[ordinal];
+ p.second = ordinal;
+ }
+
+ // Sort them by KmerId.
+ sort(sm.begin(), sm.end(), OrderPairsByFirstOnly<KmerId, uint32_t>());
+ }
+ }
+
}
+#if 0
void Assembler::computeSortedMarkersThreadFunction1(size_t threadId)
{
// Loop over all batches assigned to this thread.
@@ -259,6 +302,8 @@ void Assembler::computeSortedMarkersThreadFunction2(size_t threadId)
}
}
+#endif
+
bool Assembler::accessSortedMarkers()
diff --git a/src/AssemblerAlign5.cpp b/src/AssemblerAlign5.cpp
new file mode 100644
index 0000000..c9d267f
--- /dev/null
+++ b/src/AssemblerAlign5.cpp
@@ -0,0 +1,737 @@
+#include "Assembler.hpp"
+#include "deduplicate.hpp"
+#include "Reads.hpp"
+#include "seqan.hpp"
+using namespace shasta;
+
+
+
+// Version that uses banded alignments.
+void Assembler::alignOrientedReads5(
+ OrientedReadId orientedReadId0,
+ OrientedReadId orientedReadId1,
+ int matchScore,
+ int mismatchScore,
+ int gapScore,
+ double driftRateTolerance,
+ uint64_t minBandExtend,
+ Alignment& alignment,
+ AlignmentInfo& alignmentInfo,
+ ostream& html)
+{
+
+ // Get the marker KmerIds for the two oriented reads.
+ array<span<KmerId>, 2> allMarkerKmerIds;
+ array<vector<KmerId>, 2> allMarkerKmerIdsVectors;
+ if(markerKmerIds.isOpen()) {
+ allMarkerKmerIds[0] = markerKmerIds[orientedReadId0.getValue()];
+ allMarkerKmerIds[1] = markerKmerIds[orientedReadId1.getValue()];
+ } else {
+ // This is slower and will happen if markerKmerIds is not available.
+ // Resize the vectors and make the spans point to the vectors.
+ // Then call getOrientedReadMarkerKmerIds to fill them in.
+ allMarkerKmerIdsVectors[0].resize(markers.size(orientedReadId0.getValue()));
+ allMarkerKmerIdsVectors[1].resize(markers.size(orientedReadId1.getValue()));
+ allMarkerKmerIds[0] = span<KmerId>(allMarkerKmerIdsVectors[0]);
+ allMarkerKmerIds[1] = span<KmerId>(allMarkerKmerIdsVectors[1]);
+ getOrientedReadMarkerKmerIds(orientedReadId0, allMarkerKmerIds[0]);
+ getOrientedReadMarkerKmerIds(orientedReadId1, allMarkerKmerIds[1]);
+ }
+
+
+ // Get the low frequency markers in the two oriented reads, sorted by KmerId.
+ array< span<uint32_t>, 2> lowFrequencyOrdinals;
+ array< vector<uint32_t>, 2> lowFrequencyOrdinalsVectors;
+ if(lowFrequencyMarkers.isOpen()) {
+ // Use the stored copy.
+ lowFrequencyOrdinals[0] = lowFrequencyMarkers[orientedReadId0.getValue()];
+ lowFrequencyOrdinals[1] = lowFrequencyMarkers[orientedReadId1.getValue()];
+ }
+ else {
+ // Compute them and store in the local vectors, then have the spans point to them.
+ for(uint64_t i=0; i<2; i++) {
+ computeLowFrequencyMarkers(allMarkerKmerIds[i], 1, lowFrequencyOrdinalsVectors[i]);
+ lowFrequencyOrdinals[i] = span<uint32_t>(lowFrequencyOrdinalsVectors[i]);
+ }
+ }
+
+
+
+ if(html) {
+ for(uint64_t i=0; i<2; i++) {
+ html << "<br>" << (i==0 ? orientedReadId0 : orientedReadId1) << " has " << allMarkerKmerIds[i].size() <<
+ " markers of which " << lowFrequencyOrdinals[i].size() << " are unique." << endl;
+ }
+ }
+
+
+
+ // Find pairs of ordinals in the two oriented reads that correspond to
+ // the same low frequency k-mers.
+ class CommonKmerInfo {
+ public:
+ uint32_t ordinal0;
+ uint32_t ordinal1;
+ KmerId kmerId;
+ uint64_t rank0 = invalid<uint64_t>;
+ uint64_t rank1 = invalid<uint64_t>;
+ uint64_t ordinalSum() const
+ {
+ return ordinal0 + ordinal1;
+ }
+ int64_t ordinalOffset() const
+ {
+ return int64_t(ordinal0) - int64_t(ordinal1);
+ }
+ };
+ vector<CommonKmerInfo> commonKmerInfos;
+
+ // Joint loop over the ordinals corresponding to low frequency markers.
+ // They are both sorted by KmerId.
+ const auto begin0 = lowFrequencyOrdinals[0].begin();
+ const auto begin1 = lowFrequencyOrdinals[1].begin();
+ const auto end0 = lowFrequencyOrdinals[0].end();
+ const auto end1 = lowFrequencyOrdinals[1].end();
+ auto it0 = begin0;
+ auto it1 = begin1;
+ while((it0 != end0) and (it1 != end1)) {
+ const uint32_t ordinal0 = *it0;
+ const uint32_t ordinal1 = *it1;
+ const KmerId kmerId0 = allMarkerKmerIds[0][ordinal0];
+ const KmerId kmerId1 = allMarkerKmerIds[1][ordinal1];
+
+ if(kmerId0 < kmerId1) {
+
+ // Go past the streak with this KmerId in lowFrequencyOrdinals[0].
+ while(it0 != end0 and allMarkerKmerIds[0][*it0] == kmerId0) {
+ ++it0;
+ }
+
+
+ } else if(kmerId1 < kmerId0) {
+
+ // Go past the streak with this KmerId in lowFrequencyOrdinals[1].
+ while(it1 != end1 and allMarkerKmerIds[1][*it1] == kmerId1) {
+ ++it1;
+ }
+
+ } else {
+
+ // We found a common low frequency marker k-mer.
+ SHASTA_ASSERT(kmerId0 == kmerId1);
+ const KmerId kmerId = kmerId0;
+
+ // Look for the streak with this KmerId in lowFrequencyOrdinals[0].
+ auto streakBegin0 = it0;
+ auto streakEnd0 = it0 + 1;
+ while(streakEnd0 != end0 and allMarkerKmerIds[0][*streakEnd0] == kmerId) {
+ ++streakEnd0;
+ }
+
+ // Look for the streak with this KmerId in lowFrequencyOrdinals[1].
+ auto streakBegin1 = it1;
+ auto streakEnd1 = it1 + 1;
+ while(streakEnd1 != end1 and allMarkerKmerIds[1][*streakEnd1] == kmerId) {
+ ++streakEnd1;
+ }
+
+ // Loop over pairs of markers in these streaks.
+ for(auto jt0=streakBegin0; jt0!=streakEnd0; jt0++) {
+ for(auto jt1=streakBegin1; jt1!=streakEnd1; jt1++) {
+ commonKmerInfos.push_back({*jt0, *jt1, kmerId});
+ }
+ }
+
+ // Point to the next marker in lowFrequencyOrdinals[0] and lowFrequencyOrdinals[1].
+ it0 = streakEnd0;
+ it1 = streakEnd1;
+ }
+ }
+
+
+
+ // Write the common unique markers.
+ if(html) {
+ html << "<h3>Common unique markers</h3>";
+ html << "There are " << commonKmerInfos.size() << " common unique markers." << endl;
+ html << "<p><table>"
+ "<tr><th>Ordinal0<th>Ordinal1<th>Ordinal<br>offset<th>Ordinal<br>sum<th>KmerId<th>Kmer";
+ const uint64_t k = assemblerInfo->k;
+ for(const CommonKmerInfo& commonKmerInfo: commonKmerInfos) {
+ const Kmer kmer(commonKmerInfo.kmerId, k);
+ html << "<tr>"
+ "<td class=centered>" << commonKmerInfo.ordinal0 <<
+ "<td class=centered>" << commonKmerInfo.ordinal1 <<
+ "<td class=centered>" << commonKmerInfo.ordinalOffset() <<
+ "<td class=centered>" << commonKmerInfo.ordinalSum();
+
+ // Write the KmerId in hex with the appropriate number of digits.
+ const char oldFill = html.fill('0');
+ html << "<td class=centered style='font-family:monospace'>" <<
+ std::hex << std::setw(int(k/2)) << commonKmerInfo.kmerId << std::dec;
+ html.fill(oldFill);
+
+ // Write the Kmer.
+ html << "<td class=centered style='font-family:monospace'>";
+ kmer.write(html, k);
+ }
+ html << "</table>";
+ }
+
+
+
+ // Create a histogram of ordinal offsets for the common unique markers.
+ std::map<int64_t, uint64_t> histogramMap;
+ for(const CommonKmerInfo& commonKmerInfo: commonKmerInfos) {
+ const int64_t offset = commonKmerInfo.ordinalOffset();
+ auto it = histogramMap.find(offset);
+ if(it == histogramMap.end()) {
+ histogramMap.insert({offset, 1});
+ } else {
+ ++it->second;
+ }
+ }
+ vector< pair<int64_t, uint64_t> > histogram;
+ copy(histogramMap.begin(), histogramMap.end(), back_inserter(histogram));
+ if(html) {
+ html << "<h3>Histogram of ordinal offsets for the common unique markers</h3>"
+ "<table>"
+ "<tr><th>Ordinal<br>offset<th>Frequency";
+ for(const auto& p: histogram) {
+ html << "<tr>"
+ "<td class=centered>" << p.first <<
+ "<td class=centered>" << p.second;
+ }
+ html << "</table>";
+ }
+
+
+
+ // Find clusters of ordinal offsets.
+ class Cluster {
+ public:
+ int64_t firstOffset;
+ int64_t lastOffset;
+ uint64_t uniqueMarkerCount;
+ };
+ vector<Cluster> clusters;
+ const uint64_t minMarkerCount = min(allMarkerKmerIds[0].size(), allMarkerKmerIds[1].size());
+ const int64_t offsetDeltaTolerance = int64_t(std::round(driftRateTolerance * double(minMarkerCount)));
+ for(uint64_t i=0; i<histogram.size(); /* Increment later */) {
+ Cluster cluster;
+ const uint64_t firstOffsetIndexInHistogram = i;
+ cluster.firstOffset = histogram[firstOffsetIndexInHistogram].first;
+ for(++i; i < histogram.size(); ++i) {
+ if(histogram[i].first > histogram[i-1].first + offsetDeltaTolerance) {
+ break;
+ }
+ }
+ const uint64_t lastOffsetIndexInHistogram = i-1;
+ cluster.lastOffset = histogram[lastOffsetIndexInHistogram].first;
+ cluster.uniqueMarkerCount = 0;
+ for(uint64_t j=firstOffsetIndexInHistogram; j<=lastOffsetIndexInHistogram; j++) {
+ cluster.uniqueMarkerCount += histogram[j].second;
+ }
+ clusters.push_back(cluster);
+ }
+
+ // Find the largest cluster.
+ uint64_t largestClusterIndex = invalid<uint64_t>;
+ uint64_t largestClusterSize = 0;
+ for(uint64_t i=0; i<clusters.size(); i++) {
+ const uint64_t clusterSize = clusters[i].uniqueMarkerCount;
+ if(clusterSize > largestClusterSize) {
+ largestClusterSize = clusterSize;
+ largestClusterIndex = i;
+ }
+ }
+ const Cluster& largestCluster = clusters[largestClusterIndex];
+
+ // Write the clusters.
+ if(html) {
+ html << "<h3>Ordinal offset clusters</h3>";
+ html << "<p>Ordinal offset clusters were computed using offset tolerance " << offsetDeltaTolerance;
+ html << "<table><tr><th>First<br>offset<th>Last<br>offset<th>Size";
+ for(uint64_t i=0; i<clusters.size(); i++) {
+ const Cluster& cluster = clusters[i];
+ html << "<tr";
+ if(i == largestClusterIndex) {
+ html << " style='background-color:pink'";
+ }
+ html << ">"
+ "<td class=centered>" << cluster.firstOffset <<
+ "<td class=centered>" << cluster.lastOffset <<
+ "<td class=centered>" << cluster.uniqueMarkerCount;
+ }
+ html << "</table>";
+ }
+
+
+
+ // The active markers are the common unique markers on the largest cluster.
+ // These are the ones that will be used to compute the alignment.
+ vector<CommonKmerInfo> activeKmerInfos;
+ for(const CommonKmerInfo& commonKmerInfo: commonKmerInfos) {
+ const int64_t offset = commonKmerInfo.ordinalOffset();
+ if(offset >= largestCluster.firstOffset and offset <= largestCluster.lastOffset) {
+ activeKmerInfos.push_back(commonKmerInfo);
+ }
+ }
+
+
+
+ // Fill in the ordinal ranks.
+ std::ranges::sort(activeKmerInfos, std::ranges::less(), &CommonKmerInfo::ordinal0);
+ for(uint64_t rank=0; rank<activeKmerInfos.size(); rank++) {
+ activeKmerInfos[rank].rank0 = rank;
+ }
+ std::ranges::sort(activeKmerInfos, std::ranges::less(), &CommonKmerInfo::ordinal1);
+ for(uint64_t rank=0; rank<activeKmerInfos.size(); rank++) {
+ activeKmerInfos[rank].rank1 = rank;
+ }
+
+
+ // If there are any markers that don't have the same rank, remove them.
+ {
+ vector<CommonKmerInfo> newActiveKmerInfos;
+ for(const CommonKmerInfo& commonKmerInfo: activeKmerInfos) {
+ if(commonKmerInfo.rank0 == commonKmerInfo.rank1) {
+ newActiveKmerInfos.push_back(commonKmerInfo);
+ }
+ }
+ activeKmerInfos.swap(newActiveKmerInfos);
+
+ }
+
+
+
+ // Sort them by ordinalSum.
+ class OrderByOrdinalSum {
+ public:
+ bool operator()(const CommonKmerInfo& x, const CommonKmerInfo& y) const
+ {
+ return x.ordinalSum() < y.ordinalSum();
+ }
+ };
+ sort(activeKmerInfos.begin(), activeKmerInfos.end(), OrderByOrdinalSum());
+
+
+
+ // Write the active markers we kept.
+ if(html) {
+ html << "<h3>Active common unique markers</h3>";
+ html << "There are " << activeKmerInfos.size() << " active common unique markers, "
+ "shown in the table sorted by ordinal sum."
+ "<p><table>"
+ "<tr><th>Ordinal0<th>Ordinal1<th>Ordinal<br>offset<th>Ordinal<br>sum<th>Rank0<th>Rank1<th>KmerId<th>Kmer";
+ const uint64_t k = assemblerInfo->k;
+ for(const CommonKmerInfo& commonKmerInfo: activeKmerInfos) {
+ const Kmer kmer(commonKmerInfo.kmerId, k);
+ html << "<tr>"
+ "<td class=centered>" << commonKmerInfo.ordinal0 <<
+ "<td class=centered>" << commonKmerInfo.ordinal1 <<
+ "<td class=centered>" << commonKmerInfo.ordinalOffset() <<
+ "<td class=centered>" << commonKmerInfo.ordinalSum() <<
+ "<td class=centered>" << commonKmerInfo.rank0 <<
+ "<td class=centered>" << commonKmerInfo.rank1;
+
+ // Write the KmerId in hex with the appropriate number of digits.
+ const char oldFill = html.fill('0');
+ html << "<td class=centered style='font-family:monospace'>" <<
+ std::hex << std::setw(int(k/2)) << commonKmerInfo.kmerId << std::dec;
+ html.fill(oldFill);
+
+ // Write the Kmer.
+ html << "<td class=centered style='font-family:monospace'>";
+ kmer.write(html, k);
+ }
+ html << "</table>";
+ }
+
+
+
+ // We should remove common unique markers that have a different rank
+ // in the two oriented reads. This does not happen frequently and
+ // for now just check for them.
+ for(const CommonKmerInfo& commonKmerInfo: activeKmerInfos) {
+ SHASTA_ASSERT(commonKmerInfo.rank0 == commonKmerInfo.rank1);
+ }
+
+
+ if(activeKmerInfos.size() < 2) {
+ alignment.clear();
+ alignmentInfo.create(alignment, uint32_t(allMarkerKmerIds[0].size()), uint32_t(allMarkerKmerIds[1].size()));
+ alignmentInfo.uniquenessMetric = 0.;
+ return;
+ }
+
+
+
+ // Create the alignment by stitching together alignments computed
+ // between each pair of consecutive unique k-mers that survived
+ // the above process (the "active" markers).
+ alignment.clear();
+ SHASTA_ASSERT(activeKmerInfos.size() > 1);
+
+ // First, do an alignment between the beginning and the
+ // first active unique marker.
+ // This alignment is constrained on the right only.
+ {
+ const CommonKmerInfo& firstCommonKmerInfo = activeKmerInfos.front();
+ const uint32_t ordinalB0 = firstCommonKmerInfo.ordinal0;
+ const uint32_t ordinalB1 = firstCommonKmerInfo.ordinal1;
+ if(ordinalB0 > 0 and ordinalB1 > 0) {
+ const span<const KmerId> kmerIds0(&allMarkerKmerIds[0][0], &allMarkerKmerIds[0][ordinalB0]);
+ const span<const KmerId> kmerIds1(&allMarkerKmerIds[1][0], &allMarkerKmerIds[1][ordinalB1]);
+
+ // Compute the band.
+ int64_t bandMin = int64_t(ordinalB0) - int64_t(ordinalB1);
+ int64_t bandMax = bandMin;
+ const uint64_t totalBandExtend = minBandExtend +
+ uint64_t(std::round(0.5 * driftRateTolerance * double(min(ordinalB0, ordinalB1))));
+ bandMin -= int64_t(totalBandExtend);
+ bandMax += int64_t(totalBandExtend);
+
+ if(html) {
+ html << "<br>Initial step: alignment lengths " << kmerIds0.size() << " " << kmerIds1.size() <<
+ ", band " << bandMin << " " << bandMax;
+ }
+
+ vector< pair<bool, bool> > seqanAlignment;
+ seqanAlign(
+ kmerIds0.begin(), kmerIds0.end(),
+ kmerIds1.begin(), kmerIds1.end(),
+ matchScore, mismatchScore, gapScore,
+ bandMin, bandMax,
+ true, false, // Free on left
+ seqanAlignment);
+ uint32_t ordinal0 = 0;
+ uint32_t ordinal1 = 0;
+ for(const auto& p: seqanAlignment) {
+ if(p.first and p.second and allMarkerKmerIds[0][ordinal0] == allMarkerKmerIds[1][ordinal1]) {
+ alignment.ordinals.push_back({ordinal0, ordinal1});
+ }
+ if(p.first) {
+ ++ordinal0;
+ }
+ if(p.second) {
+ ++ordinal1;
+ }
+ }
+ SHASTA_ASSERT(ordinal0 == ordinalB0);
+ SHASTA_ASSERT(ordinal1 == ordinalB1);
+ }
+ }
+
+
+ for(uint64_t step=1; step<activeKmerInfos.size(); step++) {
+ const CommonKmerInfo& commonKmerInfoA = activeKmerInfos[step-1];
+ const CommonKmerInfo& commonKmerInfoB = activeKmerInfos[step];
+ SHASTA_ASSERT(commonKmerInfoB.rank0 > commonKmerInfoA.rank0);
+ SHASTA_ASSERT(commonKmerInfoB.rank1 > commonKmerInfoA.rank1);
+
+ const uint32_t ordinalA0 = commonKmerInfoA.ordinal0;
+ const uint32_t ordinalA1 = commonKmerInfoA.ordinal1;
+ const uint32_t ordinalB0 = commonKmerInfoB.ordinal0;
+ const uint32_t ordinalB1 = commonKmerInfoB.ordinal1;
+
+ // Get the KmerIds between A and B for the two reads.
+ // These are the Kmers that we will align in this step.
+ const span<const KmerId> kmerIds0(&allMarkerKmerIds[0][ordinalA0 + 1], &allMarkerKmerIds[0][ordinalB0]);
+ const span<const KmerId> kmerIds1(&allMarkerKmerIds[1][ordinalA1 +1 ], &allMarkerKmerIds[1][ordinalB1]);
+
+ // Add to the alignment the first marker of this step.
+ alignment.ordinals.push_back({commonKmerInfoA.ordinal0, commonKmerInfoA.ordinal1});
+
+ // If there is nothing to align, we are done for this step.
+ if(kmerIds0.empty() or kmerIds1.empty()) {
+ continue;
+ }
+
+
+ // Use seqan to compute the alignment for this step.
+ // This alignment is constrained on both sides and banded.
+
+ // Compute the band.
+ int64_t bandMin, bandMax;
+ if(kmerIds0.size() <= kmerIds1.size()) {
+ bandMin = -int64_t(kmerIds1.size() - kmerIds0.size());
+ bandMax = 0;
+ } else {
+ bandMin = 0;
+ bandMax = int64_t(kmerIds0.size() - kmerIds1.size());
+ }
+ const uint64_t totalBandExtend = minBandExtend +
+ uint64_t(std::round(0.5 * driftRateTolerance * double(min(kmerIds0.size(), kmerIds1.size()))));
+ bandMin -= int64_t(totalBandExtend);
+ bandMax += int64_t(totalBandExtend);
+
+ if(html) {
+ html << "<br>Step " << step << " alignment lengths " << kmerIds0.size() << " " << kmerIds1.size() <<
+ ", band " << bandMin << " " << bandMax;
+ }
+
+ // Do the banded alignment.
+ vector< pair<bool, bool> > seqanAlignment;
+ const int64_t alignmentScore = seqanAlign(
+ kmerIds0.begin(), kmerIds0.end(),
+ kmerIds1.begin(), kmerIds1.end(),
+ matchScore, mismatchScore, gapScore,
+ bandMin, bandMax,
+ false, false,
+ seqanAlignment);
+ if(html) {
+ html << "<br>Alignment score " << alignmentScore;
+ }
+
+ // Add to the alignment the ordinals of matching alignment positions.
+ uint32_t ordinal0 = ordinalA0 + 1;
+ uint32_t ordinal1 = ordinalA1 + 1;
+ for(const auto& p: seqanAlignment) {
+ if(p.first and p.second and allMarkerKmerIds[0][ordinal0] == allMarkerKmerIds[1][ordinal1]) {
+ alignment.ordinals.push_back({ordinal0, ordinal1});
+ }
+ if(p.first) {
+ ++ordinal0;
+ }
+ if(p.second) {
+ ++ordinal1;
+ }
+ }
+ SHASTA_ASSERT(ordinal0 == ordinalB0);
+ SHASTA_ASSERT(ordinal1 == ordinalB1);
+ }
+
+ // Add the last active marker.
+ const CommonKmerInfo& lastCommonKmerInfo = activeKmerInfos.back();
+ alignment.ordinals.push_back({lastCommonKmerInfo.ordinal0, lastCommonKmerInfo.ordinal1});
+
+
+
+ // Do an alignment between the last active unique marker and the end.
+ // This alignment is constrained on the left only.
+ {
+ const CommonKmerInfo& lastCommonKmerInfo = activeKmerInfos.back();
+ const uint32_t ordinalA0 = lastCommonKmerInfo.ordinal0 + 1;
+ const uint32_t ordinalA1 = lastCommonKmerInfo.ordinal1 + 1;
+ const uint32_t ordinalB0 = uint32_t(allMarkerKmerIds[0].size());
+ const uint32_t ordinalB1 = uint32_t(allMarkerKmerIds[1].size());
+ if( ordinalA0 < ordinalB0 and ordinalA1 < ordinalB1) {
+ const span<const KmerId> kmerIds0(&allMarkerKmerIds[0][ordinalA0], &allMarkerKmerIds[0][ordinalB0]);
+ const span<const KmerId> kmerIds1(&allMarkerKmerIds[1][ordinalA1], &allMarkerKmerIds[1][ordinalB1]);
+
+ // Compute the band.
+ int64_t bandMin = 0;
+ int64_t bandMax = 0;
+ const uint64_t totalBandExtend = minBandExtend +
+ uint64_t(std::round(0.5 * driftRateTolerance * double(min(kmerIds0.size(), kmerIds1.size()))));
+ bandMin -= int64_t(totalBandExtend);
+ bandMax += int64_t(totalBandExtend);
+
+ if(html) {
+ html << "<br>Final step: alignment lengths " << kmerIds0.size() << " " << kmerIds1.size() <<
+ ", band " << bandMin << " " << bandMax;
+ }
+
+ vector< pair<bool, bool> > seqanAlignment;
+ seqanAlign(
+ kmerIds0.begin(), kmerIds0.end(),
+ kmerIds1.begin(), kmerIds1.end(),
+ matchScore, mismatchScore, gapScore,
+ bandMin, bandMax,
+ false, true, // Free on right
+ seqanAlignment);
+ uint32_t ordinal0 = ordinalA0;
+ uint32_t ordinal1 = ordinalA1;
+ for(const auto& p: seqanAlignment) {
+ if(p.first and p.second and allMarkerKmerIds[0][ordinal0] == allMarkerKmerIds[1][ordinal1]) {
+ alignment.ordinals.push_back({ordinal0, ordinal1});
+ }
+ if(p.first) {
+ ++ordinal0;
+ }
+ if(p.second) {
+ ++ordinal1;
+ }
+ }
+ SHASTA_ASSERT(ordinal0 == ordinalB0);
+ SHASTA_ASSERT(ordinal1 == ordinalB1);
+ }
+ }
+
+
+ // Compute the uniqueness metric, defined as k/(2*sqrt(n)),
+ // where k is the number of active markers and
+ // n is the number of common unique markers
+ // IN THE OVERLAP REGION ONLY.
+ float uniquenessMetric = 0;
+ {
+ const uint64_t k = activeKmerInfos.size();
+
+ const array<uint32_t, 2>& alignmentOrdinalsFirst = alignment.ordinals.front();
+ const array<uint32_t, 2>& alignmentOrdinalsLast = alignment.ordinals.back();
+ const uint32_t alignmentOrdinalFirst0 = alignmentOrdinalsFirst[0];
+ const uint32_t alignmentOrdinalFirst1 = alignmentOrdinalsFirst[1];
+ const uint32_t alignmentOrdinalLast0 = alignmentOrdinalsLast[0];
+ const uint32_t alignmentOrdinalLast1 = alignmentOrdinalsLast[1];
+ uint64_t n = 0;
+ for(const CommonKmerInfo& commonKmerInfo: commonKmerInfos) {
+ if(
+ commonKmerInfo.ordinal0 >= alignmentOrdinalFirst0 and
+ commonKmerInfo.ordinal0 <= alignmentOrdinalLast0 and
+ commonKmerInfo.ordinal1 >= alignmentOrdinalFirst1 and
+ commonKmerInfo.ordinal1 <= alignmentOrdinalLast1
+ ) {
+ ++n;
+ }
+ }
+
+ uniquenessMetric = float(double(k) / (2. * sqrt(double(n))));
+ }
+
+
+ // Store the alignment info.
+ alignmentInfo.create(alignment, uint32_t(allMarkerKmerIds[0].size()), uint32_t(allMarkerKmerIds[1].size()));
+ alignmentInfo.uniquenessMetric = uniquenessMetric;
+
+}
+
+
+
+void Assembler::computeLowFrequencyMarkers(
+ uint64_t maxMarkerFrequency,
+ uint64_t threadCount)
+{
+ // Check that we have what we need.
+ SHASTA_ASSERT(markerKmerIds.isOpen());
+
+ // Get the number of reads.
+ const uint64_t readCount = getReads().readCount();
+
+ // Adjust the number of threads, if necessary.
+ if(threadCount == 0) {
+ threadCount = std::thread::hardware_concurrency();
+ }
+
+ // Store the maxMarkerFrequency so all threads can see it.
+ computeLowFrequencyMarkersData.maxMarkerFrequency = maxMarkerFrequency;
+
+ // Initialize the low frequency markers.
+ lowFrequencyMarkers.createNew(largeDataName("LowFrequencyMarkers"), largeDataPageSize);
+
+ // Pass 1 just counts the number of low frequency markers for each oriented read.
+ const uint64_t batchSize = 1;
+ lowFrequencyMarkers.beginPass1(2 * readCount);
+ setupLoadBalancing(readCount, batchSize);
+ runThreads(&Assembler::computeLowFrequencyMarkersThreadFunctionPass1, threadCount);
+
+ // Pass 2 stores the low frequency markers for each oriented read.
+ setupLoadBalancing(getReads().readCount(), batchSize);
+ lowFrequencyMarkers.beginPass2();
+ runThreads(&Assembler::computeLowFrequencyMarkersThreadFunctionPass2, threadCount);
+ lowFrequencyMarkers.endPass2(false, true);
+}
+
+
+
+void Assembler::computeLowFrequencyMarkersThreadFunctionPass1(uint64_t threadId)
+{
+ computeLowFrequencyMarkersThreadFunctionPass12(1);
+}
+void Assembler::computeLowFrequencyMarkersThreadFunctionPass2(uint64_t threadId)
+{
+ computeLowFrequencyMarkersThreadFunctionPass12(2);
+}
+void Assembler::computeLowFrequencyMarkersThreadFunctionPass12(uint64_t pass)
+{
+ const uint64_t maxMarkerFrequency = computeLowFrequencyMarkersData.maxMarkerFrequency;
+ vector<uint32_t> lowFrequencyOrdinals;
+
+ // Loop over all batches assigned to this thread.
+ uint64_t begin, end;
+ while(getNextBatch(begin, end)) {
+
+ // Loop over oriented reads in this batch.
+ for(uint32_t readId=ReadId(begin); readId!=ReadId(end); ++readId) {
+ for(uint32_t strand=0; strand<2; strand++) {
+ const OrientedReadId orientedReadId(readId, strand);
+
+ // Compute the low frequency markers.
+ computeLowFrequencyMarkers(
+ markerKmerIds[orientedReadId.getValue()],
+ maxMarkerFrequency,
+ lowFrequencyOrdinals);
+
+ if(pass == 1) {
+ // Just make space for them.
+ lowFrequencyMarkers.incrementCountMultithreaded(
+ orientedReadId.getValue(),
+ lowFrequencyOrdinals.size());
+ } else {
+ // Store them.
+ copy(lowFrequencyOrdinals.begin(), lowFrequencyOrdinals.end(),
+ lowFrequencyMarkers.begin(orientedReadId.getValue()));
+ }
+ }
+ }
+ }
+}
+
+
+
+// Compute low frequency markers for a single oriented read.
+// On return, the lowFrequencyOrdinals vector contains the ordinals corresponding
+// to low frequency markers, sorted by KmerId.
+// Low frequency markers are the ones that occur up to maxMarkerFrequency
+// times on the oriented read.
+void Assembler::computeLowFrequencyMarkers(
+ const span<const KmerId>& kmerIds, // The marker KmerIds for the oriented reads, sorted by ordinal
+ uint64_t maxMarkerFrequency,
+ vector<uint32_t>& lowFrequencyOrdinals) // The ordinals of the low frequency markers, sorted by KmerId
+{
+
+ // Create a vector of ordinals, sorted by ordinal.
+ const uint64_t markerCount = kmerIds.size();
+ vector<uint32_t> allOrdinals(markerCount);
+ std::iota(allOrdinals.begin(), allOrdinals.end(), uint32_t(0));
+
+ // Now sort them by KmerId.
+ class SortHelper {
+ public:
+ SortHelper(const span<const KmerId>& kmerIds) : kmerIds(kmerIds) {}
+ bool operator()(uint32_t ordinal0, uint32_t ordinal1) const
+ {
+ return kmerIds[ordinal0] < kmerIds[ordinal1];
+ }
+ private:
+ const span<const KmerId>& kmerIds;
+ };
+ sort(allOrdinals.begin(), allOrdinals.end(), SortHelper(kmerIds));
+
+
+
+ // Loop over streaks with the same KmerId.
+ lowFrequencyOrdinals.clear();
+ for(uint64_t streakBegin=0; streakBegin<markerCount; /* Increment later */) {
+ const KmerId kmerId = kmerIds[allOrdinals[streakBegin]];
+
+ // Find the streak with this KmerId.
+ uint64_t streakEnd = streakBegin + 1;
+ while(true) {
+ if(streakEnd == markerCount or kmerIds[allOrdinals[streakEnd]] != kmerId) {
+ break;
+ }
+ ++streakEnd;
+ }
+ const uint64_t streakLength = streakEnd - streakBegin;
+
+ // If short enough, copy to the low frequency ordinals.
+ if(streakLength <= maxMarkerFrequency) {
+ copy(allOrdinals.begin() + streakBegin, allOrdinals.begin() + streakEnd,
+ back_inserter(lowFrequencyOrdinals));
+ }
+
+ // Prepare to process the next streak.
+ streakBegin = streakEnd;
+ }
+}
diff --git a/src/AssemblerAnalyzePaths.cpp b/src/AssemblerAnalyzePaths.cpp
index 4989eab..79d164e 100644
--- a/src/AssemblerAnalyzePaths.cpp
+++ b/src/AssemblerAnalyzePaths.cpp
@@ -6,6 +6,7 @@
// Shasta.
#include "seqan.hpp"
using namespace shasta;
+using namespace mode0;
// Standard library.
#include "array.hpp"
@@ -139,131 +140,3 @@ void Assembler::getPseudoPathSegments(
segmentIds.push_back(pseudoPathEntry.segmentId);
}
}
-
-
-
-void Assembler::alignPseudoPaths(
- ReadId readId0, Strand strand0,
- ReadId readId1, Strand strand1)
-{
- using SegmentId = AssemblyGraph::EdgeId;
- const AssemblyGraph& assemblyGraph = *assemblyGraphPointer;
-
- // Parameters that control the process below. EXPOSE WHEN CODE STABILIZES. *********
- const int matchScore = 1;
- const int mismatchScore = -1;
- const int gapScore = -1;
-
- // Gather the oriented read ids.
- const array<OrientedReadId, 2> orientedReadIds =
- {OrientedReadId(readId0, strand0), OrientedReadId(readId1, strand1)};
- cout << "Aligning pseudo-paths of " << orientedReadIds[0] <<
- " and " << orientedReadIds[1] << endl;
-
-
- // Compute the two pseudo-paths.
- vector<MarkerGraph::EdgeId> path;
- vector< pair<uint32_t, uint32_t> > pathOrdinals;
- PseudoPath pseudoPath;
- array<vector<SegmentId>, 2> pseudoPathSegments;
- for(uint64_t i=0; i<2; i++) {
- computePseudoPath(orientedReadIds[i], path, pathOrdinals,
- pseudoPath);
- getPseudoPathSegments(pseudoPath, pseudoPathSegments[i]);
- cout << "The pseudo-path of " << orientedReadIds[i] <<
- " has " << pseudoPathSegments[i].size() << " segments." << endl;
- }
-
- // Align them.
- vector< pair<bool, bool> > alignment;
- const uint64_t alignmentScore = shasta::seqanAlign(
- pseudoPathSegments[0].begin(), pseudoPathSegments[0].end(),
- pseudoPathSegments[1].begin(), pseudoPathSegments[1].end(),
- matchScore,
- mismatchScore,
- gapScore,
- true, true,
- alignment);
- cout << "Alignment score " << alignmentScore << endl;
- cout << "Alignment length " << alignment.size() << endl;
-
-
-
- // Write out the alignment.
- uint64_t position0 = 0;
- uint64_t position1 = 0;
- uint64_t weakMatchCount =0;
- uint64_t strongMatchCount =0;
- uint64_t mismatchCount =0;
- uint64_t gapCount =0;
- uint64_t leftUnalignedCount =0;
- uint64_t rightUnalignedCount =0;
- ofstream csv("PseudoPathsAlignment.csv");
- for(const auto& p: alignment) {
- if(p.first) {
- const SegmentId segment0 = pseudoPathSegments[0][position0];
- csv << segment0;
- }
- csv << ",";
- if(p.second) {
- const SegmentId segment1 = pseudoPathSegments[1][position1];
- csv << segment1;
- }
- csv << ",";
-
- // Write an annotation column.
- if(p.first and p.second) {
- if(pseudoPathSegments[0][position0] != pseudoPathSegments[1][position1]) {
- csv << "Mismatch";
- ++mismatchCount;
- } else {
- // Match.
- // Decide if it is a strong or weak match.
- const SegmentId segmentId = pseudoPathSegments[0][position0];
- const AssemblyGraph::Edge& edge = assemblyGraph.edges[segmentId];
- const AssemblyGraph::VertexId v0 = edge.source;
- const AssemblyGraph::VertexId v1 = edge.target;
- const auto out0 = assemblyGraph.outDegree(v0);
- const auto in1 = assemblyGraph.inDegree(v1);
- if(out0==1 and in1==1) {
- csv << "Weak match";
- ++weakMatchCount;
- } else {
- csv << "Strong match";
- ++strongMatchCount;
- }
- }
- } else if(position0 == 0 or position1==0) {
- csv << "Left unaligned portion";
- ++leftUnalignedCount;
- } else if(
- position0 == pseudoPathSegments[0].size() or
- position1 == pseudoPathSegments[1].size()) {
- csv << "Right unaligned portion";
- ++rightUnalignedCount;
- } else if(not (p.first and p.second)) {
- csv << "Gap";
- ++gapCount;
- }
- csv << "\n";
-
- if(p.first) {
- ++position0;
- }
- if(p.second) {
- ++position1;
- }
- }
- SHASTA_ASSERT(position0 == pseudoPathSegments[0].size());
- SHASTA_ASSERT(position1 == pseudoPathSegments[1].size());
-
- const uint64_t matchCount = weakMatchCount + strongMatchCount;
- cout << "Total match "<< matchCount << endl;
- cout << "Strong match "<< strongMatchCount << endl;
- cout << "Weak match "<< weakMatchCount << endl;
- cout << "Mismatch "<< mismatchCount << endl;
- cout << "Gap "<< gapCount << endl;
- cout << "Left unaligned "<< leftUnalignedCount << endl;
- cout << "Right unaligned "<< rightUnalignedCount << endl;
- cout << "Mismatch/match ratio " << double(mismatchCount)/double(matchCount) << endl;
-}
diff --git a/src/AssemblerAssemblyGraph.cpp b/src/AssemblerAssemblyGraph.cpp
index b21f16f..091aeee 100644
--- a/src/AssemblerAssemblyGraph.cpp
+++ b/src/AssemblerAssemblyGraph.cpp
@@ -9,6 +9,7 @@
#include "Reads.hpp"
#include "timestamp.hpp"
using namespace shasta;
+using namespace mode0;
// Boost libraries.
#include <boost/graph/iteration_macros.hpp>
@@ -37,6 +38,7 @@ using namespace shasta;
// - assemblyGraph.markerToAssemblyTable
void Assembler::createAssemblyGraphEdges()
{
+
// Some shorthands.
// using VertexId = AssemblyGraph::VertexId;
using EdgeId = AssemblyGraph::EdgeId;
@@ -87,6 +89,7 @@ void Assembler::createAssemblyGraphEdges()
if(debug) {
cout << "Working on start edge " << startEdgeId;
cout << " " << startEdge.source << "->" << startEdge.target << endl;
+ startEdge.writeFlags(cout);
}
// If this edge is not part of cleaned up marker graph, skip it.
@@ -153,6 +156,19 @@ void Assembler::createAssemblyGraphEdges()
}
std::reverse(reverseComplementedChain.begin(), reverseComplementedChain.end());
+ if(debug) {
+ cout << "Chain:";
+ for(const auto edgeId: chain) {
+ cout << " " << edgeId;
+ }
+ cout << endl;
+ cout << "Reverse complemented chain:";
+ for(const auto edgeId: reverseComplementedChain) {
+ cout << " " << edgeId;
+ }
+ cout << endl;
+ }
+
// Figure out if the reverse complemented chain is the same
@@ -262,6 +278,7 @@ void Assembler::createAssemblyGraphEdges()
}
}
#endif
+
}
@@ -402,12 +419,12 @@ void Assembler::createAssemblyGraphVertices()
for(uint64_t i=0; i<chain.size(); i++) {
const MarkerGraph::EdgeId markerGraphEdgeId = chain[i];
const MarkerGraph::Edge& markerGraphEdge = markerGraph.edges[markerGraphEdgeId];
- const uint32_t edgeCoverage = markerGraphEdge.coverage;
+ const uint64_t edgeCoverage = markerGraph.edgeCoverage(markerGraphEdgeId);
edgeCoverageSum += edgeCoverage;
assemblyGraphEdge.minEdgeCoverage =
- min(assemblyGraphEdge.minEdgeCoverage, edgeCoverage);
+ min(assemblyGraphEdge.minEdgeCoverage, uint32_t(edgeCoverage));
assemblyGraphEdge.maxEdgeCoverage =
- max(assemblyGraphEdge.maxEdgeCoverage, edgeCoverage);
+ max(assemblyGraphEdge.maxEdgeCoverage, uint32_t(edgeCoverage));
if(i != 0) {
const MarkerGraph::EdgeId markerGraphVertexId = markerGraphEdge.source;
@@ -460,6 +477,24 @@ void Assembler::removeLowCoverageCrossEdges(uint32_t crossEdgeCoverageThreshold)
SHASTA_ASSERT(assemblyGraphPointer);
AssemblyGraph& assemblyGraph = *assemblyGraphPointer;
+#if 0
+ // Sanity check on assembly graph edges.
+ for(AssemblyGraph::EdgeId edgeId=0; edgeId!=assemblyGraph.edges.size(); edgeId++) {
+ const AssemblyGraph::EdgeId edgeIdRc = assemblyGraph.reverseComplementEdge[edgeId];
+ SHASTA_ASSERT(assemblyGraph.reverseComplementEdge[edgeIdRc] == edgeId);
+ const auto markerGraphEdges = assemblyGraph.edgeLists[edgeId];
+ const auto markerGraphEdgesRc = assemblyGraph.edgeLists[edgeIdRc];
+ const uint64_t n = markerGraphEdges.size();
+ SHASTA_ASSERT(markerGraphEdgesRc.size() == n);
+ for(uint64_t i=0; i<n; i++) {
+ const MarkerGraphEdgeId markerGraphEdgeId = markerGraphEdges[i];
+ const MarkerGraphEdgeId markerGraphEdgeIdRc = markerGraphEdgesRc[n - 1 - i];
+ SHASTA_ASSERT(markerGraph.reverseComplementEdge[markerGraphEdgeId] == markerGraphEdgeIdRc);
+ SHASTA_ASSERT(markerGraph.reverseComplementEdge[markerGraphEdgeIdRc] == markerGraphEdgeId);
+ }
+ }
+#endif
+
// We want to process edges in order of increasing coverage.
// Gather edges by coverage.
vector< vector<AssemblyGraph::EdgeId> > edgesByCoverage(crossEdgeCoverageThreshold+1);
@@ -473,7 +508,7 @@ void Assembler::removeLowCoverageCrossEdges(uint32_t crossEdgeCoverageThreshold)
const bool debug = false;
ofstream out;
if(debug) {
- out.open("LowCoverageCrossEdges.csv");
+ out.open("LowCoverageCrossEdges.txt");
}
// Process assembly graph edges in order of increasing coverage.
@@ -482,6 +517,10 @@ void Assembler::removeLowCoverageCrossEdges(uint32_t crossEdgeCoverageThreshold)
for(const vector<AssemblyGraph::EdgeId>& edges: edgesByCoverage) {
for(const AssemblyGraph::EdgeId edgeId: edges) {
AssemblyGraph::Edge& edge = assemblyGraph.edges[edgeId];
+ if(edge.removalReason == AssemblyGraph::Edge::RemovalReason::LowCoverageCrossEdge) {
+ // Was already marked because it is the reverse complement of another marked edge.
+ continue;
+ }
const AssemblyGraph::VertexId v0 = edge.source;
const AssemblyGraph::VertexId v1 = edge.target;
@@ -514,14 +553,46 @@ void Assembler::removeLowCoverageCrossEdges(uint32_t crossEdgeCoverageThreshold)
++removedAssemblyGraphEdgeCount;
// Mark the corresponding marker graph edges.
+ if(debug) {
+ out << "Assembly graph edge A" << edgeId << " marked as low coverage edge "
+ "together with its marker graph edges:";
+ }
for(const MarkerGraph::EdgeId markerGraphEdgeId: assemblyGraph.edgeLists[edgeId]) {
markerGraph.edges[markerGraphEdgeId].isLowCoverageCrossEdge = 1;
if(debug) {
- out << markerGraphEdgeId << "\n";
+ out << " M" << markerGraphEdgeId << " ";
}
++removedMarkerGraphEdgeCount;
}
+ if(debug) {
+ out << endl;
+ }
+
+ // Also mark the reverse complement edge.
+ // This is necessary to keep the assembly graph and marker graph
+ // invariant under reverse complementing.
+ const AssemblyGraph::EdgeId reverseComplementEdgeId = assemblyGraph.reverseComplementEdge[edgeId];
+ if(reverseComplementEdgeId != edgeId) {
+ AssemblyGraph::Edge& reverseComplementEdge = assemblyGraph.edges[reverseComplementEdgeId];
+ reverseComplementEdge.removalReason = AssemblyGraph::Edge::RemovalReason::LowCoverageCrossEdge;
+ ++removedAssemblyGraphEdgeCount;
+ if(debug) {
+ out << "Reverse complement assembly graph edge A" << reverseComplementEdgeId << " marked as low coverage edge "
+ "together with its marker graph edges:";
+ }
+ // Mark the corresponding marker graph edges.
+ for(const MarkerGraph::EdgeId markerGraphEdgeId: assemblyGraph.edgeLists[reverseComplementEdgeId]) {
+ markerGraph.edges[markerGraphEdgeId].isLowCoverageCrossEdge = 1;
+ if(debug) {
+ out << " M" << markerGraphEdgeId << " ";
+ }
+ ++removedMarkerGraphEdgeCount;
+ }
+ if(debug) {
+ out << endl;
+ }
+ }
}
}
@@ -651,7 +722,7 @@ void Assembler::assemble(
AssemblyGraph& assemblyGraph = *assemblyGraphPointer;
// Check that we have what we need.
- checkKmersAreOpen();
+ SHASTA_ASSERT(kmerChecker);
reads->checkReadsAreOpen();
checkMarkersAreOpen();
checkMarkerGraphVerticesAreAvailable();
@@ -1785,7 +1856,7 @@ void Assembler::assembleAssemblyGraphEdge(
AssembledSegment& assembledSegment)
{
assembleMarkerGraphPath(
- assemblerInfo->readRepresentation, assemblerInfo->k, markers, markerGraph, markerGraphPath,
+ assemblerInfo->readRepresentation, assemblerInfo->k, *reads, markers, markerGraph, markerGraphPath,
storeCoverageData, assembledSegment);
}
diff --git a/src/AssemblerAssemblyGraph2.cpp b/src/AssemblerAssemblyGraph2.cpp
index a6cf6a1..ae94e4a 100644
--- a/src/AssemblerAssemblyGraph2.cpp
+++ b/src/AssemblerAssemblyGraph2.cpp
@@ -34,6 +34,7 @@ void Assembler::createAssemblyGraph2(
assemblerInfo->readRepresentation,
assemblerInfo->k,
getReads().getFlags(),
+ getReads(),
markers,
markerGraph,
pruneLength,
diff --git a/src/AssemblerCreateReadGraphUsingPseudoPaths.cpp b/src/AssemblerCreateReadGraphUsingPseudoPaths.cpp
index 5feb896..27c8174 100644
--- a/src/AssemblerCreateReadGraphUsingPseudoPaths.cpp
+++ b/src/AssemblerCreateReadGraphUsingPseudoPaths.cpp
@@ -6,15 +6,15 @@
#include "seqan.hpp"
#include "timestamp.hpp"
using namespace shasta;
+using namespace mode0;
// Standard library.
#include "fstream.hpp"
-// This use PseudoPaths to decide which alignments
+// This uses PseudoPaths to decide which alignments
// should be included in the read graph.
-// See Assembler::alignPseudoPaths in AssemblerAnalyzePaths.cpp.
void Assembler::createReadGraphUsingPseudoPaths(
int64_t matchScore,
int64_t mismatchScore,
diff --git a/src/AssemblerDetangle.cpp b/src/AssemblerDetangle.cpp
index 1759444..d66335c 100644
--- a/src/AssemblerDetangle.cpp
+++ b/src/AssemblerDetangle.cpp
@@ -5,6 +5,7 @@
#include "performanceLog.hpp"
#include "timestamp.hpp"
using namespace shasta;
+using namespace mode0;
// Boost libraries.
#include <boost/graph/iteration_macros.hpp>
diff --git a/src/AssemblerHttpServer-Alignments.cpp b/src/AssemblerHttpServer-Alignments.cpp
index 793e409..c15d6c1 100644
--- a/src/AssemblerHttpServer-Alignments.cpp
+++ b/src/AssemblerHttpServer-Alignments.cpp
@@ -13,6 +13,7 @@
#include "Reads.hpp"
#include "ReferenceOverlapMap.hpp"
using namespace shasta;
+using namespace mode0;
// Boost libraries.
#include <boost/icl/interval_map.hpp>
@@ -1105,6 +1106,12 @@ void Assembler::exploreAlignment(
uint64_t align4MaxDistanceFromBoundary = httpServerData.assemblerOptions->alignOptions.align4MaxDistanceFromBoundary;
getParameterValue(request, "align4MaxDistanceFromBoundary", align4MaxDistanceFromBoundary);
+ // Parameters for alignment method 5.
+ double align5DriftRateTolerance = httpServerData.assemblerOptions->alignOptions.align5DriftRateTolerance;
+ getParameterValue(request, "align5DriftRateTolerance", align5DriftRateTolerance);
+ uint64_t align5MinBandExtend = httpServerData.assemblerOptions->alignOptions.align5MinBandExtend;
+ getParameterValue(request, "align5MinBandExtend", align5MinBandExtend);
+
string displayMatrixString;
bool displayMatrix = getParameterValue(request, "displayMatrix", displayMatrixString);
@@ -1114,6 +1121,8 @@ void Assembler::exploreAlignment(
getParameterValue(request, "magnifyFactor", magnifyFactor);
string displayDetailsString;
bool displayDetails = getParameterValue(request, "displayDetails", displayDetailsString);
+ string displayDebugInfoString;
+ bool displayDebugInfo = getParameterValue(request, "displayDebugInfo", displayDebugInfoString);
// Write the form.
@@ -1151,6 +1160,8 @@ void Assembler::exploreAlignment(
align4DeltaY,
align4MinEntryCountPerCell,
align4MaxDistanceFromBoundary,
+ align5DriftRateTolerance,
+ align5MinBandExtend,
html
);
@@ -1163,6 +1174,8 @@ void Assembler::exploreAlignment(
"> times."
"<br><input type=checkbox name=displayDetails" << (displayDetails ? " checked=checked" : "") <<
"> Display alignment details"
+ "<br><input type=checkbox name=displayDebugInfo" << (displayDebugInfo ? " checked=checked" : "") <<
+ "> Display debug information"
"</form>";
@@ -1229,6 +1242,13 @@ void Assembler::exploreAlignment(
mismatchScore,
gapScore,
alignment, alignmentInfo);
+ } else if(method == 5) {
+ ofstream nullStream;
+ alignOrientedReads5(
+ orientedReadId0, orientedReadId1,
+ matchScore, mismatchScore, gapScore,
+ align5DriftRateTolerance, align5MinBandExtend,
+ alignment, alignmentInfo, displayDebugInfo ? html : nullStream);
} else {
SHASTA_ASSERT(0);
}
@@ -1338,7 +1358,7 @@ void Assembler::exploreAlignment(
const auto markers0 = markers[orientedReadId0.getValue()];
const auto markers1 = markers[orientedReadId1.getValue()];
- // Compute the raw position corresponding to each RLE position.
+ // Compute the positions of each marker in the two oriented reads.
const vector<uint32_t> rawPositions0 = reads->getRawPositions(orientedReadId0);
const vector<uint32_t> rawPositions1 = reads->getRawPositions(orientedReadId1);
@@ -1350,8 +1370,10 @@ void Assembler::exploreAlignment(
const auto& marker0 = markers0[ordinal0];
const auto& marker1 = markers1[ordinal1];
- const auto kmerId = marker0.kmerId;
- SHASTA_ASSERT(marker1.kmerId == kmerId);
+ const KmerId kmerId0 = getOrientedReadMarkerKmerId(orientedReadId0, ordinal0);
+ const KmerId kmerId1 = getOrientedReadMarkerKmerId(orientedReadId1, ordinal1);
+ SHASTA_ASSERT(kmerId0 == kmerId1);
+ const KmerId kmerId = kmerId0;
const Kmer kmer(kmerId, assemblerInfo->k);
const uint32_t rlePosition0 = marker0.position;
@@ -1759,6 +1781,8 @@ void Assembler::renderEditableAlignmentConfig(
uint64_t align4DeltaY,
uint64_t align4MinEntryCountPerCell,
uint64_t align4MaxDistanceFromBoundary,
+ double align5DriftRateTolerance,
+ uint64_t align5MinBandExtend,
ostream& html
) {
const auto& descriptions = httpServerData.assemblerOptions->allOptionsDescription;
@@ -1775,7 +1799,9 @@ void Assembler::renderEditableAlignmentConfig(
"<input type=radio name=method value=3" <<
(method==3 ? " checked=checked" : "") << "> 3 (SeqAn, banded)<br>"
"<input type=radio name=method value=4" <<
- (method==4 ? " checked=checked" : "") << "> 4 (Experimental)"
+ (method==4 ? " checked=checked" : "") << "> 4 (Experimental)<br>"
+ "<input type=radio name=method value=5" <<
+ (method==5 ? " checked=checked" : "") << "> 5 (Experimental)"
"<td class=smaller>" << descriptions.find("Align.alignMethod", false).description();
html << "<tr><th class=left>maxSkip"
@@ -1873,6 +1899,18 @@ void Assembler::renderEditableAlignmentConfig(
"<input type=text style='text-align:center;border:none' name=align4MaxDistanceFromBoundary size=16 value=" << align4MaxDistanceFromBoundary << ">"
"<td class=smaller>" << descriptions.find("Align.align4.maxDistanceFromBoundary", false).description();
+ html << "<tr>"
+ "<th class=left>align5.driftRateTolerance"
+ "<td class=centered>"
+ "<input type=text style='text-align:center;border:none' name=align5DriftRateTolerance size=16 value=" << align5DriftRateTolerance << ">"
+ "<td class=smaller>" << descriptions.find("Align.align5.driftRateTolerance", false).description();
+
+ html << "<tr>"
+ "<th class=left>align5.minBandExtend"
+ "<td class=centered>"
+ "<input type=text style='text-align:center;border:none' name=align5MinBandExtend size=16 value=" << align5MinBandExtend << ">"
+ "<td class=smaller>" << descriptions.find("Align.align5.minBandExtend", false).description();
+
html << "</table>";
}
@@ -1928,6 +1966,12 @@ void Assembler::computeAllAlignments(
computeAllAlignmentsData.align4MaxDistanceFromBoundary = httpServerData.assemblerOptions->alignOptions.align4MaxDistanceFromBoundary;
getParameterValue(request, "align4MaxDistanceFromBoundary", computeAllAlignmentsData.align4MaxDistanceFromBoundary);
+ // Parameters for alignment method 5.
+ computeAllAlignmentsData.align5DriftRateTolerance = httpServerData.assemblerOptions->alignOptions.align5DriftRateTolerance;
+ getParameterValue(request, "align5DriftRateTolerance", computeAllAlignmentsData.align5DriftRateTolerance);
+ computeAllAlignmentsData.align5MinBandExtend = httpServerData.assemblerOptions->alignOptions.align5MinBandExtend;
+ getParameterValue(request, "align5MinBandExtend", computeAllAlignmentsData.align5MinBandExtend);
+
// Write the form.
html <<
@@ -1958,6 +2002,8 @@ void Assembler::computeAllAlignments(
computeAllAlignmentsData.align4DeltaY,
computeAllAlignmentsData.align4MinEntryCountPerCell,
computeAllAlignmentsData.align4MaxDistanceFromBoundary,
+ computeAllAlignmentsData.align5DriftRateTolerance,
+ computeAllAlignmentsData.align5MinBandExtend,
html
);
@@ -2386,6 +2432,11 @@ void Assembler::assessAlignments(
computeAllAlignmentsData. align4MaxDistanceFromBoundary = httpServerData.assemblerOptions->alignOptions.align4MaxDistanceFromBoundary;
getParameterValue(request, "align4MaxDistanceFromBoundary", computeAllAlignmentsData.align4MaxDistanceFromBoundary);
+ // Parameters for alignment method 5.
+ computeAllAlignmentsData.align5DriftRateTolerance = httpServerData.assemblerOptions->alignOptions.align5DriftRateTolerance;
+ getParameterValue(request, "align5DriftRateTolerance", computeAllAlignmentsData.align5DriftRateTolerance);
+ computeAllAlignmentsData.align5MinBandExtend = httpServerData.assemblerOptions->alignOptions.align5MinBandExtend;
+ getParameterValue(request, "align5MinBandExtend", computeAllAlignmentsData.align5MinBandExtend);
html << "<h1>Alignment statistics</h1>";
@@ -2445,6 +2496,8 @@ void Assembler::assessAlignments(
computeAllAlignmentsData.align4DeltaY,
computeAllAlignmentsData.align4MinEntryCountPerCell,
computeAllAlignmentsData.align4MaxDistanceFromBoundary,
+ computeAllAlignmentsData.align5DriftRateTolerance,
+ computeAllAlignmentsData.align5MinBandExtend,
html
);
@@ -2737,6 +2790,8 @@ void Assembler::computeAllAlignmentsThreadFunction(size_t threadId)
const uint64_t align4DeltaY = computeAllAlignmentsData.align4DeltaY;
const uint64_t align4MinEntryCountPerCell = computeAllAlignmentsData.align4MinEntryCountPerCell;
const uint64_t align4MaxDistanceFromBoundary = computeAllAlignmentsData.align4MaxDistanceFromBoundary;
+ const double align5DriftRateTolerance = computeAllAlignmentsData.align5DriftRateTolerance;
+ const uint64_t align5MinBandExtend = computeAllAlignmentsData.align5MinBandExtend;
// Vector where this thread will store the alignments it finds.
vector< pair<OrientedReadId, AlignmentInfo> >& alignments =
@@ -2822,6 +2877,13 @@ void Assembler::computeAllAlignmentsThreadFunction(size_t threadId)
alignment, alignmentInfo,
false);
SHASTA_ASSERT(byteAllocator.isEmpty());
+ } else if(method == 5) {
+ ofstream nullStream;
+ alignOrientedReads5(orientedReadId0, orientedReadId1,
+ matchScore, mismatchScore, gapScore,
+ align5DriftRateTolerance, align5MinBandExtend,
+ alignment, alignmentInfo,
+ nullStream);
} else {
SHASTA_ASSERT(0);
}
diff --git a/src/AssemblerHttpServer-AssemblyGraph.cpp b/src/AssemblerHttpServer-AssemblyGraph.cpp
index 14db868..c17de7a 100644
--- a/src/AssemblerHttpServer-AssemblyGraph.cpp
+++ b/src/AssemblerHttpServer-AssemblyGraph.cpp
@@ -4,6 +4,7 @@
#include "LocalAssemblyGraph.hpp"
#include "platformDependent.hpp"
using namespace shasta;
+using namespace mode0;
// Boost libraries.
#include <boost/algorithm/string.hpp>
diff --git a/src/AssemblerHttpServer-CompressedAssemblyGraph.cpp b/src/AssemblerHttpServer-CompressedAssemblyGraph.cpp
index c5abd04..16b8b3c 100644
--- a/src/AssemblerHttpServer-CompressedAssemblyGraph.cpp
+++ b/src/AssemblerHttpServer-CompressedAssemblyGraph.cpp
@@ -5,6 +5,7 @@
#include "runCommandWithTimeout.hpp"
#include "timestamp.hpp"
using namespace shasta;
+using namespace mode0;
// Boost libraries.
#include <boost/graph/iteration_macros.hpp>
diff --git a/src/AssemblerHttpServer-MarkerGraph.cpp b/src/AssemblerHttpServer-MarkerGraph0.cpp
index 1cd654e..d7ede14 100644
--- a/src/AssemblerHttpServer-MarkerGraph.cpp
+++ b/src/AssemblerHttpServer-MarkerGraph0.cpp
@@ -6,11 +6,12 @@
#include "Coverage.hpp"
#include "hsv.hpp"
#include "InducedAlignment.hpp"
-#include "LocalMarkerGraph.hpp"
+#include "LocalMarkerGraph0.hpp"
#include "MarkerConnectivityGraph.hpp"
#include "MurmurHash2.hpp"
#include "platformDependent.hpp"
using namespace shasta;
+using namespace mode0;
// Boost libraries.
#include <boost/algorithm/string.hpp>
@@ -31,13 +32,13 @@ using namespace shasta;
-void Assembler::exploreMarkerGraph(
+void Assembler::exploreMarkerGraph0(
const vector<string>& request,
ostream& html)
{
// Get the request parameters.
- LocalMarkerGraphRequestParameters requestParameters;
- getLocalMarkerGraphRequestParameters(request, requestParameters);
+ LocalMarkerGraph0RequestParameters requestParameters;
+ getLocalMarkerGraph0RequestParameters(request, requestParameters);
// Write the form.
html << "<h1>Display a local subgraph of the global marker graph</h3>";
@@ -72,12 +73,13 @@ void Assembler::exploreMarkerGraph(
// Create the local marker graph.
- LocalMarkerGraph graph(
+ LocalMarkerGraph0 graph(
assemblerInfo->readRepresentation,
uint32_t(assemblerInfo->k),
assemblerInfo->assemblyMode,
getReads(),
markers,
+ markerGraph,
markerGraph.vertexTable,
*consensusCaller);
const auto createStartTime = steady_clock::now();
@@ -162,7 +164,7 @@ void Assembler::exploreMarkerGraph(
// Color legend for vertices when colored by distance.
if(requestParameters.vertexColoring == "byDistance") {
html << "<h3>Color legend for vertices</h3>";
- LocalMarkerGraph::writeColorLegendVerticesByDistance(html);
+ LocalMarkerGraph0::writeColorLegendVerticesByDistance(html);
}
@@ -308,8 +310,8 @@ void Assembler::exploreMarkerGraph(
// Make the vertices clickable: Ctrl-click recenters
// the graph at that vertex, right click shows vertex details.
html << "<script>\n";
- BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph) {
- const LocalMarkerGraphVertex& vertex = graph[v];
+ BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph0) {
+ const LocalMarkerGraph0Vertex& vertex = graph[v];
SHASTA_ASSERT(!vertex.markerInfos.empty());
const string url = requestParameters.urlForVertex(vertex.vertexId);
html <<
@@ -332,10 +334,10 @@ void Assembler::exploreMarkerGraph(
// Make the edges clickable: Ctrl-click recenters
// the graph at the source vertex of that edge, right click shows edge details.
html << "<script>\n";
- BGL_FORALL_EDGES(e, graph, LocalMarkerGraph) {
- const LocalMarkerGraphEdge& edge = graph[e];
- const LocalMarkerGraph::vertex_descriptor v0 = source(e, graph);
- const LocalMarkerGraphVertex& vertex0 = graph[v0];
+ BGL_FORALL_EDGES(e, graph, LocalMarkerGraph0) {
+ const LocalMarkerGraph0Edge& edge = graph[e];
+ const LocalMarkerGraph0::vertex_descriptor v0 = source(e, graph);
+ const LocalMarkerGraph0Vertex& vertex0 = graph[v0];
const string url = requestParameters.urlForVertex(vertex0.vertexId);
html <<
"element = document.getElementById('edge" << edge.edgeId << "');\n"
@@ -365,9 +367,9 @@ void Assembler::exploreMarkerGraph(
// Extract from the request the parameters for the display
// of the local marker graph.
-void Assembler::getLocalMarkerGraphRequestParameters(
+void Assembler::getLocalMarkerGraph0RequestParameters(
const vector<string>& request,
- LocalMarkerGraphRequestParameters& parameters) const
+ LocalMarkerGraph0RequestParameters& parameters) const
{
parameters.vertexId = 0;
parameters.vertexIdIsPresent = getParameterValue(
@@ -477,7 +479,7 @@ void Assembler::getLocalMarkerGraphRequestParameters(
// highlightedOrientedReads. Each oriented read is assigned a hue
// via hashing of the OrientedReadId. This way, an oriented read
// is always highlighted in the same color.
-void LocalMarkerGraphRequestParameters::parseHighlightedOrientedReads()
+void LocalMarkerGraph0RequestParameters::parseHighlightedOrientedReads()
{
highlightedOrientedReads.clear();
if(highlightedOrientedReadsString.empty()) {
@@ -501,7 +503,7 @@ void LocalMarkerGraphRequestParameters::parseHighlightedOrientedReads()
-void LocalMarkerGraphRequestParameters::writeForm(
+void LocalMarkerGraph0RequestParameters::writeForm(
ostream& html,
MarkerGraph::VertexId vertexCount) const
{
@@ -719,7 +721,7 @@ void LocalMarkerGraphRequestParameters::writeForm(
-bool LocalMarkerGraphRequestParameters::hasMissingRequiredParameters() const
+bool LocalMarkerGraph0RequestParameters::hasMissingRequiredParameters() const
{
return
!vertexIdIsPresent ||
@@ -729,7 +731,7 @@ bool LocalMarkerGraphRequestParameters::hasMissingRequiredParameters() const
-string LocalMarkerGraphRequestParameters::vertexScalingFactorString() const
+string LocalMarkerGraph0RequestParameters::vertexScalingFactorString() const
{
if(vertexScalingFactorIsPresent) {
std::ostringstream s;
@@ -742,7 +744,7 @@ string LocalMarkerGraphRequestParameters::vertexScalingFactorString() const
-string LocalMarkerGraphRequestParameters::arrowScalingFactorString() const
+string LocalMarkerGraph0RequestParameters::arrowScalingFactorString() const
{
if(arrowScalingFactorIsPresent) {
std::ostringstream s;
@@ -755,7 +757,7 @@ string LocalMarkerGraphRequestParameters::arrowScalingFactorString() const
-string LocalMarkerGraphRequestParameters::edgeThicknessScalingFactorString() const
+string LocalMarkerGraph0RequestParameters::edgeThicknessScalingFactorString() const
{
if(edgeThicknessScalingFactorIsPresent) {
std::ostringstream s;
@@ -768,10 +770,10 @@ string LocalMarkerGraphRequestParameters::edgeThicknessScalingFactorString() con
-string LocalMarkerGraphRequestParameters::url() const
+string LocalMarkerGraph0RequestParameters::url() const
{
return
- string("exploreMarkerGraph") +
+ string("exploreMarkerGraph0") +
"?vertexId=" + to_string(vertexId) +
"&maxDistance=" + to_string(maxDistance) +
"&minVertexCoverage=" + to_string(minVertexCoverage) +
@@ -799,16 +801,16 @@ string LocalMarkerGraphRequestParameters::url() const
-string LocalMarkerGraphRequestParameters::urlForVertex(uint64_t newVertexId) const
+string LocalMarkerGraph0RequestParameters::urlForVertex(uint64_t newVertexId) const
{
- LocalMarkerGraphRequestParameters newParameters = *this;
+ LocalMarkerGraph0RequestParameters newParameters = *this;
newParameters.vertexId = newVertexId;
return newParameters.url();
}
-string LocalMarkerGraphRequestParameters::vertexLabelsString() const
+string LocalMarkerGraph0RequestParameters::vertexLabelsString() const
{
switch(vertexLabels) {
case 0: return "none";
@@ -820,7 +822,7 @@ string LocalMarkerGraphRequestParameters::vertexLabelsString() const
-string LocalMarkerGraphRequestParameters::edgeLabelsString() const
+string LocalMarkerGraph0RequestParameters::edgeLabelsString() const
{
switch(edgeLabels) {
case 0: return "none";
@@ -862,7 +864,7 @@ void Assembler::exploreMarkerGraphVertex(const vector<string>& request, ostream&
SHASTA_ASSERT(markerCount > 0);
// Get the marker sequence.
- const KmerId kmerId = markers.begin()[markerIds[0]].kmerId;
+ const KmerId kmerId = getMarkerGraphVertexKmerId(vertexId);
const size_t k = assemblerInfo->k;
const Kmer kmer(kmerId, k);
@@ -953,7 +955,7 @@ void Assembler::exploreMarkerGraphVertex(const vector<string>& request, ostream&
// Page title.
const string titleUrl =
- "exploreMarkerGraph?vertexId=" + to_string(vertexId) +
+ "exploreMarkerGraph0?vertexId=" + to_string(vertexId) +
"&maxDistance=3"
"&useWeakEdges=on"
"&usePrunedEdges=on"
@@ -1212,7 +1214,7 @@ void Assembler::exploreMarkerGraphEdge(const vector<string>& request, ostream& h
// Access the edge.
const MarkerGraph::Edge& edge = markerGraph.edges[edgeId];
array<MarkerGraph::VertexId, 2> vertexIds = {edge.source, edge.target};
- const size_t markerCount = edge.coverage;
+ const size_t markerCount = markerGraph.edgeCoverage(edgeId);
// The marker intervals of this edge.
const span<MarkerInterval> markerIntervals = markerGraph.edgeMarkerIntervals[edgeId];
@@ -1304,7 +1306,7 @@ void Assembler::exploreMarkerGraphEdge(const vector<string>& request, ostream& h
// Page title.
const string titleUrl =
- "exploreMarkerGraph?vertexId=" + to_string(vertexIds[0]) +
+ "exploreMarkerGraph0?vertexId=" + to_string(vertexIds[0]) +
"&maxDistance=3"
"&useWeakEdges=on"
"&usePrunedEdges=on"
@@ -1766,6 +1768,10 @@ void Assembler::exploreMarkerCoverage(
const bool readIdIsPresent = getParameterValue(request, "readId", readId);
Strand strand = 0;
const bool strandIsPresent = getParameterValue(request, "strand", strand);
+ uint32_t firstOrdinal = 0;
+ getParameterValue(request, "firstOrdinal", firstOrdinal);
+ uint32_t lastOrdinal = 0;
+ getParameterValue(request, "lastOrdinal", lastOrdinal);
int width = 600;
getParameterValue(request, "width", width);
int height = 400;
@@ -1781,6 +1787,10 @@ void Assembler::exploreMarkerCoverage(
"<tr><td>Strand<td class=centered>";
writeStrandSelection(html, "strand", strandIsPresent && strand==0, strandIsPresent && strand==1);
html <<
+ "<tr><td>First ordinal<td class=centered>"
+ "<input type=text name=firstOrdinal style='text-align:center' size=8 value='" << firstOrdinal << "'>"
+ "<tr><td>Last ordinal<br>(0 for unlimited)<td class=centered>"
+ "<input type=text name=lastOrdinal style='text-align:center' size=8 value='" << lastOrdinal << "'>"
"<tr><td>Plot width<td class=centered>"
"<input type=text name=width style='text-align:center' size=8 value='" << width << "'>"
"<tr><td>Plot height<td class=centered>"
@@ -1805,7 +1815,11 @@ void Assembler::exploreMarkerCoverage(
"plot '-' with points pointtype 7 pointsize 0.5 linecolor rgb '#0000ff' notitle\n";
const uint32_t markerCount = uint32_t(markers.size(orientedReadId.getValue()));
- for(uint32_t ordinal=0; ordinal<markerCount; ordinal++) {
+ if(lastOrdinal == 0) {
+ lastOrdinal = markerCount - 1;
+ }
+ SHASTA_ASSERT(lastOrdinal >= firstOrdinal);
+ for(uint32_t ordinal=firstOrdinal; ordinal<=lastOrdinal; ordinal++) {
const MarkerGraph::VertexId vertexId =
getGlobalMarkerGraphVertex(orientedReadId, ordinal);
if(vertexId == MarkerGraph::invalidCompressedVertexId) {
@@ -2085,6 +2099,10 @@ void Assembler::exploreMarkerConnectivity(
const bool ordinalIsPresent = getParameterValue(request, "ordinal", ordinal);
string whichAlignments = "ReadGraphAlignments";
getParameterValue(request, "whichAlignments", whichAlignments);
+ string labelsString;
+ const bool labels = getParameterValue(request, "labels", labelsString);
+ double timeout = 30;
+ getParameterValue(request, "timeout", timeout);
// Write the form.
html <<
@@ -2107,7 +2125,14 @@ void Assembler::exploreMarkerConnectivity(
html << "<br><input type=radio name=whichAlignments value=ReadGraphAlignments" <<
(whichAlignments=="ReadGraphAlignments" ? " checked=checked" : "") <<
"> Only use alignments in the read graph.";
- html << "</form>";
+ html << "<br><input type=checkbox name=labels" <<
+ (labels ? " checked" : "") <<
+ "> Labels"
+ "<br>Timeout (seconds) for graph layout"
+ " <input type=text required name=timeout size=8 style='text-align:center'" <<
+ " value='" << timeout <<
+ "'>"
+ "</form>";
const bool useReadGraphAlignmentsOnly = (whichAlignments == "ReadGraphAlignments");
// If the required parameters are missing, stop here.
@@ -2139,6 +2164,10 @@ void Assembler::exploreMarkerConnectivity(
++frequencyMap[orientedReadId];
}
+ html << "<br>The marker connectivity graph has " <<
+ num_vertices(graph) << " vertices and " <<
+ num_edges(graph) << " edges.";
+
// Write the graph out in graphviz format.
const string uuid = to_string(boost::uuids::random_generator()());
@@ -2149,15 +2178,19 @@ void Assembler::exploreMarkerConnectivity(
const MarkerDescriptor markerDescriptor = graph[v];
const OrientedReadId orientedReadId1 = markerDescriptor.first;
const uint32_t ordinal1 = markerDescriptor.second;
- dotFile << "\"" << orientedReadId1 << "-" << ordinal1 << "\""
- " [label=\"" << orientedReadId1 << "\\n" << ordinal1 <<
- "\"";
- if(frequencyMap[orientedReadId1] != 1) {
- dotFile << " style=filled fillcolor=pink";
- } else {
- dotFile << " style=filled fillcolor=cornsilk";
+ dotFile << "\"" << orientedReadId1 << "-" << ordinal1 << "\"";
+ if(labels) {
+ dotFile <<
+ " [label=\"" << orientedReadId1 << "\\n" << ordinal1 <<
+ "\"";
+ if(frequencyMap[orientedReadId1] != 1) {
+ dotFile << " style=filled fillcolor=pink";
+ } else {
+ dotFile << " style=filled fillcolor=cornsilk";
+ }
+ dotFile << "]";
}
- dotFile << "];\n";
+ dotFile << ";\n";
}
BGL_FORALL_EDGES(e, graph, MarkerConnectivityGraph) {
const auto v0 = source(e, graph);
@@ -2174,8 +2207,9 @@ void Assembler::exploreMarkerConnectivity(
// Use graphviz to render it to svg.
- const string command = timeoutCommand() + " 30 sfdp -O -T svg " + dotFileName +
- " -Goverlap=false -Gsplines=true -Gsmoothing=triangle";
+ const string command = timeoutCommand() + " " + to_string(int(timeout)) + " sfdp -O -T svg " + dotFileName +
+ ( labels ? " -Goverlap=false -Gsplines=true -Gsmoothing=triangle" :
+ " -Nshape=point -Gsize=10 -Gratio=expand -Epenwidth=0.4");
const int commandStatus = ::system(command.c_str());
if(WIFEXITED(commandStatus)) {
const int exitStatus = WEXITSTATUS(commandStatus);
diff --git a/src/AssemblerHttpServer-MarkerGraph1.cpp b/src/AssemblerHttpServer-MarkerGraph1.cpp
new file mode 100644
index 0000000..f2e536c
--- /dev/null
+++ b/src/AssemblerHttpServer-MarkerGraph1.cpp
@@ -0,0 +1,667 @@
+// Shasta.
+#include "Assembler.hpp"
+#include "html.hpp"
+#include "invalid.hpp"
+#include "LocalMarkerGraph1.hpp"
+#include "platformDependent.hpp"
+#include "Reads.hpp"
+using namespace shasta;
+
+// Boost libraries.
+#include <boost/uuid/uuid.hpp>
+#include <boost/uuid/uuid_generators.hpp>
+#include <boost/uuid/uuid_io.hpp>
+
+// Standard library.
+#include "fstream.hpp"
+
+
+
+void Assembler::exploreMarkerGraph1(
+ const vector<string>& request,
+ ostream& html)
+{
+ if(assemblerInfo->assemblyMode != 3) {
+ throw runtime_error("This is only available for assembly mode 3.");
+ }
+
+ // This makes the following assumptions.
+ SHASTA_ASSERT(getReads().representation == 0); // No RLE.
+ SHASTA_ASSERT((assemblerInfo->k % 2) == 0); // Marker length is even.
+
+
+ // Get the request parameters.
+ uint64_t vertexId = invalid<uint64_t>;
+ getParameterValue(request, "vertexId", vertexId);
+
+ uint64_t maxDistance = 2;
+ getParameterValue( request, "maxDistance", maxDistance);
+
+ uint64_t minVertexCoverage = 0;
+ getParameterValue(request, "minVertexCoverage", minVertexCoverage);
+
+ uint64_t minEdgeCoverage = 0;
+ getParameterValue(request, "minEdgeCoverage", minEdgeCoverage);
+
+ uint64_t maxPruneCoverage = 0;
+ getParameterValue(request, "maxPruneCoverage", maxPruneCoverage);
+
+ uint64_t maxLongChainCoverage = 0;
+ getParameterValue(request, "maxLongChainCoverage", maxLongChainCoverage);
+
+ uint64_t minLongChainLength = 100;
+ getParameterValue(request, "minLongChainLength", minLongChainLength);
+
+ uint64_t sizePixels = 600;
+ getParameterValue(request, "sizePixels", sizePixels);
+
+ double thicknessScaling = 1.;
+ getParameterValue(request, "thicknessScaling", thicknessScaling);
+
+ uint64_t layoutQuality = 2;
+ getParameterValue(request, "layoutQuality", layoutQuality);
+
+ double edgeResolution = 1.;
+ getParameterValue(request, "edgeResolution", edgeResolution);
+
+ uint64_t redCoverage = 1;
+ getParameterValue(request, "redCoverage", redCoverage);
+
+ uint64_t greenCoverage = 5;
+ getParameterValue(request, "greenCoverage", greenCoverage);
+
+ string coloring;
+ getParameterValue(request, "coloring", coloring);
+
+ uint64_t readFollowingStartEdgeId = 0;
+ getParameterValue(request, "readFollowingStartEdgeId", readFollowingStartEdgeId);
+
+ int64_t firstMarkerOffset = 0;
+ getParameterValue(request, "firstMarkerOffset", firstMarkerOffset);
+
+ int64_t lastMarkerOffset = 0;
+ getParameterValue(request, "lastMarkerOffset", lastMarkerOffset);
+
+ string showLabelsString;
+ const bool showLabels = getParameterValue(request, "showLabels", showLabelsString);
+
+ double timeout = 30;
+ getParameterValue(request, "timeout", timeout);
+
+ string outputType = "svg";
+ getParameterValue(request, "outputType", outputType);
+
+
+ // Write the form.
+ html <<
+ "<form>"
+
+ "<h2>Local marker graph</h2>"
+ "<table>"
+
+ "<tr>"
+ "<td>Start vertex id"
+ "<td class=centered><input type=text required name=vertexId size=8 style='text-align:center'"
+ << ((vertexId == invalid<uint64_t>) ? "" : ("value='" + to_string(vertexId) + "'")) <<
+ ">"
+
+ "<tr title='Maximum distance from start vertex (number of edges)'>"
+ "<td>Maximum distance"
+ "<td class=centered><input type=text required name=maxDistance size=8 style='text-align:center'"
+ "value='" << maxDistance << "'>"
+
+ "<tr>"
+ "<td>Minimum vertex coverage"
+ "<td class=centered><input type=text required name=minVertexCoverage size=8 style='text-align:center'"
+ "value='" << minVertexCoverage << "'>"
+
+ "<tr>"
+ "<td>Minimum edge coverage"
+ "<td class=centered><input type=text required name=minEdgeCoverage size=8 style='text-align:center'"
+ "value='" << minEdgeCoverage << "'>"
+
+ "<tr>"
+ "<td>Prune leaves with coverage up to"
+ "<td class=centered><input type=text required name=maxPruneCoverage size=8 style='text-align:center'"
+ "value='" << maxPruneCoverage << "'>"
+
+ "<tr>"
+ "<td>Prune long linear sections<br>with low coverage"
+ "<td>"
+ "<input type=text required name=maxLongChainCoverage size=8 style='text-align:center'"
+ "value='" << maxLongChainCoverage << "'> Maximum coverage"
+ "<br><input type=text required name=minLongChainLength size=8 style='text-align:center'"
+ "value='" << minLongChainLength << "'> Minimum length (markers)"
+
+ "<tr>"
+ "<td>Graphics size in pixels"
+ "<td class=centered><input type=text required name=sizePixels size=8 style='text-align:center'"
+ " value='" << sizePixels << "'>"
+
+ "<tr>"
+ "<td>Thickness scaling factor"
+ "<td class=centered><input type=text required name=thicknessScaling size=8 style='text-align:center'"
+ " value='" << thicknessScaling << "'>"
+
+ "<tr>"
+ "<td>Layout quality"
+ "<td class=centered>"
+ "<select name=layoutQuality style='text-align:center'>"
+ "<option value=0" << (layoutQuality==0 ? " selected" : "") <<
+ ">Best speed</option>"
+ "<option value=1" << (layoutQuality==1 ? " selected" : "") <<
+ ">Intermediate quality and speed</option>"
+ "<option value=2" << (layoutQuality==2 ? " selected" : "") <<
+ ">Best quality</option>"
+ "</select>"
+
+ "<tr>"
+ "<td>Edge resolution ";
+ writeInformationIcon(html, "Affects edge smoothness and speed of layout computation.");
+
+ html <<
+ "<td class=centered><input type=text required name=edgeResolution size=8 style='text-align:center'"
+ " value='" << edgeResolution << "'>"
+
+ "<tr>"
+ "<td>Coloring"
+ "<td>"
+ "<select name=coloring style='text-align:center'>"
+ "<option value=random" << (coloring == "random" ? " selected" : "") <<
+ ">Random</option>"
+ "<option value=byCoverage" << (coloring == "byCoverage" ? " selected" : "") <<
+ ">By coverage</option>"
+ "<option value=readFollowing" << (coloring == "readFollowing" ? " selected" : "") <<
+ ">Read following</option>"
+ "</select>"
+ "<br><input type=text required name=redCoverage size=8 style='text-align:center'"
+ " value='" << redCoverage << "'> Red coverage"
+ "<br><input type=text required name=greenCoverage size=8 style='text-align:center'"
+ " value='" << greenCoverage << "'> Green coverage"
+ "<hr><span style='text-align:center'>Read following</span>"
+ "<br><input type=text required name=readFollowingStartEdgeId size=8 style='text-align:center'"
+ " value='" << readFollowingStartEdgeId << "'> Start edge for read following"
+ "<br><input type=text required name=firstMarkerOffset size=8 style='text-align:center'"
+ " value='" << firstMarkerOffset << "'> First marker offset"
+ "<br><input type=text required name=lastMarkerOffset size=8 style='text-align:center'"
+ " value='" << lastMarkerOffset << "'> Last marker offset"
+
+ "<tr>"
+ "<td>Show labels"
+ "<td class=centered><input type=checkbox name=showLabels" <<
+ (showLabels ? " checked" : "") <<
+ ">"
+
+ "<tr>"
+ "<td>Timeout in seconds"
+ "<td class=centered><input type=text required name=timeout size=8 style='text-align:center'"
+ " value='" << timeout << "'>"
+
+ "<tr>"
+ "<td>Output"
+ "<td>"
+ "<input type=radio name=outputType value='noOutput'" <<
+ (outputType == "noOutput" ? " checked=on" : "") <<
+ ">Show the number of vertices and edges"
+ "<br><input type=radio name=outputType value='createGfa'" <<
+ (outputType == "createGfa" ? " checked=on" : "") <<
+ ">Create a GFA file"
+ "<br><input type=radio name=outputType value='createAndOpenGfa'" <<
+ (outputType == "createAndOpenGfa" ? " checked=on" : "") <<
+ ">Create a GFA file and open it in Bandage";
+
+ html <<
+ "<br><input type=radio name=outputType value='fastCanvas'" <<
+ (outputType == "fastCanvas" ? " checked=on" : "") <<
+ ">Display vertices only, not interactive ";
+ writeInformationIcon(html, "The fastest choice. "
+ "Fast display with one pixel per vertex and no edges, done using canvas. "
+ "Best for large subgraphs.");
+
+ html <<
+ "<br><input type=radio name=outputType value='fastSvg'" <<
+ (outputType == "fastSvg" ? " checked=on" : "") <<
+ ">Display vertices only, interactive ";
+ writeInformationIcon(html, "Fast display with one pixel per vertex and no edges, done using svg.");
+
+ html <<
+ "<br><input type=radio name=outputType value='svg'" <<
+ (outputType == "svg" ? " checked=on" : "") <<
+ ">Display vertices and edges, interactive ";
+
+
+ html <<
+ "</table>"
+
+ "<br><input type=submit value='Do it'>"
+ "</form>";
+
+
+ // If the vertex id was not specified, stop here.
+ if(vertexId == invalid<uint64_t>) {
+ return;
+ }
+
+ // If the vertex id is invalid, stop here.
+ if(vertexId > markerGraph.vertexCount()) {
+ html << "<p>Invalid vertex id " << vertexId;
+ html << ". Must be between 0 and " << markerGraph.vertexCount()-1 << " inclusive.";
+ return;
+ }
+
+
+
+ // Create the local marker graph.
+ LocalMarkerGraph1 graph(
+ markers,
+ markerGraph,
+ vertexId,
+ maxDistance,
+ minVertexCoverage,
+ minEdgeCoverage);
+
+ // Do the requested graph cleanup.
+ if(maxPruneCoverage > 0) {
+ graph.pruneLowCoverageLeaves(maxPruneCoverage);
+ }
+ if(maxLongChainCoverage > 0) {
+ graph.removeLongLowCoverageChains(maxLongChainCoverage, minLongChainLength);
+ }
+
+ html << "<p>The local marker graph has " << num_vertices(graph) <<
+ " vertices and " << num_edges(graph) << " edges.";
+
+
+ if(outputType == "noOutput") {
+ return;
+ }
+
+ if(outputType == "fastCanvas") {
+ graph.writeHtml0(html, sizePixels, layoutQuality, timeout, false);
+ }
+
+ else if(outputType == "fastSvg") {
+ graph.writeHtml0(html, sizePixels, layoutQuality, timeout, true);
+ }
+
+ else if(outputType == "svg") {
+ graph.writeHtml1(html, sizePixels, thicknessScaling, layoutQuality, edgeResolution,
+ coloring, redCoverage, greenCoverage,
+ readFollowingStartEdgeId, firstMarkerOffset, lastMarkerOffset,
+ showLabels,
+ timeout);
+ }
+
+ else {
+
+ // Create a gfa file to represent the local marker graph.
+ const string gfaFileName = tmpDirectory() + to_string(boost::uuids::random_generator()()) + ".gfa";
+ graph.writeGfa(gfaFileName);
+ html << "<p>The local marker graph is in "
+ "<span id='SpanToBeCopied' style='color:Blue'>" << gfaFileName << "</span>"
+ ". Remove it when done with it."
+ "<br><button onClick='copySpanToClipboard()'>Copy GFA file name to clipboard</button>";
+ html << R"###(
+ <script>
+ function copySpanToClipboard()
+ {
+
+ // Remove any previous selection.
+ var selection = window.getSelection();
+ selection.removeAllRanges();
+
+ // Select the span.
+ var element = document.getElementById("SpanToBeCopied");
+ var range = document.createRange();
+ range.selectNodeContents(element);
+ selection.addRange(range);
+
+ // Copy it to the clipboard.
+ document.execCommand("copy");
+
+ // Unselect it.
+ selection.removeAllRanges();
+
+
+ }
+ </script>
+ )###";
+
+
+ // If requested, open it in Bandage.
+ // This is done on the server side, of course. This can have unexpected
+ // consequences if running remotely.
+ // Also, because of this the connection with the http client is not closed
+ // until Bandage terminates, so the browser thinks ore data are coming.
+ if(outputType == "createAndOpenGfa") {
+ ::system(("Bandage load " + gfaFileName + "&").c_str());
+ }
+ }
+}
+
+
+void Assembler::exploreMarkerGraphEdgePair(
+ const vector<string>& request,
+ ostream& html)
+{
+ // Check that our assumptions are satisfied.
+ if(assemblerInfo->assemblyMode != 3) {
+ throw runtime_error("This is only available for assembly mode 3.");
+ }
+ SHASTA_ASSERT(getReads().representation == 0); // No RLE.
+ SHASTA_ASSERT((assemblerInfo->k % 2) == 0); // Marker length is even.
+
+ // Get the parameters for the request
+ uint64_t edgeIdA = invalid<uint64_t>;
+ getParameterValue(request, "edgeIdA", edgeIdA);
+
+ uint64_t edgeIdB = invalid<uint64_t>;
+ getParameterValue(request, "edgeIdB", edgeIdB);
+
+ // Write the form.
+ html <<
+ "<form>"
+ "<table>"
+ "<tr><td class=centered>Edge A<td class=centered>"
+ "<input type=text required name=edgeIdA size=8 style='text-align:center' " <<
+ ((edgeIdA == invalid<uint64_t>) ? "" : ("value='" + to_string(edgeIdA) + "'")) << ">"
+ "<tr><td class=centered>Edge B<td class=centered>"
+ "<input type=text required name=edgeIdB size=8 style='text-align:center' " <<
+ ((edgeIdB == invalid<uint64_t>) ? "" : ("value='" + to_string(edgeIdB) + "'")) << ">"
+ "</table>"
+ "<br><input type=submit value='Do it'>"
+ "</form>";
+
+    // If the edge ids are missing, do nothing.
+ if(edgeIdA == invalid<uint64_t> or edgeIdB == invalid<uint64_t>) {
+ return;
+ }
+
+ // Sanity checks on the edge ids.
+ if(edgeIdA >= markerGraph.edges.size()) {
+ throw runtime_error("Marker graph edge " + to_string(edgeIdA) +
+ " is not valid. Maximum valid edge id is " + to_string(markerGraph.edges.size()));
+ }
+ if(edgeIdB >= markerGraph.edges.size()) {
+ throw runtime_error("Marker graph edge " + to_string(edgeIdB) +
+ " is not valid. Maximum valid edge id is " + to_string(markerGraph.edges.size()));
+ }
+
+ // Sanity check that the two edges are distinct.
+ if(edgeIdA == edgeIdB) {
+ html << "Specify two distinct edges.";
+ return;
+ }
+
+ // This analysis can only be done if both edges have no duplicate OrientedReadIds
+ // in their MarkerIntervals.
+ if(markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdA)) {
+ html << "Marker graph edge " << edgeIdA << " has duplicate oriented reads.";
+ return;
+ }
+ if(markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdB)) {
+ html << "Marker graph edge " << edgeIdB << " has duplicate oriented reads.";
+ return;
+ }
+
+ // Write a header.
+ html << "<h1>Read composition analysis for marker graph edges " << edgeIdA <<
+ " and " << edgeIdB << "</h1>";
+
+ // Analyze read composition.
+ MarkerGraphEdgePairInfo info;
+ SHASTA_ASSERT(analyzeMarkerGraphEdgePair(edgeIdA, edgeIdB, info));
+ writeHtmlMarkerGraphEdgePairInfo(html, edgeIdA, edgeIdB, info);
+
+ if( markerGraph.edges[edgeIdA].isPrimary==1 and
+ markerGraph.edges[edgeIdB].isPrimary==1 and
+ info.common == 0) {
+ const uint64_t estimatedOffset = estimateBaseOffsetUnsafe(edgeIdA, edgeIdB);
+ if(estimatedOffset != invalid<uint64_t>) {
+ html << "<p>Estimated offset is " << estimatedOffset << " bases.";
+ }
+ }
+}
+
+
+
+void Assembler::writeHtmlMarkerGraphEdgePairInfo(
+ ostream& html,
+ MarkerGraphEdgeId edgeIdA,
+ MarkerGraphEdgeId edgeIdB,
+ const MarkerGraphEdgePairInfo& info
+ ) const
+{
+ // Begin the summary table.
+ html <<
+ "<table>"
+ "<tr><th><th>On<br>edge A<th>On<br>edge B";
+
+ // Total.
+ html <<
+ "<tr><th class=left>Total ";
+ writeInformationIcon(html, "The total number of oriented reads on each of the two edges.");
+ html << "<td class=centered>" << info.totalA << "<td class=centered>" << info.totalB;
+
+ // Common.
+ html << "<tr><th class=left>Common ";
+ writeInformationIcon(html, "The number of common oriented reads between the two edges.");
+ html <<
+ "<td class=centered colspan = 2>" << info.common;
+
+ // Only.
+ html <<
+ "<tr><th class=left>Only ";
+ writeInformationIcon(html, "The number of oriented reads that appear in one edge but not the other.");
+ html <<
+ "<td class=centered>" << info.onlyA << "<td class=centered>" << info.onlyB;
+
+ // The rest can only be written if there are common reads.
+ if(info.common > 0) {
+
+ // Only, short.
+ html <<
+ "<tr><th class=left>Only, short ";
+ writeInformationIcon(html, "The number of oriented reads that appear in one edge only "
+ " and are too short to appear on the other edge, based on the estimated base offset.");
+ html <<
+ "<td class=centered>" << info.onlyAShort << "<td class=centered>" << info.onlyBShort;
+
+ // Only, missing.
+ html <<
+ "<tr><th class=left>Only, missing ";
+ writeInformationIcon(html, "The number of oriented reads that appear in one edge only "
+ " and are not too short to appear on the other edge, based on the estimated base offset.");
+ html <<
+ "<td class=centered>" << info.onlyA - info.onlyAShort << "<td class=centered>" << info.onlyB - info.onlyBShort;
+ }
+
+ // End the summary table.
+ html << "</table>";
+
+ // Only write out the rest if there are common reads.
+ if(info.common == 0) {
+ return;
+ }
+
+ // Write the table with Jaccard similarities and estimated offsets.
+ using std::fixed;
+ using std::setprecision;
+ html <<
+ "<br><table>"
+ "<tr><th class=left>Jaccard similarity<td class=centered>" <<
+ fixed << setprecision(2) << info.jaccard() <<
+ "<tr><th class=left>Corrected Jaccard similarity<td class=centered>" <<
+ fixed << setprecision(2) << info.correctedJaccard() <<
+ "<tr><th class=left>Estimated offset in markers<td class=centered>" << info.offsetInMarkers <<
+ "<tr><th class=left>Estimated offset in bases<td class=centered>" << info.offsetInBases <<
+ "</table>";
+
+
+ // Write the details table.
+ html <<
+ "<br>In the following table, positions in red are hypothetical, based on the above "
+ "estimated base offset."
+ "<p><table>";
+
+ // Header row.
+ html <<
+ "<tr>"
+ "<th class=centered rowspan=2>Oriented<br>read id"
+ "<th class=centered colspan=2>Length"
+ "<th colspan=4>Edge A"
+ "<th colspan=4>Edge B"
+ "<th rowspan=2>Ordinal offset"
+ "<th rowspan=2>Base offset"
+ "<th rowspan=2>Classification"
+ "<tr>"
+ "<th>Markers"
+ "<th>Bases"
+ "<th>Ordinal0"
+ "<th>Ordinal1"
+ "<th>Position0"
+ "<th>Position1"
+ "<th>Ordinal0"
+ "<th>Ordinal1"
+ "<th>Position0"
+ "<th>Position1";
+
+ // Prepare for the joint loop over OrientedReadIds of the two edges.
+ const auto markerIntervalsA = markerGraph.edgeMarkerIntervals[edgeIdA];
+ const auto markerIntervalsB = markerGraph.edgeMarkerIntervals[edgeIdB];
+ const auto beginA = markerIntervalsA.begin();
+ const auto beginB = markerIntervalsB.begin();
+ const auto endA = markerIntervalsA.end();
+ const auto endB = markerIntervalsB.end();
+
+ // Joint loop over the MarkerIntervals of the two edges.
+ auto itA = beginA;
+ auto itB = beginB;
+ while(true) {
+ if(itA == endA and itB == endB) {
+ break;
+ }
+
+ else if(itB == endB or ((itA!=endA) and (itA->orientedReadId < itB->orientedReadId))) {
+ // This oriented read only appears in edge A.
+ const OrientedReadId orientedReadId = itA->orientedReadId;
+ const auto orientedReadMarkers = markers[orientedReadId.getValue()];
+ const int64_t lengthInBases = int64_t(getReads().getReadRawSequenceLength(orientedReadId.getReadId()));
+
+ // Get the positions of edge A in this oriented read.
+ const uint32_t ordinalA0 = itA->ordinals[0];
+ const uint32_t ordinalA1 = itA->ordinals[1];
+ const int64_t positionA0 = int64_t(orientedReadMarkers[ordinalA0].position);
+ const int64_t positionA1 = int64_t(orientedReadMarkers[ordinalA1].position);
+
+ // Find the hypothetical positions of edge B, assuming the estimated base offset.
+ const int64_t positionB0 = positionA0 + info.offsetInBases;
+ const int64_t positionB1 = positionA1 + info.offsetInBases;
+ const bool isShort = positionB0<0 or positionB1 >= lengthInBases;
+
+ html <<
+ "<tr><td class=centered>"
+ "<a href='exploreRead?readId=" << orientedReadId.getReadId() <<
+ "&strand=" << orientedReadId.getStrand() << "'>" << orientedReadId << "</a>"
+ "<td class=centered>" << orientedReadMarkers.size() <<
+ "<td class=centered>" << lengthInBases <<
+ "<td class=centered>" << ordinalA0 <<
+ "<td class=centered>" << ordinalA1 <<
+ "<td class=centered>" << positionA0 <<
+ "<td class=centered>" << positionA1 <<
+ "<td><td>"
+ "<td class=centered style='color:Red'>" << positionB0 <<
+ "<td class=centered style='color:Red'>" << positionB1 << "<td><td>"
+ "<td class=centered>OnlyA, " << (isShort ? "short" : "missing");
+
+ ++itA;
+ continue;
+ }
+
+ else if(itA == endA or ((itB!=endB) and (itB->orientedReadId < itA->orientedReadId))) {
+ // This oriented read only appears in edge B.
+ const OrientedReadId orientedReadId = itB->orientedReadId;
+ const auto orientedReadMarkers = markers[orientedReadId.getValue()];
+ const int64_t lengthInBases = int64_t(getReads().getReadRawSequenceLength(orientedReadId.getReadId()));
+
+ // Get the positions of edge B in this oriented read.
+ const uint32_t ordinalB0 = itB->ordinals[0];
+ const uint32_t ordinalB1 = itB->ordinals[1];
+ const int64_t positionB0 = int64_t(orientedReadMarkers[ordinalB0].position);
+ const int64_t positionB1 = int64_t(orientedReadMarkers[ordinalB1].position);
+
+ // Find the hypothetical positions of edge A, assuming the estimated base offset.
+ const int64_t positionA0 = positionB0 - info.offsetInBases;
+ const int64_t positionA1 = positionB1 - info.offsetInBases;
+ const bool isShort = positionA0<0 or positionA1 >= lengthInBases;
+
+ html <<
+ "<tr><td class=centered>"
+ "<a href='exploreRead?readId=" << orientedReadId.getReadId() <<
+ "&strand=" << orientedReadId.getStrand() << "'>" << orientedReadId << "</a>"
+ "<td class=centered>" << orientedReadMarkers.size() <<
+ "<td class=centered>" << lengthInBases <<
+ "<td><td>"
+ "<td class=centered style='color:Red'>" << positionA0 <<
+ "<td class=centered style='color:Red'>" << positionA1 <<
+ "<td class=centered>" << ordinalB0 <<
+ "<td class=centered>" << ordinalB1 <<
+ "<td class=centered>" << positionB0 <<
+ "<td class=centered>" << positionB1 << "<td><td>"
+ "<td class=centered>OnlyB, " << (isShort ? "short" : "missing");
+
+ ++itB;
+ continue;
+ }
+
+ else {
+ // This oriented read appears in both edges.
+ const OrientedReadId orientedReadId = itA->orientedReadId;
+ const auto orientedReadMarkers = markers[orientedReadId.getValue()];
+ const int64_t lengthInBases = int64_t(getReads().getReadRawSequenceLength(orientedReadId.getReadId()));
+
+ // Get the positions of edge A in this oriented read.
+ const uint32_t ordinalA0 = itA->ordinals[0];
+ const uint32_t ordinalA1 = itA->ordinals[1];
+ const int64_t positionA0 = int64_t(orientedReadMarkers[ordinalA0].position);
+ const int64_t positionA1 = int64_t(orientedReadMarkers[ordinalA1].position);
+
+ // Get the positions of edge B in this oriented read.
+ const uint32_t ordinalB0 = itB->ordinals[0];
+ const uint32_t ordinalB1 = itB->ordinals[1];
+ const int64_t positionB0 = int64_t(orientedReadMarkers[ordinalB0].position);
+ const int64_t positionB1 = int64_t(orientedReadMarkers[ordinalB1].position);
+
+ // Compute estimated offsets.
+ const int64_t ordinalOffset = uint64_t(ordinalB1) - uint64_t(ordinalA0);
+ const int64_t baseOffset = positionB1 - positionA0;
+
+ html <<
+ "<tr><td class=centered>"
+ "<a href='exploreRead?readId=" << orientedReadId.getReadId() <<
+ "&strand=" << orientedReadId.getStrand() << "'>" << orientedReadId << "</a>"
+ "<td class=centered>" << orientedReadMarkers.size() <<
+ "<td class=centered>" << lengthInBases <<
+ "<td class=centered>" << ordinalA0 <<
+ "<td class=centered>" << ordinalA1 <<
+ "<td class=centered>" << positionA0 <<
+ "<td class=centered>" << positionA1 <<
+ "<td class=centered>" << ordinalB0 <<
+ "<td class=centered>" << ordinalB1 <<
+ "<td class=centered>" << positionB0 <<
+ "<td class=centered>" << positionB1 <<
+ "<td class=centered>" << ordinalOffset <<
+ "<td class=centered>" << baseOffset <<
+ "<td class=centered>Common";
+
+ ++itA;
+ ++itB;
+ }
+ }
+
+ // Finish the details table.
+ html << "</table>";
+
+
+}
+
diff --git a/src/AssemblerHttpServer-Mode3.cpp b/src/AssemblerHttpServer-Mode3.cpp
deleted file mode 100644
index 753d5dc..0000000
--- a/src/AssemblerHttpServer-Mode3.cpp
+++ /dev/null
@@ -1,996 +0,0 @@
-// Shasta.
-#include "Assembler.hpp"
-#include "assembleMarkerGraphPath.hpp"
-#include "mode3.hpp"
-#include "mode3-AssemblyPath.hpp"
-#include "mode3-LocalAssemblyGraph.hpp"
-#include "mode3-SegmentPairInformation.hpp"
-#include "PngImage.hpp"
-using namespace shasta;
-using namespace mode3;
-
-// Boost library.
-#include <boost/icl/discrete_interval.hpp>
-#include <boost/icl/right_open_interval.hpp>
-
-// Standard library.
-#include "fstream.hpp"
-
-
-void Assembler::exploreMode3AssemblyGraph(
- const vector<string>& request,
- ostream& html)
-{
- SHASTA_ASSERT(assemblyGraph3Pointer);
-
- // Get the parameters for the request.
- mode3::LocalAssemblyGraph::SvgOptions options(request);
-
- uint64_t maxDistance = 2;
- getParameterValue(request, "maxDistance", maxDistance);
-
- uint64_t startSegmentId;
- const bool startSegmentIdIsPresent = getParameterValue(request, "startSegmentId", startSegmentId);
-
- double timeout = 30.;
- getParameterValue(request, "timeout", timeout);
-
-
-
- // Write the form.
- html <<
- "<h2>Display the local assembly graph near a given segment</h2>"
- "<form>"
- "<table>"
-
- "<tr>"
- "<td>Start segment"
- "<td class=centered><input type=text required name=startSegmentId size=8 style='text-align:center'"
- " value='" << (startSegmentIdIsPresent ? to_string(startSegmentId) : "") <<
- "'>"
-
- "<tr>"
- "<td>Maximum distance in the assembly graph (edges)"
- "<td class=centered><input type=text name=maxDistance size=8 style='text-align:center'"
- " value='" << maxDistance <<
- "'>"
-
- "<tr>"
- "<td>Timeout for graph layout (seconds)"
- "<td class=centered><input type=text name=timeout size=8 style='text-align:center'"
- " value='" << timeout <<
- "'>";
-
- options.addFormRows(html);
-
- html <<
- "</table>"
- "<br><input type=submit value='Display'>"
- "</form>";
-
-
-
- if(not startSegmentIdIsPresent) {
- return;
- }
-
- if(startSegmentId >= assemblyGraph3Pointer->markerGraphPaths.size()) {
- html << "<p>Invalid start segment id. Maximum valid value is " <<
- assemblyGraph3Pointer->markerGraphPaths.size() - 1;
- return;
- }
- if(options.referenceSegmentId >= assemblyGraph3Pointer->markerGraphPaths.size()) {
- html << "<p>Invalid reference segment id. Maximum valid value is " <<
- assemblyGraph3Pointer->markerGraphPaths.size() - 1;
- return;
- }
-
-
- html << "<h1>Local assembly graph near segment " << startSegmentId << "</h1></p>";
-
-
-
- // Create the local assembly graph, or reuse the last one, if possible.
- static shared_ptr<mode3::LocalAssemblyGraph> lastLocalAssemblyGraphPointer;
- static shared_ptr<mode3::LocalAssemblyGraph::SvgOptions> lastOptions;
- static uint64_t lastStartSegmentId = invalid<uint64_t>;
- static uint64_t lastMaxDistance = invalid<uint64_t>;
- const bool canReuse =
- lastLocalAssemblyGraphPointer and
- (startSegmentId == lastStartSegmentId) and
- (maxDistance == lastMaxDistance) and
- options.hasSameLayoutOptions(*lastOptions);
- if(canReuse) {
- cout << "Reusing the previous mode3::LocalAssemblyGraph." << endl;
- } else {
- lastLocalAssemblyGraphPointer = make_shared<mode3::LocalAssemblyGraph>(
- markerGraph,
- *assemblyGraph3Pointer,
- startSegmentId, maxDistance);
- lastOptions = make_shared<mode3::LocalAssemblyGraph::SvgOptions>(options);
- lastStartSegmentId = startSegmentId;
- lastMaxDistance = maxDistance;
- lastLocalAssemblyGraphPointer->computeLayout(options, timeout);
- lastLocalAssemblyGraphPointer->computeSegmentTangents();
- }
- mode3::LocalAssemblyGraph& localAssemblyGraph = *lastLocalAssemblyGraphPointer;
-
- html << "<p>The local assembly graph has " <<
- num_vertices(localAssemblyGraph) << " segments and " <<
- num_edges(localAssemblyGraph) << " links."
- "<p>";
-
-
-
- // Display the local assembly graph.
- localAssemblyGraph.writeHtml(html, options);
-
- // To facilitate debugging and testing, also write a gfa file
- // that represents the LocalAssemblyGraph.
- localAssemblyGraph.writeGfa("LocalAssemblyGraph.gfa");
-
-}
-
-
-
-void Assembler::exploreMode3AssemblyGraphSegment(
- const vector<string>& request,
- ostream& html)
-{
- SHASTA_ASSERT(assemblyGraph3Pointer);
- const mode3::AssemblyGraph& assemblyGraph3 = *assemblyGraph3Pointer;
-
-
-
- // Get request parameters.
- uint64_t segmentId;
- const bool segmentIdIsPresent = getParameterValue(request, "segmentId", segmentId);
-
- string showOrientedReadsString;
- const bool showOrientedReads = HttpServer::getParameterValue(request,
- "showOrientedReads", showOrientedReadsString);
-
- string showMarkerGraphPathString;
- const bool showMarkerGraphPath = HttpServer::getParameterValue(request,
- "showMarkerGraphPath", showMarkerGraphPathString);
-
- string showSequenceString;
- const bool showSequence = HttpServer::getParameterValue(request,
- "showSequence", showSequenceString);
-
- string showSequenceDetailsString;
- const bool showSequenceDetails = HttpServer::getParameterValue(request,
- "showSequenceDetails", showSequenceDetailsString);
-
-
-
- // Write the form.
- html <<
- "<h3>Display details of an assembly graph segment</h3>"
- "<form>"
- "<table>"
-
- "<tr>"
- "<td>Segment id"
- "<td><input type=text required name=segmentId size=8 style='text-align:center'"
- " value='" << (segmentIdIsPresent ? to_string(segmentId) : "") <<
- "'>"
-
- "<tr>"
- "<td>Show oriented reads"
- "<td class=centered> <input type=checkbox name=showOrientedReads" <<
- (showOrientedReads ? " checked=checked" : "") <<
- ">"
-
- "<tr>"
- "<td>Show marker graph path"
- "<td class=centered> <input type=checkbox name=showMarkerGraphPath" <<
- (showMarkerGraphPath ? " checked=checked" : "") <<
- ">"
-
- "<tr>"
- "<td>Show sequence"
- "<td class=centered> <input type=checkbox name=showSequence" <<
- (showSequence ? " checked=checked" : "") <<
- ">"
-
- "<tr>"
- "<td>Show sequence assembly details"
- "<td class=centered> <input type=checkbox name=showSequenceDetails" <<
- (showSequenceDetails ? " checked=checked" : "") <<
- ">"
-
- "</table>"
- "<br><input type=submit value='Display'>"
- "</form>";
-
- // If the segmentId was not specified, stop here.
- if(not segmentIdIsPresent) {
- return;
- }
-
- // Check that we have a valid segmentId.
- if(segmentId >= assemblyGraph3.markerGraphPaths.size()) {
- html << "Invalid segment id. Maximum valid value is " <<
- assemblyGraph3.markerGraphPaths.size() - 1 << ".";
- return;
- }
-
- // Access the marker graph path for this segment.
- const auto path = assemblyGraph3.markerGraphPaths[segmentId];
-
- // Get information about the oriented reads of this segment.
- mode3::AssemblyGraph::SegmentOrientedReadInformation orientedReads;
- assemblyGraph3.getOrientedReadsOnSegment(segmentId, orientedReads);
-
- const auto oldPrecision = html.precision(1);
- const auto oldFlags = html.setf(std::ios_base::fixed, std::ios_base::floatfield);
- html <<
- "<h1>Assembly graph segment " << segmentId << "</h1>"
- "<p><table>"
- "<tr><th class=left>Length of marker graph path<td class=centered>" << path.size() <<
- "<tr><th class=left>Average marker graph edge coverage on path<td class=centered>" <<
- assemblyGraph3.segmentCoverage[segmentId] <<
- "<tr><th class=left>Number of distinct oriented reads on path<td class=centered>" << orientedReads.infos.size();
-
- // Write the incoming and outgoing links.
- html << "<tr><th class=left>Incoming links<td class=centered>";
- for(const uint64_t linkId: assemblyGraph3.linksByTarget[segmentId]) {
- html << "<a href='exploreMode3AssemblyGraphLink?linkId=" << linkId << "'>" << linkId << "</a> ";
- }
- html << "<tr><th class=left>Outgoing links<td class=centered>";
- for(const uint64_t linkId: assemblyGraph3.linksBySource[segmentId]) {
- html << "<a href='exploreMode3AssemblyGraphLink?linkId=" << linkId << "'>" << linkId << "</a> ";
- }
-
-
- html << "</table>";
- html.precision(oldPrecision);
- html.flags(oldFlags);
-
-
-
- // Write the oriented reads in a table.
- if(showOrientedReads) {
- html <<
- "<h2>Oriented reads on this segment</h2>"
- "<table>"
- "<tr>"
- "<th>Oriented<br>read"
- "<th>Average<br>offset";
- for(const auto& info: orientedReads.infos) {
- html<<
- "<tr>"
- "<td class=centered>" << info.orientedReadId <<
- "<td class=centered>" << info.averageOffset;
- }
- html << "</table>";
- }
-
-
-
- // Write the marker graph path.
- if(showMarkerGraphPath) {
- html <<
- "<h2>Marker graph path for this segment</h2>"
- "<table>"
- "<tr>"
- "<th>Position"
- "<th>Edge"
- "<th>Coverage"
- "<th>Source<br>vertex"
- "<th>Target<br>vertex";
-
- for(uint64_t position=0; position<path.size(); position++) {
- const MarkerGraphEdgeId& edgeId = path[position];
- const MarkerGraph::Edge& edge = markerGraph.edges[edgeId];
- const MarkerGraph::VertexId vertexId0 = edge.source;
- const MarkerGraph::VertexId vertexId1 = edge.target;
-
- html << "<tr>"
- "<td class=centered>" << position <<
- "<td class=centered>" <<
- "<a href='exploreMarkerGraphEdge?edgeId=" << edgeId <<
- "'>" << edgeId << "</a>"
- "<td class=centered>" << markerGraph.edgeMarkerIntervals.size(edgeId) <<
- "<td class=centered>" <<
- "<a href='exploreMarkerGraphVertex?vertexId=" << vertexId0 <<
- "'>" << vertexId0 << "</a>"
- "<td class=centered>" <<
- "<a href='exploreMarkerGraphVertex?vertexId=" << vertexId1 <<
- "'>" << vertexId1 << "</a>"
- "\n";
-
-
-
- }
- html << "</table>";
- }
-
-
-
- // Assembled sequence, optionally with details.
- if(showSequence or showSequenceDetails) {
-
- // Assemble the sequence for this segment.
- AssembledSegment assembledSegment;
- assembleMarkerGraphPath(
- assemblyGraph3.readRepresentation,
- assemblyGraph3.k,
- assemblyGraph3.markers,
- assemblyGraph3.markerGraph,
- assemblyGraph3.markerGraphPaths[segmentId],
- false,
- assembledSegment);
-
- // Check that the sequence we have is the same as the stored sequence
- // for this segment.
- SHASTA_ASSERT(std::equal(
- assembledSegment.rawSequence.begin(), assembledSegment.rawSequence.end(),
- assemblyGraph3.segmentSequences.begin(segmentId), assemblyGraph3.segmentSequences.end(segmentId)
- ));
-
- // Write the sequence.
- assembledSegment.writeHtml(html, showSequence, showSequenceDetails,
- 0, uint32_t(assembledSegment.rawSequence.size()));
- }
-
-
-}
-
-
-
-void Assembler::exploreMode3AssemblyGraphLink(
- const vector<string>& request,
- ostream& html)
-{
- SHASTA_ASSERT(assemblyGraph3Pointer);
- const mode3::AssemblyGraph& assemblyGraph3 = *assemblyGraph3Pointer;
-
- // Get the link id from the request.
- uint64_t linkId;
- const bool linkIdIsPresent = getParameterValue(request, "linkId", linkId);
-
-
-
- // Write the form.
- html <<
- "<h3>Display details of an assembly graph link</h3>"
- "<form>"
- "<table>"
-
- "<tr>"
- "<td>Link id"
- "<td><input type=text required name=linkId size=8 style='text-align:center'"
- " value='" << (linkIdIsPresent ? to_string(linkId) : "") <<
- "'>"
-
- "</table>"
- "<br><input type=submit value='Display'>"
- "</form>";
-
- // If the segmentId was not specified, stop here.
- if(not linkIdIsPresent) {
- return;
- }
-
- const mode3::AssemblyGraph::Link& link = assemblyGraph3.links[linkId];
- const auto transitions = assemblyGraph3.transitions[linkId];
- const uint64_t segmentId0 = link.segmentId0;
- const uint64_t segmentId1 = link.segmentId1;
- const auto path0 = assemblyGraph3.markerGraphPaths[segmentId0];
- const auto path1 = assemblyGraph3.markerGraphPaths[segmentId1];
- const uint64_t pathLength0 = path0.size();
- const uint64_t pathLength1 = path1.size();
-
- html <<
- "<h1>Assembly graph link " << linkId << "</h1>"
- "<p><table>"
- "<tr><th>Segment<th>Id<th>Path<br>length"
- "<tr><th class = left>Source segment<td class=centered>" << segmentId0 << "<td class=centered>" << pathLength0 <<
- "<tr><th class = left>Target segment<td class=centered>" << segmentId1 << "<td class=centered>" << pathLength1 <<
- "</table>";
-
- if(link.segmentsAreAdjacent) {
- html << "<p>The paths of these segments are adjacent.";
- } else {
- html << "<p>The paths of these segments are not adjacent.";
- }
-
-
- const auto oldPrecision = html.precision(1);
- const auto oldFlags = html.setf(std::ios_base::fixed, std::ios_base::floatfield);
- html <<
- "<p><table>"
- "<tr><th class = left tooltip='Number of supporting transitions'>Coverage<td class=centered>" <<
- transitions.size() <<
- "<tr><th class = left>Average link separation<td class=centered>" <<
- link.separation <<
- "</table>";
- html.precision(oldPrecision);
- html.flags(oldFlags);
-
-
- html <<
- "<h2>Transitions</h2>"
- "<p><table><tr>"
- "<th class=centered>Oriented<br>read<br>id"
- "<th class=centered>Last<br>position<br>on segment<br>" << link.segmentId0 <<
- "<th class=centered>Last<br>ordinal<br>on segment<br>" << link.segmentId0 <<
- "<th class=centered>First<br>position<br>on segment<br>" << link.segmentId1 <<
- "<th class=centered>First<br>ordinal<br>on segment<br>" << link.segmentId1 <<
- "<th class=centered>Link<br>separation";
-
-
- for(const auto& p: transitions) {
- const OrientedReadId orientedReadId = p.first;
- const Transition& transition = p.second;
- const auto& pseudoPathEntry0 = transition[0];
- const auto& pseudoPathEntry1 = transition[1];
-
- SHASTA_ASSERT(pseudoPathEntry1.ordinals[0] >= pseudoPathEntry0.ordinals[1]);
-
- const int64_t linkSeparation =
- int64_t(pseudoPathEntry1.ordinals[0] - pseudoPathEntry0.ordinals[1]) -
- int64_t(pathLength0 - 1 - pseudoPathEntry0.position) -
- int64_t(pseudoPathEntry1.position);
-
- html <<
- "<tr><td class=centered>" << orientedReadId <<
-
- "<td class=centered>" << pseudoPathEntry0.position <<
- "<td class=centered>" << pseudoPathEntry0.ordinals[1] <<
-
- "<td class=centered>" << pseudoPathEntry1.position <<
- "<td class=centered>" << pseudoPathEntry1.ordinals[0] <<
-
- "<td class=centered>" << linkSeparation;
- }
- html << "</table>";
-
-
-
-
-}
-
-
-
-void Assembler::exploreMode3AssemblyGraphSegmentPair(
- const vector<string>& request,
- ostream& html)
-{
- using boost::icl::discrete_interval;
- using boost::icl::intersects;
- using boost::icl::length;
-
- SHASTA_ASSERT(assemblyGraph3Pointer);
- const mode3::AssemblyGraph& assemblyGraph3 = *assemblyGraph3Pointer;
-
- // Get the segment ids from the request.
- uint64_t segmentId0;
- const bool segmentId0IsPresent = getParameterValue(request, "segmentId0", segmentId0);
- uint64_t segmentId1;
- const bool segmentId1IsPresent = getParameterValue(request, "segmentId1", segmentId1);
-
-
-
- // Write the form.
- html <<
- "<h3>Display details for a pair assembly graph segment</h3>"
- "<form>"
- "<table>"
-
- "<tr>"
- "<td>Segment id 0"
- "<td><input type=text required name=segmentId0 size=8 style='text-align:center'"
- " value='" << (segmentId0IsPresent ? to_string(segmentId0) : "") <<
- "'>"
-
- "<tr>"
- "<td>Segment id 1"
- "<td><input type=text required name=segmentId1 size=8 style='text-align:center'"
- " value='" << (segmentId1IsPresent ? to_string(segmentId1) : "") <<
- "'>"
-
- "</table>"
- "<br><input type=submit value='Display'>"
- "</form>";
-
- // If the segmentId's were not specified, stop here.
- if(not segmentId0IsPresent) {
- return;
- }
- if(not segmentId1IsPresent) {
- return;
- }
-
- // Check that we have valid segmentId's.
- if(segmentId0 >= assemblyGraph3.markerGraphPaths.size()) {
- html << "Invalid segment id. Maximum valid value is " <<
- assemblyGraph3.markerGraphPaths.size() - 1 << ".";
- return;
- }
- if(segmentId1 >= assemblyGraph3.markerGraphPaths.size()) {
- html << "Invalid segment id. Maximum valid value is " <<
- assemblyGraph3.markerGraphPaths.size() - 1 << ".";
- return;
- }
-
-
- // Get information about the oriented reads of these segments.
- mode3::AssemblyGraph::SegmentOrientedReadInformation orientedReads0;
- mode3::AssemblyGraph::SegmentOrientedReadInformation orientedReads1;
- assemblyGraph3.getOrientedReadsOnSegment(segmentId0, orientedReads0);
- assemblyGraph3.getOrientedReadsOnSegment(segmentId1, orientedReads1);
- const uint64_t length0 = assemblyGraph3.markerGraphPaths.size(segmentId0);
- const uint64_t length1 = assemblyGraph3.markerGraphPaths.size(segmentId1);
-
- // Estimate the offset between the segments and count missing
- // oriented reads.
- SegmentPairInformation segmentPairInformation;
- assemblyGraph3.analyzeSegmentPair(
- segmentId0, segmentId1,
- orientedReads0, orientedReads1,
- markers, segmentPairInformation);
- const uint64_t commonCount = segmentPairInformation.commonCount;
-
-
-
- /// Write a table with information about this pair of segments.
- html <<
- "<p>"
- "<table>"
-
- "<tr>"
- "<th class=left>Segment id"
- "<td class=centered>" << segmentId0 <<
- "<td class=centered>" << segmentId1 <<
-
- "<tr title='Segment length in marker graph edges'>"
- "<th class=left>Length"
- "<td class=centered>" << length0 <<
- "<td class=centered>" << length1 <<
-
- "<tr title='Total number of oriented reads in this segment'>"
- "<th class=left>Total"
- "<td class=centered>" << segmentPairInformation.totalCount[0] <<
- "<td class=centered>" << segmentPairInformation.totalCount[1] <<
-
- "<tr title='Number of oriented reads present in both segments'>"
- "<th class=left>Common"
- "<td class=centered>" << segmentPairInformation.commonCount <<
- "<td class=centered>" << segmentPairInformation.commonCount;
-
- if(segmentPairInformation.commonCount > 0) {
- const auto oldPrecision = html.precision(2);
- const auto oldFlags = html.setf(std::ios_base::fixed, std::ios_base::floatfield);
- html <<
- "<tr title='Number of oriented reads in this segment that are too short to appear in the other segment'>"
- "<th class=left>Short"
- "<td class=centered>" << segmentPairInformation.shortCount[0] <<
- "<td class=centered>" << segmentPairInformation.shortCount[1] <<
-
- "<tr title='Number of oriented reads in this segment that are "
- "unexpectedly missing in the other segment'>"
- "<th class=left>Unexplained"
- "<td class=centered>" << segmentPairInformation.unexplainedCount[0] <<
- "<td class=centered>" << segmentPairInformation.unexplainedCount[1] <<
-
- "<tr title='Jaccard similarity without counting short reads'>"
- "<th class=left>Jaccard"
- "<td class=centered>" << segmentPairInformation.jaccard() <<
- "<td class=centered>" << segmentPairInformation.jaccard() <<
-
- "<tr title='Jaccard similarity without special treatment of short reads'>"
- "<th class=left>Raw Jaccard"
- "<td class=centered>" << segmentPairInformation.rawJaccard() <<
- "<td class=centered>" << segmentPairInformation.rawJaccard() <<
-
- "<tr title='Fraction of oriented reads in this segment that are "
- "unexpectedly missing in the other segment'>"
- "<th class=left>Unexplained fraction"
- "<td class=centered>" << segmentPairInformation.unexplainedFraction(0) <<
- "<td class=centered>" << segmentPairInformation.unexplainedFraction(1);
- html.precision(oldPrecision);
- html.flags(oldFlags);
- }
-
- html << "</table>";
- if(segmentPairInformation.commonCount > 0) {
- html << "<p>Estimated offset " << segmentPairInformation.offset;
- html << "<br>Estimated gap " << segmentPairInformation.offset - int64_t(length0);
- }
-
-
-
- // Write a table with a row for each oriented read.
- html <<
- "<p>"
- "<table>"
- "<tr>"
- "<th>Oriented<br>read"
- "<th>Length"
- "<th>Average<br>offset of<br>oriented read<br>relative to<br>segment " << segmentId0 <<
- "<th>Average<br>offset of<br>oriented read<br>relative to<br>segment " << segmentId1 <<
- "<th>Estimated<br>offset of<br>segment " << segmentId1 <<
- "<br>relative to<br>segment " << segmentId0 <<
- "<th>Hypothetical<br>offset of<br>oriented read<br>relative to<br>segment " << segmentId0 <<
- "<th>Hypothetical<br>offset of<br>oriented read<br>relative to<br>segment " << segmentId1 <<
- "<th>Hypothetical<br>overlap of<br>oriented read<br>with<br>segment " << segmentId0 <<
- "<th>Hypothetical<br>overlap of<br>oriented read<br>with<br>segment " << segmentId1 <<
- "<th>On both<br>segments" <<
- "<th>Too<br>short" <<
- "<th>On segment<br>" << segmentId0 << "<br>only,<br>missing from<br>segment<br>" << segmentId1 <<
- "<th>On segment<br>" << segmentId1 << "<br>only,<br>missing from<br>segment<br>" << segmentId0;
-
-
- // Set up a joint loop over oriented reads in the two segments.
- const auto begin0 = orientedReads0.infos.begin();
- const auto begin1 = orientedReads1.infos.begin();
- const auto end0 = orientedReads0.infos.end();
- const auto end1 = orientedReads1.infos.end();
- auto it0 = begin0;
- auto it1 = begin1;
-
-
-
- while(true) {
-
- // At end of both segments.
- if(it0 == end0 and it1 == end1) {
- break;
- }
-
-
-
- // Only on segment 0.
- if((it1 == end1) or ((it0!=end0) and (it0->orientedReadId < it1->orientedReadId))) {
- const int64_t orientedReadLength = markers.size(it0->orientedReadId.getValue());
- html <<
- "<tr>"
- "<td class=centered>" <<
- "<a href='exploreRead?readId=" << it0->orientedReadId.getReadId() <<
- "&strand=" << it0->orientedReadId.getStrand() << "'>" << it0->orientedReadId << "</a>"
- "<td class=centered>" << orientedReadLength <<
- "<td class=centered>" << it0->averageOffset <<
- "<td>"
- "<td><td>";
-
- if(commonCount) {
- // Compute the hypothetical range of the oriented read relative
- // to the beginning of segment 1.
- const discrete_interval<int64_t> orientedReadRange1(
- it0->averageOffset - segmentPairInformation.offset,
- it0->averageOffset - segmentPairInformation.offset + orientedReadLength);
- const discrete_interval<int64_t> segment1Range(0, length1);
- const bool wouldOverlap = intersects(orientedReadRange1, segment1Range);
- html <<
- "<td class=centered>" << orientedReadRange1.lower() <<
- "<td><td class=centered>" << length(orientedReadRange1 & segment1Range);
- if(wouldOverlap) {
- html << "<td><td><td class=centered>&#10003;<td>";
- } else {
- html << "<td><td class=centered>&#10003;<td><td>";
- }
- } else {
- html << "<td><td><td><td><td><td><td>";
- }
- ++it0;
- }
-
-
-
- // Only on segment 1
- else if((it0 == end0) or ((it1!=end1) and (it1->orientedReadId < it0->orientedReadId))) {
- const int64_t orientedReadLength = markers.size(it1->orientedReadId.getValue());
- html <<
- "<tr>"
- "<td class=centered>" <<
- "<a href='exploreRead?readId=" << it1->orientedReadId.getReadId() <<
- "&strand=" << it1->orientedReadId.getStrand() << "'>" << it1->orientedReadId << "</a>"
- "<td class=centered>" << orientedReadLength <<
- "<td>"
- "<td class=centered>" << it1->averageOffset <<
- "<td>";
-
- if(commonCount) {
- // Compute the hypothetical range of the oriented read relative
- // to the beginning of segment 0.
- const discrete_interval<int64_t> orientedReadRange0(
- it1->averageOffset + segmentPairInformation.offset,
- it1->averageOffset + segmentPairInformation.offset + orientedReadLength);
- const discrete_interval<int64_t> segment0Range(0, length0);
- const bool wouldOverlap = intersects(orientedReadRange0, segment0Range);
- html <<
- "<td class=centered>" << orientedReadRange0.lower() <<
- "<td><td class=centered>" << length(orientedReadRange0 & segment0Range) << "<td>";
- if(wouldOverlap) {
- html << "<td><td><td><td class=centered>&#10003;";
- } else {
- html << "<td><td class=centered>&#10003;<td><td>";
- }
-
- } else {
- html << "<td><td><td><td><td><td><td><td>";
- }
-
- ++it1;
- }
-
- // On both segments.
- else {
- html <<
- "<tr>"
- "<td class=centered>" <<
- "<a href='exploreRead?readId=" << it0->orientedReadId.getReadId() <<
- "&strand=" << it0->orientedReadId.getStrand() << "'>" << it0->orientedReadId << "</a>"
- "<td class=centered>" << markers.size(it0->orientedReadId.getValue()) <<
- "<td class=centered>" << it0->averageOffset <<
- "<td class=centered>" << it1->averageOffset <<
- "<td class=centered>" << it0->averageOffset - it1->averageOffset <<
- "<td><td><td><td>"
- "<td class=centered>&#10003;<td><td><td>";
-
- ++it0;
- ++it1;
- }
- }
- html << "</table>";
-
-}
-
-
-
-void Assembler::exploreMode3MetaAlignment(
- const vector<string>& request,
- ostream& html)
-{
- // Access the mode 3 assembly graph.
- SHASTA_ASSERT(assemblyGraph3Pointer);
- const mode3::AssemblyGraph& assemblyGraph3 = *assemblyGraph3Pointer;
-
- // Get the oriented read ids from the request.
- string orientedReadId0String;
- const bool orientedReadId0IsPresent = getParameterValue(request, "orientedReadId0", orientedReadId0String);
- string orientedReadId1String;
- const bool orientedReadId1IsPresent = getParameterValue(request, "orientedReadId1", orientedReadId1String);
-
- // Write the form.
- html <<
- "Enter the two oriented read ids:"
- "<form>"
- "<p><input type=text size=8 name=orientedReadId0 value='" <<
- (orientedReadId0IsPresent ? orientedReadId0String : "") << "'>"
- "<p><input type=text size=8 name=orientedReadId1 value='" <<
- (orientedReadId1IsPresent ? orientedReadId1String : "") << "'>"
- "<p><input type=submit value='Compute the meta-alignment'>"
- "</form>";
-
- // If the oriented reads are not present, do nothing.
- if(not(orientedReadId0IsPresent and orientedReadId1IsPresent)) {
- return;
- }
- const OrientedReadId orientedReadId0(orientedReadId0String);
- const OrientedReadId orientedReadId1(orientedReadId1String);
-
- html << "<h1>Meta-alignment of oriented reads " <<
- orientedReadId0 << " " << orientedReadId1 << "</h1>";
-
- // Access the pseudo-paths, that is the meta-sequences to be aligned.
- const auto pseudoPath0 = assemblyGraph3.assemblyGraphJourneys[orientedReadId0.getValue()];
- const auto pseudoPath1 = assemblyGraph3.assemblyGraphJourneys[orientedReadId1.getValue()];
- const int n0 = int(pseudoPath0.size());
- const int n1 = int(pseudoPath1.size());
-
- // Create a png file representing the alignment matrix.
- PngImage image = PngImage(int(n0), int(n1));
- for(int i0=0; i0<n0; i0++) {
- const uint64_t segmentId0 = pseudoPath0[i0].segmentId;
- for(int i1=0; i1<n1; i1++) {
- const uint64_t segmentId1 = pseudoPath1[i1].segmentId;
- if(segmentId0 == segmentId1) {
- image.setPixel(i0, i1, 255, 0, 0);
- }
- }
- }
- image.write("MetaAlignment.png");
-
- // Create a base64 version of the png file.
- const string command = "base64 MetaAlignment.png > MetaAlignment.png.base64";
- ::system(command.c_str());
-
- // Display the picture with the alignment.
- // image-rendering:crisp-edges; is currently supported on Firefox but not Chrome,
- // so Chrome will display blurry pictures.
- html <<
- "<h3>Alignment matrix</h3>"
- "<p><img "
- " style='width:" << 3*n0 << "px;height:auto;image-rendering:crisp-edges;'"
- "src=\"data:image/png;base64,";
- ifstream png("MetaAlignment.png.base64");
- html << png.rdbuf();
- html << "\"/>";
-
-}
-
-
-
-void Assembler::exploreMode3AssemblyPath(
- const vector<string>& request,
- ostream& html)
-{
- // Get the parametersof the request.
-
- // The segment that the path will start from.
- string pathStartString;
- HttpServer::getParameterValue(request, "pathStart", pathStartString);
-
- // The path direction can be forward, backward, or bidirectional.
- string pathDirection = "bidirectional";
- HttpServer::getParameterValue(request, "pathDirection", pathDirection);
-
-
-
- // Write the form.
- html <<
- "<h2>Assembly path computation</h2>"
- "<form>"
-
- "Start the path at segment &nbsp;<input type=text name=pathStart required size=8 style='text-align:center'"
- " value='" << pathStartString << "'>"
-
- "<br><input type=radio name=pathDirection value=forward" <<
- (pathDirection=="forward" ? " checked=checked" : "") << "> Forward"
- "<br><input type=radio name=pathDirection value=backward" <<
- (pathDirection=="backward" ? " checked=checked" : "") << "> Backward"
- "<br><input type=radio name=pathDirection value=bidirectional" <<
- (pathDirection=="bidirectional" ? " checked=checked" : "") << "> Both directions" <<
-
- "<p><input type=submit value='Compute the path and assemble its sequence'>"
- "</form>";
-
- // If the path start was not specified, stop here.
- if(pathStartString.empty()) {
- return;
- }
-
- // Get the path start segment.
- uint64_t pathStart;
- try {
- pathStart = boost::lexical_cast<uint64_t>(pathStartString);
- } catch(std::exception&) {
- throw runtime_error("Invalid path start segment id.");
- }
-
- // Check that it is a valid segment id.
- const mode3::AssemblyGraph& assemblyGraph = *assemblyGraph3Pointer;
- if(pathStart >= assemblyGraph.markerGraphPaths.size()) {
- throw runtime_error("Invalid path start segment id. The assembly graph has " +
- to_string(assemblyGraph.markerGraphPaths.size()) + " segments.");
- }
-
- // Write a header.
- html << "<h1>Assembly path</h1>";
-
-
-
- // Compute the assembly path.
- AssemblyPath path;
- if(pathDirection == "forward" or pathDirection == "backward") {
-
- // Forward or backward.
- assemblyGraph.createAssemblyPath(pathStart,
- (pathDirection == "forward") ? 0 : 1, path);
- if(pathDirection == "backward") {
- reverse(path.segments.begin(), path.segments.end());
- }
-
- } else {
-
- // Bidirectional.
- AssemblyPath forwardPath;
- AssemblyPath backwardPath;
- assemblyGraph.createAssemblyPath(pathStart, 0, forwardPath);
- assemblyGraph.createAssemblyPath(pathStart, 1, backwardPath);
-
- // Stitch them together, making sure not to repeat the starting segment.
- path.segments.clear();
- copy(backwardPath.segments.rbegin(), backwardPath.segments.rend(), back_inserter(path.segments));
- copy(forwardPath.segments.begin() + 1, forwardPath.segments.end(), back_inserter(path.segments));
-
- }
-
- html << "<p>This assembly path was created starting at segment " << pathStart <<
- " and moving ";
- if(pathDirection == "forward") {
- html << "forward.";
- } else if(pathDirection == "backward") {
- html << "backward.";
- } else if(pathDirection == "bidirectional") {
- html << "in both directions.";
- }
-
- // Assemble sequence for this path.
- path.assemble(assemblyGraph);
-
- // Write path details to html.
- path.writeHtml(html, assemblyGraph);
-
-
-}
-
-
-
-void Assembler::exploreMode3LinkAssembly(
- const vector<string>& request,
- ostream& html)
-{
- // Access the AssemblyGraph.
- using mode3::AssemblyGraph; // Hide shasta::AssemblyGraph;
- SHASTA_ASSERT(assemblyGraph3Pointer);
- const AssemblyGraph& assemblyGraph = *assemblyGraph3Pointer;
-
- // Get the parameters of the request.
- uint64_t linkId = invalid<uint64_t>;
- getParameterValue(request, "linkId", linkId);
- SHASTA_ASSERT(linkId < assemblyGraph.links.size());
- uint64_t previousPrimarySegmentId = invalid<uint64_t>;
- getParameterValue(request, "previousPrimarySegmentId", previousPrimarySegmentId);
- SHASTA_ASSERT(previousPrimarySegmentId < assemblyGraph.markerGraphPaths.size());
- uint64_t nextPrimarySegmentId = invalid<uint64_t>;
- getParameterValue(request, "nextPrimarySegmentId", nextPrimarySegmentId);
- SHASTA_ASSERT(nextPrimarySegmentId < assemblyGraph.markerGraphPaths.size());
-
- // Access the link.
- if(linkId >= assemblyGraph.links.size()) {
- html << "Invalid link id. There are " << assemblyGraph.links.size() <<
- " links in the assembly graph.";
- return;
- }
- const AssemblyGraph::Link& link = assemblyGraph.links[linkId];
-
- // If this is a trivial link, there is nothing to show.
- if(link.segmentsAreAdjacent) {
- html << "This is a trivial link. No assembly is required.";
- return;
- }
-
-
-
- html << "<h1>Details of link assembly</h1>";
-
- // Create the segments and assemble them.
- AssemblyPathSegment segment0(link.segmentId0, false);
- AssemblyPathSegment segment1(link.segmentId1, false);
- assembleMarkerGraphPath(
- assemblyGraph.readRepresentation,
- assemblyGraph.k,
- assemblyGraph.markers,
- assemblyGraph.markerGraph,
- assemblyGraph.markerGraphPaths[segment0.id],
- false,
- segment0.assembledSegment);
- assembleMarkerGraphPath(
- assemblyGraph.readRepresentation,
- assemblyGraph.k,
- assemblyGraph.markers,
- assemblyGraph.markerGraph,
- assemblyGraph.markerGraphPaths[segment1.id],
- false,
- segment1.assembledSegment);
-
- // Create the AssemblyPathLink.
- AssemblyPathLink assemblyPathLink;
- assemblyPathLink.id = linkId;
- assemblyPathLink.isTrivial = false;
- assemblyPathLink.previousPrimarySegmentId = previousPrimarySegmentId;
- assemblyPathLink.nextPrimarySegmentId = nextPrimarySegmentId;
-
- // Do the assembly.
- AssemblyPath::assembleNonTrivialLink(
- assemblyGraph,
- segment0,
- segment1,
- assemblyPathLink,
- html);
-}
diff --git a/src/AssemblerHttpServer-ReadGraph.cpp b/src/AssemblerHttpServer-ReadGraph.cpp
index c5ba0a9..5281da6 100644
--- a/src/AssemblerHttpServer-ReadGraph.cpp
+++ b/src/AssemblerHttpServer-ReadGraph.cpp
@@ -413,6 +413,16 @@ void Assembler::exploreUndirectedReadGraph(
+ // Cross strand edges are drawn purple.
+ BGL_FORALL_EDGES(e, graph, LocalReadGraph) {
+ const LocalReadGraphEdge& edge = graph[e];
+ if(edge.crossesStrands) {
+ graph[e].color = "Purple";
+ }
+ }
+
+
+
// Triangle analysis of the local read graph, if requested.
LocalReadGraphTriangles triangles;
if(alignmentAnalysis == "triangles") {
@@ -508,6 +518,7 @@ void Assembler::exploreUndirectedReadGraph(
vertexScalingFactor,
edgeThicknessScalingFactor,
maxDistance,
+ *this,
html);
}
diff --git a/src/AssemblerHttpServer-Reads.cpp b/src/AssemblerHttpServer-Reads.cpp
index c1424a5..6ec17e2 100644
--- a/src/AssemblerHttpServer-Reads.cpp
+++ b/src/AssemblerHttpServer-Reads.cpp
@@ -654,6 +654,8 @@ void Assembler::exploreReadRle(
if (marker.position < beginRlePosition || marker.position > endRlePosition-k) {
continue;
}
+ const Kmer kmer = getOrientedReadMarkerKmer(orientedReadId, ordinal);
+ const KmerId kmerId = KmerId(kmer.id(k));
// See if this marker is contained in a vertex of the marker graph.
const MarkerGraph::VertexId vertexId =
@@ -664,10 +666,9 @@ void Assembler::exploreReadRle(
// Write the k-mer of this marker.
- const Kmer kmer(marker.kmerId, k);
html << "<a xlink:title='Marker " << ordinal <<
", position " << marker.position <<
- ", k-mer id " << marker.kmerId;
+ ", k-mer id " << kmerId;
if(hasMarkerGraphVertex) {
html << ", coverage " << markerGraph.vertexCoverage(vertexId);
}
@@ -675,7 +676,7 @@ void Assembler::exploreReadRle(
if(hasMarkerGraphVertex) {
// Add a hyperlink to the marker graph vertex
// that contains this marker.
- const string url = "exploreMarkerGraph?vertexId=" + to_string(vertexId) +
+ const string url = "exploreMarkerGraph0?vertexId=" + to_string(vertexId) +
"&maxDistance=2&detailed=on&minCoverage=3&minConsensus=3&sizePixels=320&timeout=30";
html << " xlink:href='" << url << "' style='cursor:pointer'";
}
@@ -762,7 +763,7 @@ void Assembler::exploreReadRle(
// Loop over all markers on this oriented read.
for(uint32_t ordinal=0; ordinal<orientedReadMarkers.size(); ordinal++) {
const CompressedMarker& marker = orientedReadMarkers[ordinal];
- const Kmer kmer(marker.kmerId, k);
+ const Kmer kmer = getOrientedReadMarkerKmer(orientedReadId, ordinal);
const uint32_t rlePosition = marker.position;
const uint32_t rawPosition = rawPositions[rlePosition];
@@ -783,7 +784,7 @@ void Assembler::exploreReadRle(
html << "</code><td class=centered>" << rlePosition << "<td class=centered>" << rawPosition;
if(hasMarkerGraphVertex) {
- const string url = "exploreMarkerGraph?vertexId=" + to_string(vertexId) +
+ const string url = "exploreMarkerGraph0?vertexId=" + to_string(vertexId) +
"&maxDistance=2&detailed=on&minCoverage=3&minConsensus=3&sizePixels=320&timeout=30";
html << "<td class=centered><a href='" << url << "'>" << vertexId << "</a>"
"<td class=centered>" << markerGraph.vertexCoverage(vertexId);
@@ -806,7 +807,7 @@ void Assembler::exploreReadRle(
for(uint32_t ordinal=0; ordinal<uint32_t(orientedReadMarkers.size()); ordinal++) {
const CompressedMarker& marker = orientedReadMarkers[ordinal];
if (marker.position >= beginRlePosition && marker.position <= endRlePosition - k) {
- kmers.push_back(marker.kmerId);
+ kmers.push_back(getOrientedReadMarkerKmerId(orientedReadId, ordinal));
}
}
vector<uint32_t> kmerFrequency;
@@ -1081,10 +1082,10 @@ void Assembler::exploreReadRaw(
for(const uint64_t ordinal: markersOnThisRow) {
const CompressedMarker& marker = orientedReadMarkers[ordinal];
const uint64_t position = marker.position - beginPosition;
- const Kmer kmer(marker.kmerId, k);
+ const Kmer kmer = getOrientedReadMarkerKmer(orientedReadId, ordinal);
// Write the required number of spaces.
- SHASTA_ASSERT(position > oldPosition); // There must be at least a blank.
+ SHASTA_ASSERT((position==0) or (position > oldPosition)); // There must be at least a blank.
for(uint64_t i=oldPosition; i<position; i++) {
html << "&nbsp;";
}
@@ -1100,7 +1101,7 @@ void Assembler::exploreReadRaw(
// There is a marker graph vertex.
// Write the marker as a link to that vertex.
- const string url = "exploreMarkerGraph?vertexId=" + to_string(vertexId) +
+ const string url = "exploreMarkerGraph0?vertexId=" + to_string(vertexId) +
"&maxDistance=6&detailed=on&sizePixels=600&timeout=30";
html << "<a href='" << url << "' title='Marker " << ordinal <<
", position " << marker.position <<
diff --git a/src/AssemblerHttpServer.cpp b/src/AssemblerHttpServer.cpp
index f128285..ee671b7 100644
--- a/src/AssemblerHttpServer.cpp
+++ b/src/AssemblerHttpServer.cpp
@@ -8,6 +8,7 @@
#include "platformDependent.hpp"
#include "Reads.hpp"
using namespace shasta;
+using namespace mode0;
// Boost libraries.
#include <boost/tokenizer.hpp>
@@ -230,9 +231,11 @@ void Assembler::fillServerFunctionTable()
SHASTA_ADD_TO_FUNCTION_TABLE(alignSequencesInMarkerRepresentation);
SHASTA_ADD_TO_FUNCTION_TABLE(assessAlignments);
SHASTA_ADD_TO_FUNCTION_TABLE(exploreReadGraph);
- SHASTA_ADD_TO_FUNCTION_TABLE(exploreMarkerGraph);
+ SHASTA_ADD_TO_FUNCTION_TABLE(exploreMarkerGraph0);
+ SHASTA_ADD_TO_FUNCTION_TABLE(exploreMarkerGraph1);
SHASTA_ADD_TO_FUNCTION_TABLE(exploreMarkerGraphVertex);
SHASTA_ADD_TO_FUNCTION_TABLE(exploreMarkerGraphEdge);
+ SHASTA_ADD_TO_FUNCTION_TABLE(exploreMarkerGraphEdgePair);
SHASTA_ADD_TO_FUNCTION_TABLE(exploreMarkerCoverage);
SHASTA_ADD_TO_FUNCTION_TABLE(exploreMarkerGraphInducedAlignment);
SHASTA_ADD_TO_FUNCTION_TABLE(followReadInMarkerGraph);
@@ -241,14 +244,8 @@ void Assembler::fillServerFunctionTable()
SHASTA_ADD_TO_FUNCTION_TABLE(exploreAssemblyGraphEdge);
SHASTA_ADD_TO_FUNCTION_TABLE(exploreAssemblyGraphEdgesSupport);
SHASTA_ADD_TO_FUNCTION_TABLE(exploreCompressedAssemblyGraph);
- SHASTA_ADD_TO_FUNCTION_TABLE(exploreMode3AssemblyGraph);
- SHASTA_ADD_TO_FUNCTION_TABLE(exploreMode3AssemblyGraphSegment);
- SHASTA_ADD_TO_FUNCTION_TABLE(exploreMode3AssemblyGraphSegmentPair);
- SHASTA_ADD_TO_FUNCTION_TABLE(exploreMode3AssemblyGraphLink);
- SHASTA_ADD_TO_FUNCTION_TABLE(exploreMode3MetaAlignment);
- SHASTA_ADD_TO_FUNCTION_TABLE(exploreMode3AssemblyPath);
- SHASTA_ADD_TO_FUNCTION_TABLE(exploreMode3LinkAssembly);
+ SHASTA_ADD_TO_FUNCTION_TABLE(fillMode3AssemblyPathStep);
}
#undef SHASTA_ADD_TO_FUNCTION_TABLE
@@ -442,15 +439,37 @@ void Assembler::writeNavigation(ostream& html) const
writeNavigation(html, "Read graph", {
{"Read graph", "exploreReadGraph"},
});
- writeNavigation(html, "Marker graph", {
- {"Local marker graph", "exploreMarkerGraph?useBubbleReplacementEdges=on"},
- {"Marker graph vertices", "exploreMarkerGraphVertex"},
- {"Marker graph edges", "exploreMarkerGraphEdge"},
- {"Marker coverage", "exploreMarkerCoverage"},
- {"Induced alignments", "exploreMarkerGraphInducedAlignment"},
- {"Follow a read in the marker graph", "followReadInMarkerGraph"},
- {"Marker connectivity", "exploreMarkerConnectivity"},
- });
+
+
+
+ if(assemblerInfo->assemblyMode == 3) {
+ writeNavigation(html, "Marker graph", {
+ {"Local marker graph", "exploreMarkerGraph0?useBubbleReplacementEdges=on"},
+ {"Local marker graph for mode 3 assembly", "exploreMarkerGraph1"},
+ {"Marker graph vertices", "exploreMarkerGraphVertex"},
+ {"Marker graph edges", "exploreMarkerGraphEdge"},
+ {"Marker graph edge pairs", "exploreMarkerGraphEdgePair"},
+ {"Marker coverage", "exploreMarkerCoverage"},
+ {"Induced alignments", "exploreMarkerGraphInducedAlignment"},
+ {"Follow a read in the marker graph", "followReadInMarkerGraph"},
+ {"Marker connectivity", "exploreMarkerConnectivity"},
+ {"Assembly path step", "fillMode3AssemblyPathStep"},
+ {"Path graph", "exploreMode3PathGraph"},
+ });
+ } else {
+ writeNavigation(html, "Marker graph", {
+ {"Local marker graph", "exploreMarkerGraph0?useBubbleReplacementEdges=on"},
+ {"Marker graph vertices", "exploreMarkerGraphVertex"},
+ {"Marker graph edges", "exploreMarkerGraphEdge"},
+ {"Marker coverage", "exploreMarkerCoverage"},
+ {"Induced alignments", "exploreMarkerGraphInducedAlignment"},
+ {"Follow a read in the marker graph", "followReadInMarkerGraph"},
+ {"Marker connectivity", "exploreMarkerConnectivity"},
+ });
+ }
+
+
+
if(assemblerInfo->assemblyMode == 0) {
writeNavigation(html, "Assembly graph", {
{"Local assembly graph", "exploreAssemblyGraph"},
@@ -459,16 +478,6 @@ void Assembler::writeNavigation(ostream& html) const
{"Compressed assembly graph", "exploreCompressedAssemblyGraph"},
});
}
- if(assemblerInfo->assemblyMode == 3) {
- writeNavigation(html, "Assembly graph", {
- {"Local assembly graph", "exploreMode3AssemblyGraph"},
- {"Assembly graph segments", "exploreMode3AssemblyGraphSegment"},
- {"Assembly graph segment pairs", "exploreMode3AssemblyGraphSegmentPair"},
- {"Assembly graph links", "exploreMode3AssemblyGraphLink"},
- {"Meta-alignments", "exploreMode3MetaAlignment"},
- {"Assembly paths", "exploreMode3AssemblyPath"},
- });
- }
if (!httpServerData.docsDirectory.empty()) {
writeNavigation(html, "Help", {
@@ -566,6 +575,10 @@ void Assembler::writeGnuPlotPngToHtml(
// Write the png file to html.
writePngToHtml(html, pngFileName);
+
+ // Remove the files we created.
+ std::filesystem::remove(gnuplotFileName);
+ std::filesystem::remove(pngFileName);
}
@@ -586,9 +599,9 @@ void Assembler::accessAllSoft()
bool allDataAreAvailable = true;
try {
- accessKmers();
+ accessKmerChecker();
} catch(const exception& e) {
- cout << "K-mers are not accessible." << endl;
+ cout << "The k-mer checker is not accessible." << endl;
allDataAreAvailable = false;
}
@@ -675,6 +688,17 @@ void Assembler::accessAllSoft()
allDataAreAvailable = false;
}
+#if 0
+ if(assemblerInfo->assemblyMode == 3) {
+ try {
+ accessMarkerGraphPrimaryJourneys();
+ } catch(const exception& e) {
+ cout << "MarkerGraph graph primary journeys are not accessible." << endl;
+ allDataAreAvailable = false;
+ }
+ }
+#endif
+
try {
accessCompressedAlignments();
} catch(const exception& e) {
@@ -718,18 +742,6 @@ void Assembler::accessAllSoft()
- // Data specific to assembly mode 3.
- if(assemblerInfo->assemblyMode == 3) {
- try {
- accessMode3AssemblyGraph();
- } catch(const exception& e) {
- cout << "The mode 3 assembly graph is not accessible." << endl;
- allDataAreAvailable = false;
- }
- }
-
-
-
if(!allDataAreAvailable) {
cout << "Not all assembly data are accessible." << endl;
cout << "Some functionality is not available." << endl;
@@ -906,20 +918,7 @@ void Assembler::writeAssemblySummary(ostream& html)
void Assembler::writeAssemblySummaryBody(ostream& html)
{
using std::setprecision;
- AssemblyGraph& assemblyGraph = *assemblyGraphPointer;
-
-
- // Compute the number of run-length k-mers used as markers.
- uint64_t totalRleKmerCount = 0;
- uint64_t markerRleKmerCount = 0;
- for(const auto& tableEntry: kmerTable) {
- if(tableEntry.isRleKmer) {
- ++totalRleKmerCount;
- if(tableEntry.isMarker) {
- ++markerRleKmerCount;
- }
- }
- }
+ mode0::AssemblyGraph& assemblyGraph = *assemblyGraphPointer;
const uint64_t totalDiscardedReadCount =
assemblerInfo->discardedInvalidBaseReadCount +
@@ -1006,21 +1005,6 @@ void Assembler::writeAssemblySummaryBody(ostream& html)
html <<
- "<h3>Marker <i>k</i>-mers</h3>"
- "<table>"
- "<tr><td>Length <i>k</i> of <i>k</i>-mers used as markers"
- "<td class=right>" << assemblerInfo->k <<
- "<tr><td>Total number of <i>k</i>-mers"
- "<td class=right>" << totalRleKmerCount <<
- "<tr><td>Number of <i>k</i>-mers used as markers"
- "<td class=right>" << markerRleKmerCount <<
- "<tr><td>Fraction of <i>k</i>-mers used as markers"
- "<td class=right>" << setprecision(3) << double(markerRleKmerCount) / double(totalRleKmerCount) <<
- "</table>"
- "<ul><li>In the above table, all <i>k</i>-mer counts only include run-length encoded <i>k</i>-mers, "
- "that is, <i>k</i>-mers without repeated bases.</ul>"
-
-
"<h3>Markers</h3>"
"<table>"
@@ -1240,20 +1224,6 @@ void Assembler::writeAssemblySummaryJson(ostream& json)
AssemblyGraph& assemblyGraph = *assemblyGraphPointer;
using std::setprecision;
-
-
- // Compute the number of run-length k-mers used as markers.
- uint64_t totalRleKmerCount = 0;
- uint64_t markerRleKmerCount = 0;
- for(const auto& tableEntry: kmerTable) {
- if(tableEntry.isRleKmer) {
- ++totalRleKmerCount;
- if(tableEntry.isMarker) {
- ++markerRleKmerCount;
- }
- }
- }
-
const uint64_t totalDiscardedReadCount =
assemblerInfo->discardedInvalidBaseReadCount +
assemblerInfo->discardedShortReadReadCount +
@@ -1332,17 +1302,6 @@ void Assembler::writeAssemblySummaryJson(ostream& json)
double(totalDiscardedBaseCount + assemblerInfo->baseCount)
<< "\n"
" }\n"
- " },\n";
-
-
- json <<
- " \"Marker k-mers\":\n"
- " {\n"
- " \"Length k of k-mers used as markers\": " << assemblerInfo->k << ",\n"
- " \"Total number of k-mers\": " << totalRleKmerCount << ",\n"
- " \"Number of k-mers used as markers\": " << markerRleKmerCount << ",\n"
- " \"Fraction of k<-mers used as markers\": " <<
- setprecision(3) << double(markerRleKmerCount) / double(totalRleKmerCount) <<
" },\n"
diff --git a/src/AssemblerLowHash.cpp b/src/AssemblerLowHash.cpp
index 8e67654..afba1bf 100644
--- a/src/AssemblerLowHash.cpp
+++ b/src/AssemblerLowHash.cpp
@@ -1,6 +1,5 @@
#include "Assembler.hpp"
#include "LowHash0.hpp"
-#include "LowHash1.hpp"
using namespace shasta;
@@ -23,7 +22,7 @@ void Assembler::findAlignmentCandidatesLowHash0(
{
// Check that we have what we need.
- checkKmersAreOpen();
+ SHASTA_ASSERT(kmerChecker);
checkMarkersAreOpen();
const ReadId readCount = ReadId(markers.size() / 2);
SHASTA_ASSERT(readCount > 0);
@@ -43,9 +42,8 @@ void Assembler::findAlignmentCandidatesLowHash0(
maxBucketSize,
minFrequency,
threadCount,
- kmerTable,
getReads(),
- markers,
+ markerKmerIds,
alignmentCandidates.candidates,
readLowHashStatistics,
largeDataFileNamePrefix,
@@ -132,52 +130,6 @@ void Assembler::writeOverlappingReads(
-// New version that also stores alignmentCandidates.featureOrdinals.
-// This can be used to filter the alignment candidates.
-void Assembler::findAlignmentCandidatesLowHash1(
- size_t m, // Number of consecutive k-mers that define a feature.
- double hashFraction, // Low hash threshold.
- size_t minHashIterationCount, // Number of lowHash iterations.
- size_t log2MinHashBucketCount, // Base 2 log of number of buckets for lowHash.
- size_t minBucketSize, // The minimum size for a bucket to be used.
- size_t maxBucketSize, // The maximum size for a bucket to be used.
- size_t minFrequency, // Minimum number of minHash hits for a pair to become a candidate.
- size_t threadCount)
-{
- // Check that we have what we need.
- checkKmersAreOpen();
- checkMarkersAreOpen();
- const ReadId readCount = ReadId(markers.size() / 2);
- SHASTA_ASSERT(readCount > 0);
-
- // Prepare storage.
- alignmentCandidates.candidates.createNew(
- largeDataName("AlignmentCandidates"), largeDataPageSize);
- alignmentCandidates.featureOrdinals.createNew(
- largeDataName("AlignmentCandidatesFeatureOrdinale"), largeDataPageSize);
-
- // Do the computation.
- LowHash1 lowHash1(
- m,
- hashFraction,
- minHashIterationCount,
- log2MinHashBucketCount,
- minBucketSize,
- maxBucketSize,
- minFrequency,
- threadCount,
- kmerTable,
- getReads(),
- markers,
- alignmentCandidates,
- largeDataFileNamePrefix,
- largeDataPageSize);
-
- alignmentCandidates.unreserve();
-}
-
-
-
void Assembler::writeAlignmentCandidates(bool useReadName, bool verbose) const
{
diff --git a/src/AssemblerMarkerGraph.cpp b/src/AssemblerMarkerGraph.cpp
index cb37ce0..9a73df2 100644
--- a/src/AssemblerMarkerGraph.cpp
+++ b/src/AssemblerMarkerGraph.cpp
@@ -6,12 +6,14 @@
#include "compressAlignment.hpp"
#include "Coverage.hpp"
#include "dset64-gccAtomic.hpp"
+#include "extractKmer.hpp"
#include "PeakFinder.hpp"
#include "performanceLog.hpp"
-#include "LocalMarkerGraph.hpp"
+#include "LocalMarkerGraph0.hpp"
#include "Reads.hpp"
#include "timestamp.hpp"
using namespace shasta;
+using namespace mode0;
// Spoa.
#include "spoa/spoa.hpp"
@@ -77,7 +79,7 @@ void Assembler::createMarkerGraphVertices(
// Check that we have what we need.
reads->checkReadsAreOpen();
reads->checkReadFlagsAreOpen();
- checkKmersAreOpen();
+ SHASTA_ASSERT(kmerChecker);
checkMarkersAreOpen();
checkAlignmentDataAreOpen();
SHASTA_ASSERT(compressedAlignments.isOpen());
@@ -221,6 +223,22 @@ void Assembler::createMarkerGraphVertices(
++histogram[markerCount];
}
+ // Store the disjoint sets histogram in a MemoryMapped::Vector.
+ // This is used when flagging primary marker graph edges for Mode 3 assembly.
+ // This stored pairs(coverage, frequency).
+ // Only pairs where the frequency is not zero are stored.
+ {
+ markerGraph.disjointSetsHistogram.createNew(
+ largeDataName("DisjointSetsHistogram"),
+ largeDataPageSize);
+ for(uint64_t coverage=0; coverage<histogram.size(); coverage++) {
+ const uint64_t frequency = histogram[coverage];
+ if(frequency) {
+ markerGraph.disjointSetsHistogram.push_back({coverage, frequency});
+ }
+ }
+ }
+
ofstream csv("DisjointSetsHistogram.csv");
csv << "Coverage,Frequency\n";
for(uint64_t coverage=0; coverage<histogram.size(); coverage++) {
@@ -592,7 +610,6 @@ void Assembler::createMarkerGraphVerticesThreadFunction1(size_t threadId)
const uint32_t ordinal1 = p[1];
const MarkerId markerId0 = getMarkerId(orientedReadIds[0], ordinal0);
const MarkerId markerId1 = getMarkerId(orientedReadIds[1], ordinal1);
- SHASTA_ASSERT(markers.begin()[markerId0].kmerId == markers.begin()[markerId1].kmerId);
disjointSetsPointer->unite(markerId0, markerId1);
// Also merge the reverse complemented markers.
@@ -1238,6 +1255,11 @@ void Assembler::accessMarkerGraphReverseComplementVertex(bool readWriteAccess)
readWriteAccess);
}
+void Assembler::accessDisjointSetsHistogram()
+{
+ markerGraph.disjointSetsHistogram.accessExistingReadOnly(largeDataName("DisjointSetsHistogram"));
+}
+
// Find the reverse complement of each marker graph edge.
@@ -1489,12 +1511,13 @@ void Assembler::checkMarkerGraphIsStrandSymmetricThreadFunction2(size_t threadId
const MarkerGraph::Edge& edge0 = markerGraph.edges[e0];
const MarkerGraph::Edge& edge1 = markerGraph.edges[e1];
- SHASTA_ASSERT(edge0.coverage == edge1.coverage);
+ SHASTA_ASSERT(markerGraph.edgeCoverage(e0) == markerGraph.edgeCoverage(e1));
SHASTA_ASSERT(
edge0.wasRemovedByTransitiveReduction
== edge1.wasRemovedByTransitiveReduction);
SHASTA_ASSERT(edge0.wasPruned == edge1.wasPruned);
SHASTA_ASSERT(edge0.isSuperBubbleEdge == edge1.isSuperBubbleEdge);
+ SHASTA_ASSERT(edge0.isLowCoverageCrossEdge == edge1.isLowCoverageCrossEdge);
#if 0
// This portion does not work if parallel edges are present,
@@ -1618,103 +1641,6 @@ void Assembler::writeBadMarkerGraphVertices() const
-// Compute marker graph vertex coverage statistics by KmerId.
-void Assembler::vertexCoverageStatisticsByKmerId() const
-{
- // Check that we have what we need.
- checkKmersAreOpen();
- checkMarkersAreOpen();
- checkMarkerGraphVerticesAreAvailable();
-
- const uint64_t k = assemblerInfo->k;
-
- // For each KmerId, maintain a histogram by coverage.
- vector< vector<uint64_t> > histogram(kmerTable.size());
-
- // Loop over all marker graph vertices.
- for(MarkerGraph::VertexId vertexId=0; vertexId!=markerGraph.vertexCount(); vertexId++) {
-
- // Get the markers for this vertex.
- const span<const MarkerId> markerIds = markerGraph.getVertexMarkerIds(vertexId);
- const uint64_t coverage = markerIds.size();
- SHASTA_ASSERT(coverage > 0);
-
- // Find the KmerId.
- const MarkerId firstMarkerId = markerIds.front();
- const CompressedMarker& compressedMarker = markers.begin()[firstMarkerId];
- const KmerId kmerId = compressedMarker.kmerId;
-
- // Increment the histogram.
- SHASTA_ASSERT(kmerId < histogram.size());
- vector<uint64_t>& h = histogram[kmerId];
- if(h.size() <= coverage) {
- h.resize(coverage + 1, 0ULL);
- }
- ++h[coverage];
- }
-
-
-
- // Find the maximum histogram size for any k-mer.
- uint64_t hMaxSize = 0ULL;
- for(uint64_t kmerId=0; kmerId<kmerTable.size(); kmerId++) {
- if(not kmerTable[kmerId].isMarker) {
- continue;
- }
- if(not kmerTable[kmerId].isRleKmer) {
- continue;
- }
- const vector<uint64_t>& h = histogram[kmerId];
- hMaxSize = max(hMaxSize, uint64_t(h.size()));
- }
-
-
-
- // Write it out.
- ofstream csv("VertexCoverageByKmerId.csv");
- csv << "Kmer,Total,";
- for(uint64_t coverage=1; coverage<hMaxSize; coverage++) {
- csv << coverage << ",";
- }
- csv << "\n";
- for(uint64_t kmerId=0; kmerId<kmerTable.size(); kmerId++) {
- if(not kmerTable[kmerId].isMarker) {
- continue;
- }
- if(not kmerTable[kmerId].isRleKmer) {
- continue;
- }
- const Kmer kmer(kmerId, k);
-
- // Compute the total number of markers with this k-mer
- // that are associated with a vertex.
- const vector<uint64_t>& h = histogram[kmerId];
- uint64_t totalMarkerCount = 0ULL;
- for(uint64_t coverage=1; coverage<hMaxSize; coverage++) {
- uint64_t vertexCount = 0;
- if(coverage < h.size()) {
- vertexCount = h[coverage];
- }
- const uint64_t markerCount = coverage * vertexCount;
- totalMarkerCount += markerCount;
- }
-
- kmer.write(csv, k);
- csv << "," << totalMarkerCount << ",";
- for(uint64_t coverage=1; coverage<hMaxSize; coverage++) {
- uint64_t vertexCount = 0;
- if(coverage < h.size()) {
- vertexCount = h[coverage];
- }
- const uint64_t markerCount = coverage * vertexCount;
- csv << markerCount << ",";
- }
- csv << "\n";
- }
-}
-
-
-
bool Assembler::extractLocalMarkerGraph(
OrientedReadId orientedReadId,
uint32_t ordinal,
@@ -1727,7 +1653,7 @@ bool Assembler::extractLocalMarkerGraph(
bool useSuperBubbleEdges,
bool useLowCoverageCrossEdges,
bool useRemovedSecondaryEdges,
- LocalMarkerGraph& graph
+ LocalMarkerGraph0& graph
)
{
const MarkerGraph::VertexId startVertexId =
@@ -1758,7 +1684,7 @@ bool Assembler::extractLocalMarkerGraph(
bool useSuperBubbleEdges,
bool useLowCoverageCrossEdges,
bool useRemovedSecondaryEdges,
- LocalMarkerGraph& graph
+ LocalMarkerGraph0& graph
)
{
// Sanity check.
@@ -1766,8 +1692,8 @@ bool Assembler::extractLocalMarkerGraph(
// Some shorthands.
AssemblyGraph& assemblyGraph = *assemblyGraphPointer;
- using vertex_descriptor = LocalMarkerGraph::vertex_descriptor;
- using edge_descriptor = LocalMarkerGraph::edge_descriptor;
+ using vertex_descriptor = LocalMarkerGraph0::vertex_descriptor;
+ using edge_descriptor = LocalMarkerGraph0::edge_descriptor;
// Start a timer.
const auto startTime = steady_clock::now();
@@ -1800,7 +1726,7 @@ bool Assembler::extractLocalMarkerGraph(
// Dequeue a vertex.
const vertex_descriptor v0 = q.front();
q.pop();
- const LocalMarkerGraphVertex& vertex0 = graph[v0];
+ const LocalMarkerGraph0Vertex& vertex0 = graph[v0];
const MarkerGraph::VertexId vertexId0 = vertex0.vertexId;
const uint64_t distance0 = vertex0.distance;
const uint64_t distance1 = distance0 + 1;
@@ -1903,8 +1829,8 @@ bool Assembler::extractLocalMarkerGraph(
// Create edges.
- BGL_FORALL_VERTICES(v0, graph, LocalMarkerGraph) {
- const LocalMarkerGraphVertex& vertex0 = graph[v0];
+ BGL_FORALL_VERTICES(v0, graph, LocalMarkerGraph0) {
+ const LocalMarkerGraph0Vertex& vertex0 = graph[v0];
const MarkerGraph::VertexId vertexId0 = vertex0.vertexId;
// Loop over the children that exist in the local marker graph
@@ -1979,8 +1905,8 @@ bool Assembler::extractLocalMarkerGraph(
// Store consensus repeat counts for all vertices.
if(markerGraph.vertexRepeatCounts.isOpen) {
const size_t k = assemblerInfo->k;
- BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph) {
- LocalMarkerGraphVertex& vertex = graph[v];
+ BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph0) {
+ LocalMarkerGraph0Vertex& vertex = graph[v];
vertex.storedConsensusRepeatCounts.resize(k);
const uint8_t* begin = markerGraph.vertexRepeatCounts.begin() + k * vertex.vertexId;
copy(begin, begin+k, vertex.storedConsensusRepeatCounts.begin());
@@ -2004,8 +1930,8 @@ bool Assembler::extractLocalMarkerGraph(
const int8_t gap = -1;
auto spoaAlignmentEngine = spoa::AlignmentEngine::Create(alignmentType, match, mismatch, gap);
spoa::Graph spoaAlignmentGraph;
- BGL_FORALL_EDGES(e, graph, LocalMarkerGraph) {
- LocalMarkerGraphEdge& edge = graph[e];
+ BGL_FORALL_EDGES(e, graph, LocalMarkerGraph0) {
+ LocalMarkerGraph0Edge& edge = graph[e];
ComputeMarkerGraphEdgeConsensusSequenceUsingSpoaDetail detail;
computeMarkerGraphEdgeConsensusSequenceUsingSpoa(
edge.edgeId,
@@ -2156,12 +2082,6 @@ void Assembler::createMarkerGraphEdgesThreadFunction0(size_t threadId)
const auto vertex1 = p.first;
const auto& markerIntervals = p.second;
edge.target = vertex1;
- size_t coverage = markerIntervals.size();
- if(coverage < 256) {
- edge.coverage = uint8_t(coverage);
- } else {
- edge.coverage = 255;
- }
// Store the edge.
thisThreadEdges.push_back(edge);
@@ -2302,34 +2222,41 @@ void Assembler::transitiveReduction(
edge.isSuperBubbleEdge = 0;
}
- // Gather edges for each coverage less than highCoverageThreshold.
+ // Compute maximum edge coverage.
+ uint64_t maximumEdgeCoverage = 0;
+ for(EdgeId edgeId=0; edgeId!=edges.size(); edgeId++) {
+ maximumEdgeCoverage = max(maximumEdgeCoverage, markerGraph.edgeCoverage(edgeId));
+ }
+ cout << "Maximum edge coverage is " << maximumEdgeCoverage << endl;
+
+
+
+ // Gather edges for each coverage up to maximumEdgeCoverage.
// Only add to the list those with id less than the id of their reverse complement.
- MemoryMapped::VectorOfVectors<EdgeId, EdgeId> edgesByCoverage;
+ MemoryMapped::VectorOfVectors<EdgeId, EdgeId> edgesByCoverage;
edgesByCoverage.createNew(
largeDataName("tmp-flagMarkerGraphWeakEdges-edgesByCoverage"),
largeDataPageSize);
- edgesByCoverage.beginPass1(highCoverageThreshold);
+ edgesByCoverage.beginPass1(maximumEdgeCoverage + 1);
for(EdgeId edgeId=0; edgeId!=edges.size(); edgeId++) {
if (markerGraph.reverseComplementEdge[edgeId] < edgeId) {
continue;
}
- const MarkerGraph::Edge& edge = edges[edgeId];
- if(edge.coverage < highCoverageThreshold) {
- edgesByCoverage.incrementCount(edge.coverage);
- }
+ const uint64_t coverage = markerGraph.edgeCoverage(edgeId);
+ edgesByCoverage.incrementCount(coverage);
}
edgesByCoverage.beginPass2();
for(EdgeId edgeId=0; edgeId!=edges.size(); edgeId++) {
if (markerGraph.reverseComplementEdge[edgeId] < edgeId) {
continue;
}
- const MarkerGraph::Edge& edge = edges[edgeId];
- if(edge.coverage < highCoverageThreshold) {
- edgesByCoverage.store(edge.coverage, edgeId);
- }
+ const uint64_t coverage = markerGraph.edgeCoverage(edgeId);
+ edgesByCoverage.store(coverage, edgeId);
}
edgesByCoverage.endPass2();
+
+
// Check that there are no edges with coverage 0.
SHASTA_ASSERT(edgesByCoverage[0].size() == 0);
@@ -2351,12 +2278,8 @@ void Assembler::transitiveReduction(
// Flag as weak all edges with coverage <= lowCoverageThreshold
- for(size_t coverage=1; coverage<=lowCoverageThreshold; coverage++) {
+ for(size_t coverage=1; coverage<=min(lowCoverageThreshold, maximumEdgeCoverage); coverage++) {
const auto& edgesWithThisCoverage = edgesByCoverage[coverage];
- if(edgesWithThisCoverage.size() > 0) {
- cout << "Flagging as weak " << 2 * edgesWithThisCoverage.size() << " edges with coverage "
- << coverage << "." << endl;
- }
for(const EdgeId edgeId: edgesWithThisCoverage) {
edges[edgeId].wasRemovedByTransitiveReduction = 1;
edges[markerGraph.reverseComplementEdge[edgeId]].wasRemovedByTransitiveReduction = 1;
@@ -2393,12 +2316,11 @@ void Assembler::transitiveReduction(
// Process edges of intermediate coverage.
for(size_t coverage=lowCoverageThreshold+1;
- coverage<highCoverageThreshold; coverage++) {
+ coverage<min(highCoverageThreshold, maximumEdgeCoverage+1); coverage++) {
const auto& edgesWithThisCoverage = edgesByCoverage[coverage];
if(edgesWithThisCoverage.size() == 0) {
continue;
}
- size_t count = 0;
// Loop over edges with this coverage.
for(const EdgeId edgeId: edgesWithThisCoverage) {
@@ -2454,7 +2376,6 @@ void Assembler::transitiveReduction(
if(found) {
edges[edgeId].wasRemovedByTransitiveReduction = 1;
edges[markerGraph.reverseComplementEdge[edgeId]].wasRemovedByTransitiveReduction = 1;
- count += 2;
}
// Clean up to be ready to process the next edge.
@@ -2466,12 +2387,6 @@ void Assembler::transitiveReduction(
}
bfsVertices.clear();
}
-
- if(count) {
- cout << "Flagged as weak " << count <<
- " edges with coverage " << coverage <<
- " out of "<< 2*edgesWithThisCoverage.size() << " total." << endl;
- }
}
@@ -2500,179 +2415,6 @@ void Assembler::transitiveReduction(
-// Approximate reverse transitive reduction of the marker graph.
-// The goal is to remove local back-edges.
-// This works similarly to transitive reduction,
-// but in the opposite direction.
-// This does the following:
-// - Edges with coverage greater than lowCoverageThreshold
-// and less then highCoverageThreshold are processed in
-// ordered of increasing coverage:
-// * For each such edge A->B, we look for a path of length
-// at most maxDistance starting at B and ending at A
-// that does not use edge A->B and also does not use any
-// edges already marked wasRemovedByTransitiveReduction.
-// * If such a path is found, the edge is marked
-// wasRemovedByTransitiveReduction.
-void Assembler::reverseTransitiveReduction(
- size_t lowCoverageThreshold,
- size_t highCoverageThreshold,
- size_t maxDistance)
-{
- // Some shorthands for readability.
- auto& edges = markerGraph.edges;
- using VertexId = MarkerGraph::VertexId;
- using EdgeId = MarkerGraph::EdgeId;
- using Edge = MarkerGraph::Edge;
-
- // Initial message.
- cout << timestamp << "Reverse transitive reduction of the marker graph begins." << endl;
- cout << "The marker graph has " << markerGraph.vertexCount() << " vertices and ";
- cout << edges.size() << " edges." << endl;
-
- // Gather edges for each coverage less than highCoverageThreshold.
- // Only add to the list those with id less than the id of their reverse complement.
- MemoryMapped::VectorOfVectors<EdgeId, EdgeId> edgesByCoverage;
- edgesByCoverage.createNew(
- largeDataName("tmp-flagMarkerGraphWeakEdges-edgesByCoverage"),
- largeDataPageSize);
- edgesByCoverage.beginPass1(highCoverageThreshold);
- for(EdgeId edgeId=0; edgeId!=edges.size(); edgeId++) {
- if (markerGraph.reverseComplementEdge[edgeId] < edgeId) {
- continue;
- }
- const MarkerGraph::Edge& edge = edges[edgeId];
- if(edge.coverage>lowCoverageThreshold && edge.coverage<highCoverageThreshold) {
- edgesByCoverage.incrementCount(edge.coverage);
- }
- }
- edgesByCoverage.beginPass2();
- for(EdgeId edgeId=0; edgeId!=edges.size(); edgeId++) {
- if (markerGraph.reverseComplementEdge[edgeId] < edgeId) {
- continue;
- }
- const MarkerGraph::Edge& edge = edges[edgeId];
- if(edge.coverage>lowCoverageThreshold && edge.coverage<highCoverageThreshold) {
- edgesByCoverage.store(edge.coverage, edgeId);
- }
- }
- edgesByCoverage.endPass2();
-
- // Vector to contain vertex distances during each BFS.
- // Is is set to -1 for vertices not reached by the BFS.
- MemoryMapped::Vector<int> vertexDistances;
- vertexDistances.createNew(
- largeDataName("tmp-flagMarkerGraphWeakEdges-vertexDistances"),
- largeDataPageSize);
- vertexDistances.resize(markerGraph.vertexCount());
- fill(vertexDistances.begin(), vertexDistances.end(), -1);
-
- // Queue to be used for all BFSs.
- std::queue<VertexId> q;
-
- // Vector to store vertices encountered during a BFS.
- vector<VertexId> bfsVertices;
-
-
-
- // Process edges in the specified coverage range.
- size_t removedCount = 0;
- for(size_t coverage=lowCoverageThreshold+1;
- coverage<highCoverageThreshold; coverage++) {
- const auto& edgesWithThisCoverage = edgesByCoverage[coverage];
- if(edgesWithThisCoverage.size() == 0) {
- continue;
- }
- size_t count = 0;
-
- // Loop over edges with this coverage.
- for(const EdgeId edgeId: edgesWithThisCoverage) {
- const Edge& edge = edges[edgeId];
- if(edge.wasRemovedByTransitiveReduction) {
- continue;
- }
- const VertexId u0 = edge.target;
- const VertexId u1 = edge.source;
-
- // Do a forward BFS starting at u0, up to distance maxDistance,
- // using only edges currently marked as strong
- // and without using this edge.
- // If we encounter u1, u1 is reachable from u0 without
- // using this edge, and so we can mark this edge as weak.
- q.push(u0);
- vertexDistances[u0] = 0;
- bfsVertices.push_back(u0);
- bool found = false;
- while(!q.empty()) {
- const VertexId v0 = q.front();
- q.pop();
- const int distance0 = vertexDistances[v0];
- const int distance1 = distance0 + 1;
- for(const auto edgeId01: markerGraph.edgesBySource[v0]) {
- if(edgeId01 == edgeId) {
- continue;
- }
- const Edge& edge01 = markerGraph.edges[edgeId01];
- if(edge01.wasRemovedByTransitiveReduction) {
- continue;
- }
- const VertexId v1 = edge01.target;
- if(vertexDistances[v1] >= 0) {
- continue; // We already encountered this vertex.
- }
- if(v1 == u1) {
- // We found it!
- found = true;
- break;
- }
- vertexDistances[v1] = distance1;
- bfsVertices.push_back(v1);
- if(distance1 < int(maxDistance)) {
- q.push(v1);
- }
- }
- if(found) {
- break;
- }
- }
-
- if(found) {
- edges[edgeId].wasRemovedByTransitiveReduction = 1;
- edges[markerGraph.reverseComplementEdge[edgeId]].wasRemovedByTransitiveReduction = 1;
- count += 2;
- }
-
- // Clean up to be ready to process the next edge.
- while(!q.empty()) {
- q.pop();
- }
- for(const VertexId v: bfsVertices) {
- vertexDistances[v] = -1;
- }
- bfsVertices.clear();
- }
-
- if(count) {
- cout << timestamp << "Reverse transitive reduction removed " << count <<
- " edges with coverage " << coverage <<
- " out of "<< 2*edgesWithThisCoverage.size() << " total." << endl;
- }
- removedCount += count;
- }
- cout << timestamp << "Reverse transitive reduction removed " << removedCount <<" edges." << endl;
-
-
- // Clean up our work areas.
- edgesByCoverage.remove();
- // edgeFlags.remove();
- vertexDistances.remove();
-
- cout << timestamp << "Reverse transitive reduction of the marker graph ends." << endl;
-
-}
-
-
-
// Return true if an edge disconnects the local subgraph.
bool Assembler::markerGraphEdgeDisconnectsLocalStrongSubgraph(
MarkerGraph::EdgeId startEdgeId,
@@ -4514,7 +4256,7 @@ void Assembler::assembleMarkerGraphVertices(size_t threadCount)
SHASTA_ASSERT(assemblerInfo->readRepresentation == 1);
// Check that we have what we need.
- checkKmersAreOpen();
+ SHASTA_ASSERT(kmerChecker);
reads->checkReadsAreOpen();
checkMarkersAreOpen();
checkMarkerGraphVerticesAreAvailable();
@@ -4582,7 +4324,7 @@ void Assembler::computeMarkerGraphVerticesCoverageData(size_t threadCount)
performanceLog << timestamp<< "computeMarkerGraphVerticesCoverageData begins." << endl;
// Check that we have what we need.
- checkKmersAreOpen();
+ SHASTA_ASSERT(kmerChecker);
reads->checkReadsAreOpen();
checkMarkersAreOpen();
checkMarkerGraphVerticesAreAvailable();
@@ -4753,7 +4495,7 @@ void Assembler::assembleMarkerGraphEdges(
performanceLog << timestamp << "assembleMarkerGraphEdges begins." << endl;
// Check that we have what we need.
- checkKmersAreOpen();
+ SHASTA_ASSERT(kmerChecker);
reads->checkReadsAreOpen();
checkMarkersAreOpen();
checkMarkerGraphVerticesAreAvailable();
@@ -4997,14 +4739,19 @@ void Assembler::assembleMarkerGraphEdgesThreadFunction(size_t threadId)
void Assembler::accessMarkerGraphConsensus()
{
- if(assemblerInfo->readRepresentation == 1) {
- markerGraph.vertexRepeatCounts.accessExistingReadOnly(
- largeDataName("MarkerGraphVertexRepeatCounts"));
+ if(assemblerInfo->assemblyMode == 3) {
+ markerGraph.edgeSequence.accessExistingReadOnly(largeDataName("MarkerGraphEdgesSequence"));
+
+ } else {
+ if(assemblerInfo->readRepresentation == 1) {
+ markerGraph.vertexRepeatCounts.accessExistingReadOnly(
+ largeDataName("MarkerGraphVertexRepeatCounts"));
+ }
+ markerGraph.edgeConsensus.accessExistingReadOnly(
+ largeDataName("MarkerGraphEdgesConsensus"));
+ markerGraph.edgeConsensusOverlappingBaseCount.accessExistingReadOnly(
+ largeDataName("MarkerGraphEdgesConsensusOverlappingBaseCount"));
}
- markerGraph.edgeConsensus.accessExistingReadOnly(
- largeDataName("MarkerGraphEdgesConsensus"));
- markerGraph.edgeConsensusOverlappingBaseCount.accessExistingReadOnly(
- largeDataName("MarkerGraphEdgesConsensusOverlappingBaseCount"));
}
@@ -5065,7 +4812,8 @@ void Assembler::computeMarkerGraphCoverageHistogram()
// Edges.
vector<uint64_t> edgeCoverageHistogram;
- for(const MarkerGraph::Edge& edge: markerGraph.edges) {
+ for(MarkerGraphEdgeId edgeId=0; edgeId<markerGraph.edges.size(); edgeId++) {
+ const MarkerGraph::Edge& edge = markerGraph.edges[edgeId];
// If this edge was removed, skip it.
if(edge.wasRemoved()) {
@@ -5073,7 +4821,7 @@ void Assembler::computeMarkerGraphCoverageHistogram()
}
// Increment the histogram.
- const size_t coverage = edge.coverage;
+ const size_t coverage = markerGraph.edgeCoverage(edgeId);
if(coverage >= edgeCoverageHistogram.size()) {
edgeCoverageHistogram.resize(coverage+1, 0);
}
@@ -5410,153 +5158,12 @@ void Assembler::debugWriteMarkerGraph(const string& fileNamePrefix) const
-// Assemble the RLE sequence of a path of the marker graph, under the assumption
-// that, for each edge, all oriented reads have exactly the same sequence.
-// This will be the case if edges were created by Assembler::createMarkerGraphEdgesStrict.
-void Assembler::assembleMarkerGraphPathRleStrict(
- span<const MarkerGraph::EdgeId> path,
- vector<Base>& rleSequence
- ) const
-{
- using VertexId = MarkerGraph::VertexId;
- using EdgeId = MarkerGraph::EdgeId;
- const uint64_t k = assemblerInfo->k;
-
- // Start with no sequence.
- rleSequence.clear();
- if(path.empty()) {
- return;
- }
-
- // Add the RLE sequence of the first vertex.
- VertexId v0 = markerGraph.edges[path.front()].source;
- const MarkerId firstMarkerId = markerGraph.getVertexMarkerIds(v0)[0];
- const CompressedMarker& firstMarker = markers.begin()[firstMarkerId];
- const KmerId kmerId = firstMarker.kmerId;
- const Kmer kmer(kmerId, k);
- for(uint64_t i=0; i<k; i++) {
- rleSequence.push_back(kmer[i]);
- }
-
-
-
- // Loop over edges of the path.
- for(const EdgeId edgeId: path) {
- const MarkerGraph::Edge& edge = markerGraph.edges[edgeId];
- SHASTA_ASSERT(edge.source == v0);
- const VertexId v1 = edge.target;
-
- const span<const MarkerInterval> markerIntervals = markerGraph.edgeMarkerIntervals[edgeId];
- SHASTA_ASSERT(not markerIntervals.empty());
-
- // Get the RLE sequence and check that all the MarkerIntervals agree.
- // This will be the case if edges were created by Assembler::createMarkerGraphEdgesStrict.
- uint64_t overlappingRleBaseCount;
- vector<Base> edgeRleSequence;
- getMarkerIntervalRleSequence(
- markerIntervals.front(),
- overlappingRleBaseCount,
- edgeRleSequence);
- uint64_t markerIntervalOverlappingRleBaseCount;
- vector<Base> markerIntervalRleSequence;
- for(const MarkerInterval& markerInterval: markerIntervals) {
- getMarkerIntervalRleSequence(
- markerInterval,
- markerIntervalOverlappingRleBaseCount,
- markerIntervalRleSequence);
- SHASTA_ASSERT(markerIntervalOverlappingRleBaseCount == overlappingRleBaseCount);
- SHASTA_ASSERT(markerIntervalRleSequence == edgeRleSequence);
- }
-
-
-
- // Construct the sequence of the v1 vertex.
- const MarkerId markerId1 = markerGraph.getVertexMarkerIds(v1)[0];
- const CompressedMarker& marker1 = markers.begin()[markerId1];
- const KmerId kmerId1 = marker1.kmerId;
- const Kmer kmer1(kmerId1, k);
-
-
- // Add the sequence of this edge and the v1 vertex.
- if(overlappingRleBaseCount == 0) {
-
- // There is no overlap.
-
- // Add the edge sequence.
- copy(edgeRleSequence.begin(), edgeRleSequence.end(), back_inserter(rleSequence));
-
- // Add the entire sequence of v1.
- for(uint64_t i=0; i<k; i++) {
- rleSequence.push_back(kmer1[i]);
- }
-
- } else {
-
- // There is overlap.
- // Add the sequence of v1, excluding the overlapping bases.
- for(uint64_t i=overlappingRleBaseCount; i<k; i++) {
- rleSequence.push_back(kmer1[i]);
- }
- }
-
-
- // Prepare to process the next edge.
- v0 = v1;
- }
-}
-
-
-
-void Assembler::assembleAssemblyGraphEdgeRleStrict(
- AssemblyGraph::EdgeId edgeId,
- vector<Base>& rleSequence
-) const
-{
- const AssemblyGraph& assemblyGraph = *assemblyGraphPointer;
- assembleMarkerGraphPathRleStrict(
- assemblyGraph.edgeLists[edgeId],
- rleSequence);
-}
-
-
-
-// Get the RLE sequence implied by a MarkerInterval.
-// If the markers overlap, returns the number of
-// overlapping RLE bases in overlappingRleBaseCount
-// and empty rleSequence.
-// Otherwise, returns zero overlappingRleBaseCount
-// and the intervening sequence in rleSequence
-// (which can be empty if the two markers are exactly adjacent).
-void Assembler::getMarkerIntervalRleSequence(
- const MarkerInterval& markerInterval,
- uint64_t& overlappingRleBaseCount,
- vector<Base>& rleSequence) const
+// Find the common KmerId for all the markers of a marker graph vertex.
+KmerId Assembler::getMarkerGraphVertexKmerId(MarkerGraphVertexId vertexId) const
{
- const uint64_t k = assemblerInfo->k;
- const OrientedReadId orientedReadId = markerInterval.orientedReadId;
-
- // Extract the k-mers and their RLE positions in this oriented read.
- array<Kmer, 2> kmers;
- array<uint32_t, 2> positions;
- for(uint64_t i=0; i<2; i++) {
- const MarkerId markerId = getMarkerId(orientedReadId, markerInterval.ordinals[i]);
- const CompressedMarker& compressedMarker = markers.begin()[markerId];
- kmers[i] = Kmer(compressedMarker.kmerId, k);
- positions[i] = compressedMarker.position;
- }
-
-
- if(positions[1] < positions[0] + k) {
- // The two markers overlap.
- overlappingRleBaseCount = (positions[0] + k) - positions[1];
- rleSequence.clear();
- } else {
- // The two markers don't overlap.
- overlappingRleBaseCount = 0;
- rleSequence.clear();
- for(uint32_t position=positions[0]+uint32_t(k); position<positions[1]; position++) {
- rleSequence.push_back(getReads().getOrientedReadBase(orientedReadId, position));
- }
- }
+ return markerGraph.getVertexKmerId(
+ vertexId,
+ assemblerInfo->k,
+ *reads,
+ markers);
}
-
diff --git a/src/AssemblerMarkerGraphEdges.cpp b/src/AssemblerMarkerGraphEdges.cpp
index b5f6275..c3bd1e6 100644
--- a/src/AssemblerMarkerGraphEdges.cpp
+++ b/src/AssemblerMarkerGraphEdges.cpp
@@ -332,15 +332,11 @@ void Assembler::createMarkerGraphEdgesStrictPass3(size_t threadId)
if( (strandCoverage[0] >= minEdgeCoveragePerStrand) and
(strandCoverage[1] >= minEdgeCoveragePerStrand)) {
- // If getting here, we actually generate an edge.
- uint64_t coverage = candidateEdge.size();
-
// Store the edge.
MarkerGraph::Edge edge;
edge.clearFlags();
edge.source = vertexId0;
edge.target = vertexId1;
- edge.coverage = (coverage > 255) ? 255 : uint8_t(coverage);
thisThreadEdges.push_back(edge);
// Store the marker intervals.
@@ -572,12 +568,6 @@ void Assembler::createMarkerGraphSecondaryEdges(
MarkerGraph::Edge edge;
edge.source = v0;
edge.target = v1;
- const uint64_t coverage = markerIntervals.size();
- if(coverage < 256) {
- edge.coverage = uint8_t(coverage);
- } else {
- edge.coverage = 255;
- }
edge.isSecondary = 1;
markerGraph.edges.push_back(edge);
markerGraph.edgeMarkerIntervals.appendVector(markerIntervals);
@@ -630,10 +620,8 @@ vector< vector<uint64_t> > Assembler::clusterMarkerGraphEdgeOrientedReads(
// The length of each marker sequence.
const size_t k = assemblerInfo->k;
- const MarkerGraph::Edge& edge = markerGraph.edges[edgeId];
const span<const MarkerInterval> markerIntervals = markerGraph.edgeMarkerIntervals[edgeId];
const uint64_t n = markerIntervals.size();
- SHASTA_ASSERT(edge.coverage == n);
@@ -811,7 +799,6 @@ void Assembler::splitMarkerGraphSecondaryEdges(
auto& newEdge = markerGraph.edges.back();
newEdge.source = Uint40(tmpNewEdge.source);
newEdge.target = Uint40(tmpNewEdge.target);
- newEdge.coverage = uint8_t(tmpNewEdge.markerIntervals.size());
newEdge.isSecondary = 1;
markerGraph.edgeMarkerIntervals.appendVector(tmpNewEdge.markerIntervals);
}
@@ -950,3 +937,546 @@ void Assembler::splitMarkerGraphSecondaryEdgesThreadFunction(size_t threadId)
__sync_fetch_and_add(&data.splitCount, splitCount);
__sync_fetch_and_add(&data.createdCount, createdCount);
}
+
+
+
+
+// Assemble Mode 3 sequence for all marker graph edges.
+// See the comments before MarkerGraph::edgeSequence for more information.
+// The sequence of each edge is simply obtained from the first
+// marker interval of the edge.
+// For now this is done sequentially.
+void Assembler::assembleMarkerGraphEdgesMode3()
+{
+ const uint64_t k = assemblerInfo->k;
+ SHASTA_ASSERT((k % 2) == 0);
+ const uint64_t kHalf = k / 2;
+ SHASTA_ASSERT(getReads().representation == 0);
+
+ markerGraph.edgeSequence.createNew(
+ largeDataName("MarkerGraphEdgesSequence"), largeDataPageSize);
+
+ // Loop over all marker graph edges.
+ for(MarkerGraphEdgeId edgeId=0; edgeId<markerGraph.edges.size(); edgeId++) {
+ markerGraph.edgeSequence.appendVector();
+
+ // Get the first marker interval for this edge.
+ const span<const MarkerInterval> markerIntervals = markerGraph.edgeMarkerIntervals[edgeId];
+ const MarkerInterval& firstMarkerInterval = markerIntervals.front();
+ const OrientedReadId orientedReadId = firstMarkerInterval.orientedReadId;
+ const uint64_t ordinal0 = firstMarkerInterval.ordinals[0];
+ const uint64_t ordinal1 = firstMarkerInterval.ordinals[1];
+
+ // Get the position interval on the oriented read that corresponds to this
+ // marker interval, including k/2 bases on each of the adjacent markers.
+ const span<const CompressedMarker> orientedReadMarkers = markers[orientedReadId.getValue()];
+ const uint64_t positionBegin = orientedReadMarkers[ordinal0].position + kHalf;
+ const uint64_t positionEnd = orientedReadMarkers[ordinal1].position + kHalf;
+
+ for(uint64_t position=positionBegin; position!=positionEnd; position++) {
+ const Base base = getReads().getOrientedReadBase(orientedReadId, uint32_t(position));
+ markerGraph.edgeSequence.append(base);
+ }
+ }
+
+}
+
+
+
+// Analyze and compare the read compositions of two marker graph edges.
+// This can only be done if the two edges have no duplicate OrientedReadIds
+// in the markers. In that case, each OrientedReadId of an edge
+// corresponds to one and only one markerInterval for each edge.
+bool Assembler::analyzeMarkerGraphEdgePair(
+ MarkerGraphEdgeId edgeIdA,
+ MarkerGraphEdgeId edgeIdB,
+ MarkerGraphEdgePairInfo& info
+ ) const
+{
+
+ // Check for duplicate OrientedReadIds on the two edges.
+ if(markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdA)) {
+ return false;
+ }
+ if(markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdB)) {
+ return false;
+ }
+
+ // Prepare for the joint loop over OrientedReadIds of the two edges.
+ const auto markerIntervalsA = markerGraph.edgeMarkerIntervals[edgeIdA];
+ const auto markerIntervalsB = markerGraph.edgeMarkerIntervals[edgeIdB];
+ const auto beginA = markerIntervalsA.begin();
+ const auto beginB = markerIntervalsB.begin();
+ const auto endA = markerIntervalsA.end();
+ const auto endB = markerIntervalsB.end();
+
+ // Store the total number of OrientedReadIds on the two edges.
+ info.totalA = endA - beginA;
+ info.totalB = endB - beginB;
+
+
+
+ // Joint loop over the MarkerIntervals of the two edges,
+ // to count the common reads and compute average offsets.
+ info.common = 0;
+ int64_t sumMarkerOffsets = 0;
+ int64_t sumTwiceBaseOffsets = 0;
+ auto itA = beginA;
+ auto itB = beginB;
+ while(itA != endA and itB != endB) {
+
+ if(itA->orientedReadId < itB->orientedReadId) {
+ ++itA;
+ continue;
+ }
+
+ if(itB->orientedReadId < itA->orientedReadId) {
+ ++itB;
+ continue;
+ }
+
+ // We found a common OrientedReadId.
+ ++info.common;
+ const OrientedReadId orientedReadId = itA->orientedReadId;
+ const auto orientedReadMarkers = markers[orientedReadId.getValue()];
+
+ // Compute the offset in markers.
+ SHASTA_ASSERT(itA->ordinals[1] == itA->ordinals[0] + 1);
+ SHASTA_ASSERT(itB->ordinals[1] == itB->ordinals[0] + 1);
+ const uint32_t ordinalA = itA->ordinals[0];
+ const uint32_t ordinalB = itB->ordinals[0];
+ const int64_t markerOffset = int64_t(ordinalB) - int64_t(ordinalA);
+ sumMarkerOffsets += markerOffset;
+
+ // Compute the offset in bases.
+ const int64_t positionA0 = int64_t(orientedReadMarkers[ordinalA].position);
+ const int64_t positionA1 = int64_t(orientedReadMarkers[ordinalA+1].position);
+ const int64_t positionB0 = int64_t(orientedReadMarkers[ordinalB].position);
+ const int64_t positionB1 = int64_t(orientedReadMarkers[ordinalB+1].position);
+ sumTwiceBaseOffsets -= positionA0;
+ sumTwiceBaseOffsets -= positionA1;
+ sumTwiceBaseOffsets += positionB0;
+ sumTwiceBaseOffsets += positionB1;
+
+ // Continue the joint loop.
+ ++itA;
+ ++itB;
+
+ }
+ info.onlyA = info.totalA - info.common;
+ info.onlyB = info.totalB - info.common;
+
+ // If there are no common reads, this is all we can do.
+ if(info.common == 0) {
+ info.offsetInMarkers = invalid<int64_t>;
+ info.offsetInBases = invalid<int64_t>;
+ info.onlyAShort = invalid<uint64_t>;
+ info.onlyBShort = invalid<uint64_t>;
+ return true;
+ }
+
+ // Compute the estimated offsets.
+ info.offsetInMarkers = int64_t(std::round(double(sumMarkerOffsets) / double(info.common)));
+ info.offsetInBases = int64_t(0.5 * std::round(double(sumTwiceBaseOffsets) / double(info.common)));
+
+ // Now do the joint loop again, and count the onlyA and onlyB oriented reads
+ // that are too short to appear in the other edge.
+ itA = beginA;
+ itB = beginB;
+ uint64_t onlyACheck = 0;
+ uint64_t onlyBCheck = 0;
+ info.onlyAShort = 0;
+ info.onlyBShort = 0;
+ while(true) {
+ if(itA == endA and itB == endB) {
+ break;
+ }
+
+ else if(itB == endB or ((itA!=endA) and (itA->orientedReadId < itB->orientedReadId))) {
+ // This oriented read only appears in edge A.
+ ++onlyACheck;
+ const OrientedReadId orientedReadId = itA->orientedReadId;
+ const auto orientedReadMarkers = markers[orientedReadId.getValue()];
+ const int64_t lengthInBases = int64_t(getReads().getReadRawSequenceLength(orientedReadId.getReadId()));
+
+ // Get the positions of edge A in this oriented read.
+ const uint32_t ordinalA0 = itA->ordinals[0];
+ const uint32_t ordinalA1 = itA->ordinals[1];
+ const int64_t positionA0 = int64_t(orientedReadMarkers[ordinalA0].position);
+ const int64_t positionA1 = int64_t(orientedReadMarkers[ordinalA1].position);
+
+ // Find the hypothetical positions of edge B, assuming the estimated base offset.
+ const int64_t positionB0 = positionA0 + info.offsetInBases;
+ const int64_t positionB1 = positionA1 + info.offsetInBases;
+
+ // If this ends up outside the read, this counts as onlyAShort.
+ if(positionB0 < 0 or positionB1 >= lengthInBases) {
+ ++info.onlyAShort;
+ }
+
+ ++itA;
+ continue;
+ }
+
+ else if(itA == endA or ((itB!=endB) and (itB->orientedReadId < itA->orientedReadId))) {
+ // This oriented read only appears in edge B.
+ ++onlyBCheck;
+ const OrientedReadId orientedReadId = itB->orientedReadId;
+ const auto orientedReadMarkers = markers[orientedReadId.getValue()];
+ const int64_t lengthInBases = int64_t(getReads().getReadRawSequenceLength(orientedReadId.getReadId()));
+
+ // Get the positions of edge B in this oriented read.
+ const uint32_t ordinalB0 = itB->ordinals[0];
+ const uint32_t ordinalB1 = itB->ordinals[1];
+ const int64_t positionB0 = int64_t(orientedReadMarkers[ordinalB0].position);
+ const int64_t positionB1 = int64_t(orientedReadMarkers[ordinalB1].position);
+
+ // Find the hypothetical positions of edge A, assuming the estimated base offset.
+ const int64_t positionA0 = positionB0 - info.offsetInBases;
+ const int64_t positionA1 = positionB1 - info.offsetInBases;
+
+ // If this ends up outside the read, this counts as onlyBShort.
+ if(positionA0 < 0 or positionA1 >= lengthInBases) {
+ ++info.onlyBShort;
+ }
+
+ ++itB;
+ continue;
+ }
+
+ else {
+ // This oriented read appears in both edges. In this loop, we
+ // don't need to do anything.
+ ++itA;
+ ++itB;
+ }
+ }
+ SHASTA_ASSERT(onlyACheck == info.onlyA);
+ SHASTA_ASSERT(onlyBCheck == info.onlyB);
+
+
+ return true;
+}
+
+
+
+#if 0
+// More detailed analysis for a pair of marker graph edges,
+// both of which must be primary.
+void Assembler::analyzePrimaryMarkerGraphEdgePair(
+ MarkerGraphEdgeId edgeIdA,
+ MarkerGraphEdgeId edgeIdB) const
+{
+ cout << "analyzePrimaryMarkerGraphEdgePair begins for " << edgeIdA << " " << edgeIdB << endl;
+
+ // Sanity checks.
+ SHASTA_ASSERT(markerGraph.edges[edgeIdA].isPrimary == 1);
+ SHASTA_ASSERT(markerGraph.edges[edgeIdB].isPrimary == 1);
+
+ // The MarkerIntervals on these two edges.
+ const auto markerIntervalsA = markerGraph.edgeMarkerIntervals[edgeIdA];
+ const auto markerIntervalsB = markerGraph.edgeMarkerIntervals[edgeIdB];
+
+ // Find the position of edgeA on the primary journey of each oriented read on edgeA.
+ vector<uint64_t> positionInJourneyA(markerIntervalsA.size(), invalid<uint64_t>);
+ for(uint64_t i=0; i<markerIntervalsA.size(); i++) {
+ const OrientedReadId orientedReadId = markerIntervalsA[i].orientedReadId;
+ const auto journey = markerGraph.primaryJourneys[orientedReadId.getValue()];
+ for(uint64_t position=0; position<journey.size(); position++) {
+ if(journey[position].edgeId == edgeIdA) {
+ positionInJourneyA[i] = position;
+ break;
+ }
+ }
+ SHASTA_ASSERT(positionInJourneyA[i] != invalid<uint64_t>);
+ }
+
+ // Find the position of edgeB on the primary journey of each oriented read on edgeB.
+ vector<uint64_t> positionInJourneyB(markerIntervalsB.size(), invalid<uint64_t>);
+ for(uint64_t i=0; i<markerIntervalsB.size(); i++) {
+ const OrientedReadId orientedReadId = markerIntervalsB[i].orientedReadId;
+ const auto journey = markerGraph.primaryJourneys[orientedReadId.getValue()];
+ for(uint64_t position=0; position<journey.size(); position++) {
+ if(journey[position].edgeId == edgeIdB) {
+ positionInJourneyB[i] = position;
+ break;
+ }
+ }
+ SHASTA_ASSERT(positionInJourneyB[i] != invalid<uint64_t>);
+ }
+
+
+ // The MarkerGraphEdgeIds that we encountered so far by moving forward from edgeA on
+ // the primary journeys of oriented reads on edgeA.
+ std::set<MarkerGraphEdgeId> edgeIdsForwardA;
+
+ // The MarkerGraphEdgeIds that we encountered so far by moving backward from edgeB on
+ // the primary journeys of oriented reads on edgeB.
+ std::set<MarkerGraphEdgeId> edgeIdsBackwardB;
+
+ // Iterate over offsets in the primary journeys.
+ // For journeys of the oriented reads on edgeA, we use positive offsets.
+ // For journeys of the oriented reads on edgeB, we use negative offsets.
+ for(uint64_t offset=1; ; ++offset) {
+
+ uint64_t activeCountA = 0;
+ for(uint64_t i=0; i<markerIntervalsA.size(); i++) {
+ const OrientedReadId orientedReadId = markerIntervalsA[i].orientedReadId;
+ const auto journey = markerGraph.primaryJourneys[orientedReadId.getValue()];
+ const uint64_t position = positionInJourneyA[i] + offset;
+ if(position >= journey.size()) {
+ continue;
+ }
+ ++activeCountA;
+ const MarkerGraphEdgeId edgeId = journey[position].edgeId;
+
+ if(not edgeIdsForwardA.contains(edgeId)) {
+ edgeIdsForwardA.insert(edgeId);
+
+ if(edgeIdsBackwardB.contains(edgeId)) {
+ MarkerGraphEdgePairInfo infoA;
+ analyzeMarkerGraphEdgePair(edgeIdA, edgeId, infoA);
+ MarkerGraphEdgePairInfo infoB;
+ analyzeMarkerGraphEdgePair(edgeId, edgeIdB, infoB);
+ cout << "At offset " << offset << " found " << edgeId <<
+ ", common " << infoA.common << " " << infoB.common << ", total offset " <<
+ infoA.offsetInBases+ infoB.offsetInBases << endl;
+ }
+ }
+ }
+
+ uint64_t activeCountB = 0;
+ for(uint64_t i=0; i<markerIntervalsB.size(); i++) {
+ const OrientedReadId orientedReadId = markerIntervalsB[i].orientedReadId;
+ const auto journey = markerGraph.primaryJourneys[orientedReadId.getValue()];
+ if(offset > positionInJourneyB[i]) {
+ continue;
+ }
+ const uint64_t position = positionInJourneyB[i] - offset;
+ ++activeCountB;
+ const MarkerGraphEdgeId edgeId = journey[position].edgeId;
+
+ if(not edgeIdsBackwardB.contains(edgeId)) {
+ edgeIdsBackwardB.insert(edgeId);
+
+ if(edgeIdsForwardA.contains(edgeId)) {
+ MarkerGraphEdgePairInfo infoA;
+ analyzeMarkerGraphEdgePair(edgeIdA, edgeId, infoA);
+ MarkerGraphEdgePairInfo infoB;
+ analyzeMarkerGraphEdgePair(edgeId, edgeIdB, infoB);
+ cout << "At offset " << offset << " found " << edgeId <<
+ ", common " << infoA.common << " " << infoB.common << endl;
+ }
+ }
+ }
+
+ if(activeCountA == 0 or activeCountB == 0) {
+ break;
+ }
+ }
+}
+#endif
+
+
+
+// Estimate the offset, in bases, between two marker graph edges.
+// This assumes, WITHOUT CHECKING, that each of the two edges has no duplicate
+// oriented reads. This assumption is satisfied for primary marker graph edges
+// in Mode 3 assembly.
+// If there are common oriented reads between the two edges, this uses
+// analyzeMarkerGraphEdgePair.
+// This can fail, in which case it returns invalid<uint64_t>.
+uint64_t Assembler::estimateBaseOffsetUnsafe(
+ MarkerGraphEdgeId edgeIdA,
+ MarkerGraphEdgeId edgeIdB) const
+{
+ // If there are common oriented reads between the two edges, use
+ // analyzeMarkerGraphEdgePair. This is the most common case.
+ if(countCommonOrientedReadsUnsafe(edgeIdA, edgeIdB) > 0) {
+ MarkerGraphEdgePairInfo info;
+ SHASTA_ASSERT(analyzeMarkerGraphEdgePair(edgeIdA, edgeIdB, info));
+ if(info.offsetInBases >= 0) {
+ return info.offsetInBases;
+ } else {
+ return invalid<uint64_t>;
+ }
+ } else {
+ return invalid<uint64_t>;
+ }
+
+#if 0
+ // There are no common oriented reads between the two edges.
+ // Find a primary marker graph edge in-between that has common
+ // oriented reads with both edgeIdA and edgeIdB.
+
+ // Sanity checks.
+ SHASTA_ASSERT(markerGraph.edges[edgeIdA].isPrimary == 1);
+ SHASTA_ASSERT(markerGraph.edges[edgeIdB].isPrimary == 1);
+
+ // The MarkerIntervals on these two edges.
+ const auto markerIntervalsA = markerGraph.edgeMarkerIntervals[edgeIdA];
+ const auto markerIntervalsB = markerGraph.edgeMarkerIntervals[edgeIdB];
+
+ // Find the position of edgeA on the primary journey of each oriented read on edgeA.
+ vector<uint64_t> positionInJourneyA(markerIntervalsA.size(), invalid<uint64_t>);
+ for(uint64_t i=0; i<markerIntervalsA.size(); i++) {
+ const OrientedReadId orientedReadId = markerIntervalsA[i].orientedReadId;
+ const auto journey = markerGraph.primaryJourneys[orientedReadId.getValue()];
+ for(uint64_t position=0; position<journey.size(); position++) {
+ if(journey[position].edgeId == edgeIdA) {
+ positionInJourneyA[i] = position;
+ break;
+ }
+ }
+ SHASTA_ASSERT(positionInJourneyA[i] != invalid<uint64_t>);
+ }
+
+ // Find the position of edgeB on the primary journey of each oriented read on edgeB.
+ vector<uint64_t> positionInJourneyB(markerIntervalsB.size(), invalid<uint64_t>);
+ for(uint64_t i=0; i<markerIntervalsB.size(); i++) {
+ const OrientedReadId orientedReadId = markerIntervalsB[i].orientedReadId;
+ const auto journey = markerGraph.primaryJourneys[orientedReadId.getValue()];
+ for(uint64_t position=0; position<journey.size(); position++) {
+ if(journey[position].edgeId == edgeIdB) {
+ positionInJourneyB[i] = position;
+ break;
+ }
+ }
+ SHASTA_ASSERT(positionInJourneyB[i] != invalid<uint64_t>);
+ }
+
+
+ // The MarkerGraphEdgeIds that we encountered so far by moving forward from edgeA on
+ // the primary journeys of oriented reads on edgeA.
+ std::set<MarkerGraphEdgeId> edgeIdsForwardA;
+
+ // The MarkerGraphEdgeIds that we encountered so far by moving backward from edgeB on
+ // the primary journeys of oriented reads on edgeB.
+ std::set<MarkerGraphEdgeId> edgeIdsBackwardB;
+
+ // The best edgeId found so far, and the smaller of its two
+ // common oriented read counts (with edgeIdA and with edgeIdB).
+ uint64_t edgeIdBest = invalid<uint64_t>;
+ uint64_t commonBest = 0;
+
+ // Iterate over offsets in the primary journeys.
+ // For journeys of the oriented reads on edgeA, we use positive offsets.
+ // For journeys of the oriented reads on edgeB, we use negative offsets.
+ for(uint64_t offset=1; ; ++offset) {
+
+ uint64_t activeCountA = 0;
+ for(uint64_t i=0; i<markerIntervalsA.size(); i++) {
+ const OrientedReadId orientedReadId = markerIntervalsA[i].orientedReadId;
+ const auto journey = markerGraph.primaryJourneys[orientedReadId.getValue()];
+ const uint64_t position = positionInJourneyA[i] + offset;
+ if(position >= journey.size()) {
+ continue;
+ }
+ ++activeCountA;
+ const MarkerGraphEdgeId edgeId = journey[position].edgeId;
+
+ if(not edgeIdsForwardA.contains(edgeId)) {
+ edgeIdsForwardA.insert(edgeId);
+
+ if(edgeIdsBackwardB.contains(edgeId)) {
+ const uint64_t commonCountA = countCommonOrientedReadsUnsafe(edgeIdA, edgeId);
+ const uint64_t commonCountB = countCommonOrientedReadsUnsafe(edgeId, edgeIdB);
+ const uint64_t commonCountMin = min(commonCountA, commonCountB);
+ if(commonCountMin > commonBest) {
+ edgeIdBest = edgeId;
+ commonBest = commonCountMin;
+ }
+ }
+ }
+ }
+
+ uint64_t activeCountB = 0;
+ for(uint64_t i=0; i<markerIntervalsB.size(); i++) {
+ const OrientedReadId orientedReadId = markerIntervalsB[i].orientedReadId;
+ const auto journey = markerGraph.primaryJourneys[orientedReadId.getValue()];
+ if(offset > positionInJourneyB[i]) {
+ continue;
+ }
+ const uint64_t position = positionInJourneyB[i] - offset;
+ ++activeCountB;
+ const MarkerGraphEdgeId edgeId = journey[position].edgeId;
+
+ if(not edgeIdsBackwardB.contains(edgeId)) {
+ edgeIdsBackwardB.insert(edgeId);
+
+ if(edgeIdsForwardA.contains(edgeId)) {
+ const uint64_t commonCountA = countCommonOrientedReadsUnsafe(edgeIdA, edgeId);
+ const uint64_t commonCountB = countCommonOrientedReadsUnsafe(edgeId, edgeIdB);
+ const uint64_t commonCountMin = min(commonCountA, commonCountB);
+ if(commonCountMin > commonBest) {
+ edgeIdBest = edgeId;
+ commonBest = commonCountMin;
+ }
+ }
+ }
+ }
+
+ if(activeCountA == 0 or activeCountB == 0) {
+ break;
+ }
+ }
+
+ if(commonBest == 0) {
+ return invalid<uint64_t>;
+ }
+
+ // edgeIdBest has common oriented reads with both edgeIdA and edgeIdB.
+ MarkerGraphEdgePairInfo infoA;
+ MarkerGraphEdgePairInfo infoB;
+ SHASTA_ASSERT(analyzeMarkerGraphEdgePair(edgeIdA, edgeIdBest, infoA));
+ SHASTA_ASSERT(analyzeMarkerGraphEdgePair(edgeIdBest, edgeIdB, infoB));
+ SHASTA_ASSERT(infoA.common > 0);
+ SHASTA_ASSERT(infoB.common > 0);
+ return infoA.offsetInBases + infoB.offsetInBases;
+#endif
+}
+
+
+
+// Count the number of common oriented reads between two marker graph edges.
+// This assumes, WITHOUT CHECKING, that each of the two edges has no duplicate
+// oriented reads. This assumption is satisfied for primary marker graph edges
+// in Mode 3 assembly.
+uint64_t Assembler::countCommonOrientedReadsUnsafe(
+ MarkerGraphEdgeId edgeIdA,
+ MarkerGraphEdgeId edgeIdB) const
+{
+ // Prepare for the joint loop over OrientedReadIds of the two edges.
+ const auto markerIntervalsA = markerGraph.edgeMarkerIntervals[edgeIdA];
+ const auto markerIntervalsB = markerGraph.edgeMarkerIntervals[edgeIdB];
+ const auto beginA = markerIntervalsA.begin();
+ const auto beginB = markerIntervalsB.begin();
+ const auto endA = markerIntervalsA.end();
+ const auto endB = markerIntervalsB.end();
+
+
+ // Joint loop over the MarkerIntervals of the two edges
+ // to count the common oriented reads. Unlike analyzeMarkerGraphEdgePair,
+ // no offsets are computed here. This assumes that there are
+ // no duplicate oriented reads on the two edges.
+ uint64_t n = 0;
+ auto itA = beginA;
+ auto itB = beginB;
+ while(itA != endA and itB != endB) {
+
+ if(itA->orientedReadId < itB->orientedReadId) {
+ ++itA;
+ } else if(itB->orientedReadId < itA->orientedReadId) {
+ ++itB;
+ continue;
+ } else {
+ // We found a common OrientedReadId.
+ ++n;
+ ++itA;
+ ++itB;
+ }
+
+ }
+ return n;
+}
+
diff --git a/src/AssemblerMarkers.cpp b/src/AssemblerMarkers.cpp
index 4b7eb9f..7c34b6f 100644
--- a/src/AssemblerMarkers.cpp
+++ b/src/AssemblerMarkers.cpp
@@ -1,7 +1,10 @@
// Shasta.
#include "Assembler.hpp"
+#include "extractKmer.hpp"
#include "findMarkerId.hpp"
#include "MarkerFinder.hpp"
+#include "performanceLog.hpp"
+#include "timestamp.hpp"
using namespace shasta;
// Standard library.
@@ -11,12 +14,12 @@ using namespace shasta;
void Assembler::findMarkers(size_t threadCount)
{
reads->checkReadsAreOpen();
- checkKmersAreOpen();
+ SHASTA_ASSERT(kmerChecker);
markers.createNew(largeDataName("Markers"), largeDataPageSize);
MarkerFinder markerFinder(
assemblerInfo->k,
- kmerTable,
+ *kmerChecker,
getReads(),
markers,
threadCount);
@@ -41,7 +44,7 @@ void Assembler::checkMarkersAreOpen() const
void Assembler::writeMarkers(ReadId readId, Strand strand, const string& fileName)
{
// Check that we have what we need.
- checkKmersAreOpen();
+ SHASTA_ASSERT(kmerChecker);
reads->checkReadsAreOpen();
checkMarkersAreOpen();
reads->checkReadId(readId);
@@ -56,44 +59,25 @@ void Assembler::writeMarkers(ReadId readId, Strand strand, const string& fileNam
for(uint32_t ordinal=0; ordinal<orientedReadMarkers.size(); ordinal++) {
const CompressedMarker& marker = orientedReadMarkers[ordinal];
const MarkerId markerId = getMarkerId(orientedReadId, ordinal);
+ const KmerId kmerId = getOrientedReadMarkerKmerId(orientedReadId, ordinal);
+ const Kmer kmer(kmerId, assemblerInfo->k);
csv << markerId << ",";
csv << ordinal << ",";
- csv << marker.kmerId << ",";
- csv << Kmer(marker.kmerId, assemblerInfo->k) << ",";
+ csv << kmerId << ",";
+ csv << kmer << ",";
csv << marker.position << "\n";
}
}
-vector<KmerId> Assembler::getMarkers(ReadId readId, Strand strand)
-{
- const OrientedReadId orientedReadId(readId, strand);
- const auto orientedReadMarkers = markers[orientedReadId.getValue()];
-
- vector<KmerId> v;
- for(const CompressedMarker& marker: orientedReadMarkers) {
- v.push_back(marker.kmerId);
- }
- return v;
-}
-
-
// Get markers sorted by KmerId for a given OrientedReadId.
void Assembler::getMarkersSortedByKmerId(
OrientedReadId orientedReadId,
vector<MarkerWithOrdinal>& markersSortedByKmerId) const
{
- const auto compressedMarkers = markers[orientedReadId.getValue()];
- markersSortedByKmerId.clear();
- markersSortedByKmerId.resize(compressedMarkers.size());
-
- for(uint32_t ordinal=0; ordinal<compressedMarkers.size(); ordinal++) {
- const CompressedMarker& compressedMarker = compressedMarkers[ordinal];
- markersSortedByKmerId[ordinal] = MarkerWithOrdinal(compressedMarker, ordinal);
- }
-
- // Sort by kmerId.
+ markersSortedByKmerId.resize(markers.size(orientedReadId.getValue()));
+ getOrientedReadMarkers(orientedReadId, markersSortedByKmerId);
sort(markersSortedByKmerId.begin(), markersSortedByKmerId.end());
}
@@ -154,28 +138,482 @@ MarkerId Assembler::findReverseComplement(MarkerId markerId) const
-// Write the frequency of markers in oriented reads.
-void Assembler::writeMarkerFrequency()
+void Assembler::computeMarkerKmerIds(uint64_t threadCount)
{
- const uint64_t k = assemblerInfo->k;
- const uint64_t kmerCount = 1ULL << (2ULL*k);
- SHASTA_ASSERT(markers.isOpen());
- vector<uint64_t> frequency(kmerCount, 0);
+ performanceLog << timestamp << "Gathering marker KmerIds." << endl;
- const CompressedMarker* compressedMarker = markers.begin();
- const CompressedMarker* end = markers.end();
- for(; compressedMarker!=end; ++compressedMarker) {
- ++frequency[compressedMarker->kmerId];
+ // Check that we have what we need.
+ checkMarkersAreOpen();
+ const uint64_t readCount = reads->readCount();
+
+ // Adjust the numbers of threads, if necessary.
+ if(threadCount == 0) {
+ threadCount = std::thread::hardware_concurrency();
+ }
+
+ // Do it.
+ // The layout is identical to that used by the markers.
+ markerKmerIds.createNew(largeDataName("MarkerKmerIds"), largeDataPageSize);
+ for(uint64_t readId=0; readId<readCount; readId++) {
+ const OrientedReadId orientedReadId0(uint32_t(readId), 0);
+ const OrientedReadId orientedReadId1(uint32_t(readId), 1);
+ const uint64_t readMarkerCount = markers.size(orientedReadId0.getValue());
+ SHASTA_ASSERT(markers.size(orientedReadId1.getValue()) == readMarkerCount);
+ for(uint64_t strand=0; strand<2; strand++) {
+ markerKmerIds.appendVector(readMarkerCount);
+ }
+ }
+ markerKmerIds.unreserve();
+ const uint64_t batchSize = 100;
+ setupLoadBalancing(readCount, batchSize);
+ runThreads(&Assembler::computeMarkerKmerIdsThreadFunction, threadCount);
+
+
+
+#if 0
+ // Test the low level functions to extract Kmers/KmerIds.
+ const uint64_t k = assemblerInfo->k;
+ vector<Kmer> kmerVector;
+ vector<KmerId> kmerIdVector;
+ performanceLog << timestamp << "Testing." << endl;
+ for(uint64_t readId=0; readId<readCount; readId++) {
+ for(uint64_t strand=0; strand<2; strand++) {
+
+ const OrientedReadId orientedReadId = OrientedReadId(ReadId(readId), Strand(strand));
+ const auto orientedReadMarkers = markers[orientedReadId.getValue()];
+ const auto orientedReadMarkerKmerIds = markerKmerIds[orientedReadId.getValue()];
+ const uint64_t orientedReadMarkerCount = orientedReadMarkers.size();
+ SHASTA_ASSERT(orientedReadMarkerKmerIds.size() == orientedReadMarkerCount);
+
+ kmerVector.resize(orientedReadMarkerCount);
+ kmerIdVector.resize(orientedReadMarkerCount);
+ const span<Kmer> kmerSpan(kmerVector);
+ const span<KmerId> kmerIdSpan(kmerIdVector);
+
+ getOrientedReadMarkerKmers(orientedReadId, kmerSpan);
+ getOrientedReadMarkerKmerIds(orientedReadId, kmerIdSpan);
+
+ for(uint64_t ordinal=0; ordinal<orientedReadMarkerCount; ordinal++) {
+ SHASTA_ASSERT(kmerVector[ordinal].id(k) == orientedReadMarkers[ordinal].kmerId);
+ SHASTA_ASSERT(kmerIdVector[ordinal] == orientedReadMarkers[ordinal].kmerId);
+
+ SHASTA_ASSERT(kmerVector[ordinal] == getOrientedReadMarkerKmer(orientedReadId, ordinal));
+ SHASTA_ASSERT(kmerIdVector[ordinal] == getOrientedReadMarkerKmerId(orientedReadId, ordinal));
+ }
+ }
}
+#endif
- ofstream csv("MarkerFrequency.csv");
- for(uint64_t kmerId=0; kmerId<kmerCount; kmerId++) {
- const uint64_t n = frequency[kmerId];
- if(n== 0) {
- continue;
+}
+
+
+
+void Assembler::cleanupMarkerKmerIds()
+{
+ markerKmerIds.remove();
+}
+
+
+
+void Assembler::computeMarkerKmerIdsThreadFunction(size_t threadId)
+{
+
+ // Loop over all batches assigned to this thread.
+ uint64_t begin, end;
+ while(getNextBatch(begin, end)) {
+
+ // Loop over reads in this batch.
+ for(uint64_t readId=begin; readId!=end; ++readId) {
+
+ const OrientedReadId orientedReadId0(uint32_t(readId), 0);
+ const OrientedReadId orientedReadId1(uint32_t(readId), 1);
+
+ getReadMarkerKmerIds(
+ ReadId(readId),
+ markerKmerIds[orientedReadId0.getValue()],
+ markerKmerIds[orientedReadId1.getValue()]);
}
- const Kmer kmer(kmerId, k);
- kmer.write(csv, k);
- csv << "," << n << "\n";
}
+
+}
+
+
+
+Kmer Assembler::getOrientedReadMarkerKmer(OrientedReadId orientedReadId, uint32_t ordinal) const
+{
+ const ReadId readId = orientedReadId.getReadId();
+ const Strand strand = orientedReadId.getStrand();
+
+ if(strand == 0) {
+ return getOrientedReadMarkerKmerStrand0(readId, ordinal);
+ } else {
+ return getOrientedReadMarkerKmerStrand1(readId, ordinal);
+ }
+
+}
+
+
+
+Kmer Assembler::getOrientedReadMarkerKmerStrand0(ReadId readId, uint32_t ordinal0) const
+{
+ const uint64_t k = assemblerInfo->k;
+ const auto read = reads->getRead(uint32_t(readId));
+ const OrientedReadId orientedReadId0(readId, 0);
+ const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()];
+
+ Kmer kmer0;
+ extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0);
+
+ return kmer0;
+}
+
+
+
+Kmer Assembler::getOrientedReadMarkerKmerStrand1(ReadId readId, uint32_t ordinal1) const
+{
+ const uint64_t k = assemblerInfo->k;
+
+ // We only have the read stored without reverse complement, so get it from there...
+ const auto read = reads->getRead(uint32_t(readId));
+ const OrientedReadId orientedReadId0(readId, 0);
+ const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()];
+ const uint64_t readMarkerCount = orientedReadMarkers0.size();
+ const uint64_t ordinal0 = readMarkerCount - 1 - ordinal1;
+ Kmer kmer0;
+ extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0);
+
+ // ... then do the reverse complement.
+ const Kmer kmer1 = kmer0.reverseComplement(k);
+ return kmer1;
+}
+
+
+
+KmerId Assembler::getOrientedReadMarkerKmerId(OrientedReadId orientedReadId, uint32_t ordinal) const
+{
+ const Kmer kmer = getOrientedReadMarkerKmer(orientedReadId, ordinal);
+ return KmerId(kmer.id(assemblerInfo->k));
+}
+
+
+
+// Get all marker Kmers for an oriented read.
+void Assembler::getOrientedReadMarkerKmers(
+ OrientedReadId orientedReadId,
+ const span<Kmer>& kmers) const
+{
+ const ReadId readId = orientedReadId.getReadId();
+ const Strand strand = orientedReadId.getStrand();
+
+ if(strand == 0) {
+ getOrientedReadMarkerKmersStrand0(readId, kmers);
+ } else {
+ getOrientedReadMarkerKmersStrand1(readId, kmers);
+ }
+}
+
+
+
+void Assembler::getOrientedReadMarkerKmersStrand0(ReadId readId, const span<Kmer>& kmers0) const
+{
+ const uint64_t k = assemblerInfo->k;
+
+ const auto read = reads->getRead(uint32_t(readId));
+ const OrientedReadId orientedReadId0(readId, 0);
+ const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()];
+ const uint64_t readMarkerCount = orientedReadMarkers0.size();
+ SHASTA_ASSERT(kmers0.size() == readMarkerCount);
+
+ // Loop over all markers.
+ for(uint64_t ordinal0=0; ordinal0<readMarkerCount; ordinal0++) {
+ Kmer kmer0;
+ extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0);
+ kmers0[ordinal0] = kmer0;
+ }
+
+}
+
+
+
+void Assembler::getOrientedReadMarkerKmersStrand1(ReadId readId, const span<Kmer>& kmers1) const
+{
+ const uint64_t k = assemblerInfo->k;
+
+ const auto read = reads->getRead(uint32_t(readId));
+ const OrientedReadId orientedReadId0(readId, 0);
+ const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()];
+ const uint64_t readMarkerCount = orientedReadMarkers0.size();
+ SHASTA_ASSERT(kmers1.size() == readMarkerCount);
+
+ // Loop over all markers.
+ for(uint64_t ordinal0=0; ordinal0<readMarkerCount; ordinal0++) {
+ Kmer kmer0;
+ extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0);
+ const Kmer kmer1 = kmer0.reverseComplement(k);
+ const uint64_t ordinal1 = readMarkerCount - 1 - ordinal0;
+ kmers1[ordinal1] = kmer1;
+ }
+
}
+
+
+
+// Get all marker KmerIds for an oriented read.
+void Assembler::getOrientedReadMarkerKmerIds(
+ OrientedReadId orientedReadId,
+ const span<KmerId>& kmerIds) const
+{
+ const ReadId readId = orientedReadId.getReadId();
+ const Strand strand = orientedReadId.getStrand();
+
+ if(strand == 0) {
+ getOrientedReadMarkerKmerIdsStrand0(readId, kmerIds);
+ } else {
+ getOrientedReadMarkerKmerIdsStrand1(readId, kmerIds);
+ }
+}
+
+
+
+void Assembler::getOrientedReadMarkerKmerIdsStrand0(ReadId readId, const span<KmerId>& kmerIds0) const
+{
+ const uint64_t k = assemblerInfo->k;
+
+ const auto read = reads->getRead(uint32_t(readId));
+ const OrientedReadId orientedReadId0(readId, 0);
+ const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()];
+ const uint64_t readMarkerCount = orientedReadMarkers0.size();
+ SHASTA_ASSERT(kmerIds0.size() == readMarkerCount);
+
+ // Loop over all markers.
+ for(uint64_t ordinal0=0; ordinal0<readMarkerCount; ordinal0++) {
+ Kmer kmer0;
+ extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0);
+ kmerIds0[ordinal0] = KmerId(kmer0.id(k));
+ }
+
+}
+
+
+
+void Assembler::getOrientedReadMarkerKmerIdsStrand1(ReadId readId, const span<KmerId>& kmerIds1) const
+{
+ const uint64_t k = assemblerInfo->k;
+
+ const auto read = reads->getRead(uint32_t(readId));
+ const OrientedReadId orientedReadId0(readId, 0);
+ const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()];
+ const uint64_t readMarkerCount = orientedReadMarkers0.size();
+ SHASTA_ASSERT(kmerIds1.size() == readMarkerCount);
+
+ // Loop over all markers.
+ for(uint64_t ordinal0=0; ordinal0<readMarkerCount; ordinal0++) {
+ Kmer kmer0;
+ extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0);
+ const Kmer kmer1 = kmer0.reverseComplement(k);
+ const uint64_t ordinal1 = readMarkerCount - 1 - ordinal0;
+ kmerIds1[ordinal1] = KmerId(kmer1.id(k));
+ }
+
+}
+
+
+
+void Assembler::getOrientedReadMarkers(
+ OrientedReadId orientedReadId,
+ const span<MarkerWithOrdinal>& markers) const
+{
+ const ReadId readId = orientedReadId.getReadId();
+ const Strand strand = orientedReadId.getStrand();
+
+ if(strand == 0) {
+ getOrientedReadMarkersStrand0(readId, markers);
+ } else {
+ getOrientedReadMarkersStrand1(readId, markers);
+ }
+
+}
+
+
+
+void Assembler::getOrientedReadMarkersStrand0(
+ ReadId readId,
+ const span<MarkerWithOrdinal>& markers0) const
+{
+ const uint64_t k = assemblerInfo->k;
+
+ const auto read = reads->getRead(uint32_t(readId));
+ const OrientedReadId orientedReadId0(readId, 0);
+ const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()];
+ const uint64_t readMarkerCount = orientedReadMarkers0.size();
+ SHASTA_ASSERT(markers0.size() == readMarkerCount);
+
+ // Loop over all markers.
+ for(uint64_t ordinal0=0; ordinal0<readMarkerCount; ordinal0++) {
+ const CompressedMarker& compressedMarker0 = orientedReadMarkers0[ordinal0];
+ const uint32_t position = compressedMarker0.position;
+ Kmer kmer0;
+ extractKmer(read, uint64_t(position), k, kmer0);
+ markers0[ordinal0] = MarkerWithOrdinal(KmerId(kmer0.id(k)), position, uint32_t(ordinal0));
+ }
+
+}
+
+
+
+void Assembler::getOrientedReadMarkersStrand1(
+ ReadId readId,
+ const span<MarkerWithOrdinal>& markers1) const
+{
+ const uint64_t k = assemblerInfo->k;
+
+ const auto read = reads->getRead(uint32_t(readId));
+ const OrientedReadId orientedReadId0(readId, 0);
+ const OrientedReadId orientedReadId1(readId, 1);
+ const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()];
+ const auto orientedReadMarkers1 = markers[orientedReadId1.getValue()];
+ const uint64_t readMarkerCount = orientedReadMarkers0.size();
+ SHASTA_ASSERT(markers1.size() == readMarkerCount);
+
+ // Loop over all markers.
+ for(uint64_t ordinal0=0; ordinal0<readMarkerCount; ordinal0++) {
+ const uint64_t ordinal1 = readMarkerCount - 1 - ordinal0;
+ const CompressedMarker& compressedMarker1 = orientedReadMarkers1[ordinal1];
+ const uint32_t position1 = compressedMarker1.position;
+ Kmer kmer0;
+ extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0);
+ const Kmer kmer1 = kmer0.reverseComplement(k);
+ markers1[ordinal1] = MarkerWithOrdinal(KmerId(kmer1.id(k)), position1, uint32_t(ordinal1));
+ }
+
+}
+
+
+
+// Get all marker Kmers for a read in both orientations.
+void Assembler::getReadMarkerKmers(
+ ReadId readId,
+ const span<Kmer>& kmers0,
+ const span<Kmer>& kmers1) const
+{
+ const uint64_t k = assemblerInfo->k;
+
+ // Access the information we need for this read.
+ const auto read = reads->getRead(uint32_t(readId));
+ const OrientedReadId orientedReadId0(uint32_t(readId), 0);
+ const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()];
+ const uint64_t readMarkerCount = orientedReadMarkers0.size();
+ SHASTA_ASSERT(kmers0.size() == readMarkerCount);
+ SHASTA_ASSERT(kmers1.size() == readMarkerCount);
+
+ // Loop over all markers.
+ for(uint64_t ordinal0=0; ordinal0<readMarkerCount; ordinal0++) {
+
+ // Strand 0.
+ Kmer kmer0;
+ extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0);
+ kmers0[ordinal0] = kmer0;
+
+ // Strand 1.
+ const Kmer kmer1 = kmer0.reverseComplement(k);
+ const uint64_t ordinal1 = readMarkerCount - 1 - ordinal0;
+ kmers1[ordinal1] = kmer1;
+ }
+
+}
+
+
+
+// Get all marker KmerIds for a read in both orientations.
+void Assembler::getReadMarkerKmerIds(
+ ReadId readId,
+ const span<KmerId>& kmerIds0,
+ const span<KmerId>& kmerIds1) const
+{
+ // Get the marker length.
+ const uint64_t k = assemblerInfo->k;
+
+ // Access the information we need for this read.
+ const auto read = reads->getRead(uint32_t(readId));
+ const OrientedReadId orientedReadId0(uint32_t(readId), 0);
+ const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()];
+ const uint64_t readMarkerCount = orientedReadMarkers0.size();
+ SHASTA_ASSERT(kmerIds0.size() == readMarkerCount);
+ SHASTA_ASSERT(kmerIds1.size() == readMarkerCount);
+
+ // Loop over all markers.
+ for(uint64_t ordinal0=0; ordinal0<readMarkerCount; ordinal0++) {
+
+ // Strand 0.
+ Kmer kmer0;
+ extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0);
+ kmerIds0[ordinal0] = KmerId(kmer0.id(k));
+
+ // Strand 1.
+ const Kmer kmer1 = kmer0.reverseComplement(k);
+ const uint64_t ordinal1 = readMarkerCount - 1 - ordinal0;
+ kmerIds1[ordinal1] = KmerId(kmer1.id(k));
+ }
+
+}
+
+
+
+// Get the Kmer for an oriented read at a given marker ordinal.
+Kmer Assembler::getOrientedReadMarkerKmer(OrientedReadId orientedReadId, uint64_t ordinal) const
+{
+ const uint64_t k = assemblerInfo->k;
+
+ const ReadId readId = orientedReadId.getReadId();
+ const Strand strand = orientedReadId.getStrand();
+ const auto read = reads->getRead(readId);
+ const OrientedReadId orientedReadId0(uint32_t(readId), 0);
+ const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()];
+
+ if(strand == 0) {
+
+ const uint64_t ordinal0 = ordinal;
+ Kmer kmer0;
+ extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0);
+ return kmer0;
+
+ } else {
+
+ const uint64_t ordinal0 = orientedReadMarkers0.size() - 1 - ordinal;
+ Kmer kmer0;
+ extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0);
+ return kmer0.reverseComplement(k);
+
+ }
+}
+
+
+
+// Get the KmerId for an oriented read at a given marker ordinal.
+KmerId Assembler::getOrientedReadMarkerKmerId(OrientedReadId orientedReadId, uint64_t ordinal) const
+{
+ const uint64_t k = assemblerInfo->k;
+
+ const ReadId readId = orientedReadId.getReadId();
+ const Strand strand = orientedReadId.getStrand();
+ const auto read = reads->getRead(readId);
+ const OrientedReadId orientedReadId0(uint32_t(readId), 0);
+ const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()];
+
+ if(strand == 0) {
+
+ const uint64_t ordinal0 = ordinal;
+ Kmer kmer0;
+ extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0);
+ return KmerId(kmer0.id(k));
+
+ } else {
+
+ const uint64_t ordinal0 = orientedReadMarkers0.size() - 1 - ordinal;
+ Kmer kmer0;
+ extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0);
+ return KmerId(kmer0.reverseComplement(k).id(k));
+
+ }
+}
+
diff --git a/src/AssemblerMode3.cpp b/src/AssemblerMode3.cpp
index 4c4199f..0990b8b 100644
--- a/src/AssemblerMode3.cpp
+++ b/src/AssemblerMode3.cpp
@@ -1,82 +1,236 @@
+// Shasta
#include "Assembler.hpp"
-#include "mode3.hpp"
-#include "mode3-Detangler.hpp"
-#include "mode3-PathGraph.hpp"
+#include "LocalMarkerGraph1.hpp"
+#include "mode3-LocalAssembly.hpp"
+#include "mode3-AssemblyGraph.hpp"
+#include "mode3-PrimaryGraph.hpp"
+#include "Mode3Assembler.hpp"
#include "Reads.hpp"
using namespace shasta;
-using namespace mode3;
+// Boost libraries.
+#include <boost/graph/iteration_macros.hpp>
+// Standard library.
+#include "fstream.hpp"
+#include <map>
-void Assembler::mode3Assembly(
- size_t threadCount)
-{
- // EXPOSE WHEN CODE STABILIZES.
- // const uint64_t minClusterSize = 3;
- // Adjust the numbers of threads, if necessary.
- if(threadCount == 0) {
- threadCount = std::thread::hardware_concurrency();
- }
- assemblyGraph3Pointer = std::make_shared<mode3::AssemblyGraph>(
- largeDataFileNamePrefix,
- largeDataPageSize,
- threadCount,
- assemblerInfo->readRepresentation,
- assemblerInfo->k,
- *reads,
+void Assembler::flagPrimaryMarkerGraphEdges(
+ uint64_t minPrimaryCoverage,
+ uint64_t maxPrimaryCoverage,
+ uint64_t threadCount)
+{
+ // Check that we have what we need.
+ SHASTA_ASSERT(markers.isOpen());
+ checkMarkerGraphVerticesAreAvailable();
+ SHASTA_ASSERT(markerGraph.edges.isOpenWithWriteAccess);
+ SHASTA_ASSERT(markerGraph.disjointSetsHistogram.isOpen);
+
+ markerGraph.flagPrimaryEdges(
+ minPrimaryCoverage,
+ maxPrimaryCoverage,
markers,
- markerGraph,
- *consensusCaller);
- auto& assemblyGraph3 = *assemblyGraph3Pointer;
- assemblyGraph3.writeGfa("AssemblyGraph");
- // assemblyGraph3.clusterSegments(threadCount, minClusterSize);
- assemblyGraph3.createJaccardGraph(threadCount);
- // assemblyGraph3.assembleJaccardGraphPaths();
- assemblyGraph3.createDeBruijnGraph();
-
+ threadCount);
}
-void Assembler::accessMode3AssemblyGraph()
+void Assembler::mode3Assembly(
+ uint64_t threadCount,
+ const Mode3AssemblyOptions& options,
+ bool debug
+ )
{
- assemblyGraph3Pointer = std::make_shared<mode3::AssemblyGraph>(
- largeDataFileNamePrefix,
- assemblerInfo->readRepresentation,
- assemblerInfo->k,
- *reads, markers, markerGraph, *consensusCaller);
+ mode3Assembler = make_shared<Mode3Assembler>(*this, threadCount, options, debug);
}
-void Assembler::analyzeMode3Subgraph(const vector<uint64_t>& segmentIds)
+
+void Assembler::mode3AssembleComponent(
+ const string& fileName,
+ uint64_t threadCount,
+ const Mode3AssemblyOptions& options,
+ bool assembleSequence,
+ bool debug) const
{
- SHASTA_ASSERT(assemblyGraph3Pointer);
- vector<mode3::AssemblyGraph::AnalyzeSubgraphClasses::Cluster> clusters;
- assemblyGraph3Pointer->analyzeSubgraph(segmentIds, clusters, true);
+ mode3::AssemblyGraph(fileName, *this, threadCount, options, assembleSequence, debug);
}
-void Assembler::createMode3PathGraph()
+// Assemble sequence between two primary edges.
+void Assembler::fillMode3AssemblyPathStep(const vector<string>& request, ostream& html)
{
- SHASTA_ASSERT(assemblyGraph3Pointer);
- const mode3::AssemblyGraph& assemblyGraph = *assemblyGraph3Pointer;
+ // Check that our assumptions are satisfied.
+ if(assemblerInfo->assemblyMode != 3) {
+ throw runtime_error("This is only available for assembly mode 3.");
+ }
+ SHASTA_ASSERT(getReads().representation == 0); // No RLE.
+ SHASTA_ASSERT((assemblerInfo->k % 2) == 0); // Marker length is even.
- mode3::PathGraph pathGraph(assemblyGraph);
+ mode3::LocalAssemblyDisplayOptions options(html);
-}
+ // Get the parameters for the request.
+ uint64_t edgeIdA = invalid<uint64_t>;
+ getParameterValue(request, "edgeIdA", edgeIdA);
+ uint64_t edgeIdB = invalid<uint64_t>;
+ getParameterValue(request, "edgeIdB", edgeIdB);
+ string useAString;
+ const bool useA = getParameterValue(request, "useA", useAString);
-void Assembler::createMode3Detangler()
-{
- SHASTA_ASSERT(assemblyGraph3Pointer);
- const mode3::AssemblyGraph& assemblyGraph = *assemblyGraph3Pointer;
+ string useBString;
+ const bool useB = getParameterValue(request, "useB", useBString);
- mode3::Detangler detangler(assemblyGraph);
+ uint64_t minVertexCoverage = 0;
+ getParameterValue(request, "minVertexCoverage", minVertexCoverage);
-}
+ string showOrientedReadsString;
+ options.showOrientedReads = getParameterValue(request, "showOrientedReads", showOrientedReadsString);
+
+ string showMarkersString;
+ options.showMarkers = getParameterValue(request, "showMarkers", showMarkersString);
+
+ string showGraphString;
+ options.showGraph = getParameterValue(request, "showGraph", showGraphString);
+
+ string showVerticesString;
+ options.showVertices = getParameterValue(request, "showVertices", showVerticesString);
+
+ string showVertexLabelsString;
+ options.showVertexLabels = getParameterValue(request, "showVertexLabels", showVertexLabelsString);
+
+ string showEdgeLabelsString;
+ options.showEdgeLabels = getParameterValue(request, "showEdgeLabels", showEdgeLabelsString);
+
+ string showAssemblyDetailsString;
+ options.showAssemblyDetails = getParameterValue(request, "showAssemblyDetails", showAssemblyDetailsString);
+
+ string showDebugInformationString;
+ options.showDebugInformation = getParameterValue(request, "showDebugInformation", showDebugInformationString);
+
+
+
+ // Write the form.
+ html <<
+ "<form>"
+ "<table>"
+
+ "<tr><th class=left>Edge A<td class=centered>"
+ "<input type=text required name=edgeIdA size=8 style='text-align:center' " <<
+ ((edgeIdA == invalid<uint64_t>) ? "" : ("value='" + to_string(edgeIdA) + "'")) << ">"
+
+ "<tr><th class=left>Edge B<td class=centered>"
+ "<input type=text required name=edgeIdB size=8 style='text-align:center' " <<
+ ((edgeIdB == invalid<uint64_t>) ? "" : ("value='" + to_string(edgeIdB) + "'")) << ">"
+
+ "<tr>"
+ "<th class=left>Use for assembly oriented reads that appear only on edge A"
+ "<td class=centered><input type=checkbox name=useA" <<
+ (useA ? " checked" : "") << ">"
+
+ "<tr>"
+ "<th class=left>Use for assembly oriented reads that appear only on edge B"
+ "<td class=centered><input type=checkbox name=useB" <<
+ (useB ? " checked" : "") << ">"
+ "<tr><th class=left>Minimum vertex coverage<br>(0 = automatic)<td class=centered>"
+ "<input type=text required name=minVertexCoverage size=8 style='text-align:center' "
+ "value='" << minVertexCoverage << "'>"
+
+ "<tr>"
+ "<th class=left>Display the oriented reads"
+ "<td class=centered><input type=checkbox name=showOrientedReads" <<
+ (options.showOrientedReads ? " checked" : "") << ">"
+
+ "<tr>"
+ "<th class=left>Display the markers"
+ "<td class=centered><input type=checkbox name=showMarkers" <<
+ (options.showMarkers ? " checked" : "") << ">"
+
+ "<tr>"
+ "<th class=left>Display the graph"
+ "<td class=centered><input type=checkbox name=showGraph" <<
+ (options.showGraph ? " checked" : "") << ">"
+
+ "<tr>"
+ "<th class=left>Display the vertices"
+ "<td class=centered><input type=checkbox name=showVertices" <<
+ (options.showVertices ? " checked" : "") << ">"
+
+ "<tr>"
+ "<th class=left>Display vertex labels"
+ "<td class=centered><input type=checkbox name=showVertexLabels" <<
+ (options.showVertexLabels ? " checked" : "") << ">"
+
+ "<tr>"
+ "<th class=left>Display edge labels"
+ "<td class=centered><input type=checkbox name=showEdgeLabels" <<
+ (options.showEdgeLabels ? " checked" : "") << ">"
+
+ "<tr>"
+ "<th class=left>Display assembly details"
+ "<td class=centered><input type=checkbox name=showAssemblyDetails" <<
+ (options.showAssemblyDetails ? " checked" : "") << ">"
+
+ "<tr>"
+ "<th class=left>Display debug information"
+ "<td class=centered><input type=checkbox name=showDebugInformation" <<
+ (options.showDebugInformation ? " checked" : "") << ">"
+
+ "</table>"
+ "<br><input type=submit value='Do it'>"
+ "</form>";
+
+
+
+ // If the edge ids are missing, do nothing.
+ if(edgeIdA == invalid<uint64_t> or edgeIdB == invalid<uint64_t>) {
+ return;
+ }
+
+ // Sanity checks on the edge ids.
+ if(edgeIdA >= markerGraph.edges.size()) {
+ throw runtime_error("Marker graph edge " + to_string(edgeIdA) +
+ " is not valid. Maximum valid edge id is " + to_string(markerGraph.edges.size()));
+ }
+ if(edgeIdB >= markerGraph.edges.size()) {
+ throw runtime_error("Marker graph edge " + to_string(edgeIdB) +
+ " is not valid. Maximum valid edge id is " + to_string(markerGraph.edges.size()));
+ }
+
+ // Sanity check that the two edges are distinct.
+ if(edgeIdA == edgeIdB) {
+ html << "<p>Specify two distinct edges.";
+ return;
+ }
+
+ // This analysis can only be done if both edges have no duplicate OrientedReadIds
+ // in their MarkerIntervals.
+ if(markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdA)) {
+ html << "<p>Marker graph edge " << edgeIdA << " has duplicate oriented reads.";
+ return;
+ }
+ if(markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdB)) {
+ html << "<p>Marker graph edge " << edgeIdB << " has duplicate oriented reads.";
+ return;
+ }
+
+ // Check that there are common reads.
+ MarkerGraphEdgePairInfo info;
+ SHASTA_ASSERT(analyzeMarkerGraphEdgePair(edgeIdA, edgeIdB, info));
+ if(info.common == 0) {
+ html << "<p>The two edges have no common oriented reads.";
+ return;
+ }
+
+ // Local assembly for this assembly step.
+ mode3::LocalAssembly localAssembly(*this, edgeIdA, edgeIdB, minVertexCoverage,
+ options,
+ httpServerData.assemblerOptions->assemblyOptions.mode3Options.localAssemblyOptions,
+ useA, useB);
+}
diff --git a/src/AssemblerOptions.cpp b/src/AssemblerOptions.cpp
index abc112e..d4cc7a0 100644
--- a/src/AssemblerOptions.cpp
+++ b/src/AssemblerOptions.cpp
@@ -14,7 +14,11 @@ using namespace shasta;
-// Constructor.
+// Constructor from a command line.
+// If the command line includes a --config option,
+// the specified built-in configuration or configuration file
+// is used to fill the AssemblyOptions,
+// but values specified on the command line take precedence.
AssemblerOptions::AssemblerOptions(int argumentCount, const char** arguments) :
commandLineOnlyOptionsDescription("Options allowed only on the command line"),
configurableOptionsDescription("Options allowed on the command line and in the config file")
@@ -124,6 +128,41 @@ AssemblerOptions::AssemblerOptions(int argumentCount, const char** arguments) :
+// Constructor from a configuration file.
+// This only fills in the configurable options specified in
+// the given configuration file. Command line only options
+// are left at their defaults.
+AssemblerOptions::AssemblerOptions(const string& fileName)
+{
+
+ using boost::program_options::positional_options_description;
+ using boost::program_options::value;
+ using boost::program_options::variables_map;
+ using boost::program_options::command_line_parser;
+
+ addConfigurableOptions();
+
+ ifstream configFile(fileName);
+ if (not configFile) {
+ throw runtime_error("Invalid configuration file " + fileName + " specified.\n");
+ }
+ variables_map variablesMap;
+ store(parse_config_file(configFile, configurableOptionsDescription), variablesMap);
+ notify(variablesMap);
+
+ // Parse MarkerGraph.simplifyMaxLength.
+ markerGraphOptions.parseSimplifyMaxLength();
+
+ // Parse ReadOptions.desiredCoverageString into its numeric value.
+ readsOptions.parseDesiredCoverageString();
+
+    // Unpack the consensus caller and replace the relative path with the absolute
+ // one if necessary.
+ assemblyOptions.parseConsensusCallerString();
+}
+
+
+
// Add non-configurable options to the Boost option description object.
// These are options that can only be used on the command line
// (not in the configuration file).
@@ -252,6 +291,12 @@ void AssemblerOptions::addConfigurableOptions()
"This is done by specifying the O_DIRECT flag when opening "
"input files containing reads.")
+ ("Reads.handleDuplicates",
+ value<string>(&readsOptions.handleDuplicates)->
+ default_value("useOneCopy"),
+ "Controls handling of reads with duplicate names. "
+ "Can be one of: useAllCopies, useOneCopy, useNone, forbid.")
+
("Reads.palindromicReads.skipFlagging",
bool_switch(&readsOptions.palindromicReads.skipFlagging)->
default_value(false),
@@ -327,8 +372,7 @@ void AssemblerOptions::addConfigurableOptions()
("MinHash.version",
value<int>(&minHashOptions.version)->
default_value(0),
- "Controls the version of the LowHash algorithm to use. Can be 0 (default) "
- "or 1.(experimental).")
+ "Controls the version of the LowHash algorithm to use. Must be 0 (default).")
("MinHash.m",
value<int>(&minHashOptions.m)->
@@ -357,12 +401,16 @@ void AssemblerOptions::addConfigurableOptions()
("MinHash.minBucketSize",
value<int>(&minHashOptions.minBucketSize)->
default_value(0),
- "The minimum bucket size to be used by the LowHash algorithm.")
+ "The minimum bucket size to be used by the LowHash algorithm. "
+ "If minBucketSize and maxBucketSize are both 0, they are adjusted automatically "
+ "at each iteration using simple heuristics.")
("MinHash.maxBucketSize",
value<int>(&minHashOptions.maxBucketSize)->
default_value(10),
- "The maximum bucket size to be used by the LowHash algorithm.")
+ "The maximum bucket size to be used by the LowHash algorithm. "
+ "If minBucketSize and maxBucketSize are both 0, they are adjusted automatically "
+ "at each iteration using simple heuristics.")
("MinHash.minFrequency",
value<int>(&minHashOptions.minFrequency)->
@@ -381,7 +429,7 @@ void AssemblerOptions::addConfigurableOptions()
value<int>(&alignOptions.alignMethod)->
default_value(3),
"The alignment method to be used to create the read graph & the marker graph. "
- "0 = old Shasta method, 1 = SeqAn (slow), 3 = banded SeqAn, 4 = new Shasta method (experimental).")
+ "0 = old Shasta method, 1 = SeqAn (slow), 3 = banded SeqAn, 4 and 5 = experimental.")
("Align.maxSkip",
value<int>(&alignOptions.maxSkip)->
@@ -488,6 +536,16 @@ void AssemblerOptions::addConfigurableOptions()
default_value(100),
"Only used for alignment method 4 (experimental).")
+ ("Align.align5.driftRateTolerance",
+ value<double>(&alignOptions.align5DriftRateTolerance)->
+ default_value(0.02),
+ "Maximum allowed drift rate for alignment method 5.")
+
+ ("Align.align5.minBandExtend",
+ value<uint64_t>(&alignOptions.align5MinBandExtend)->
+ default_value(10),
+ "Minimum band extension for alignment method 5.")
+
("ReadGraph.creationMethod",
value<int>(&readGraphOptions.creationMethod)->
default_value(0),
@@ -501,7 +559,7 @@ void AssemblerOptions::addConfigurableOptions()
("ReadGraph.maxChimericReadDistance",
value<int>(&readGraphOptions.maxChimericReadDistance)->
default_value(2),
- "Used for chimeric read detection.")
+ "Used for chimeric read detection. Set to 0 to turn off chimera detection.")
("ReadGraph.strandSeparationMethod",
value<uint64_t>(&readGraphOptions.strandSeparationMethod)->
@@ -599,7 +657,7 @@ void AssemblerOptions::addConfigurableOptions()
default_value(6),
"Minimum edge coverage (number of supporting oriented reads) "
"for a marker graph edge to be created."
- "Experimental. Only used with --Assembly.mode 1.")
+        " Only used with --Assembly.mode 2.")
("MarkerGraph.minEdgeCoveragePerStrand",
value<uint64_t>(&markerGraphOptions.minEdgeCoveragePerStrand)->
@@ -607,7 +665,7 @@ void AssemblerOptions::addConfigurableOptions()
"Minimum edge coverage (number of supporting oriented reads) "
"on each strand "
"for a marker graph edge to be created."
- "Experimental. Only used with --Assembly.mode 1.")
+        " Only used with --Assembly.mode 2.")
("MarkerGraph.allowDuplicateMarkers",
bool_switch(&markerGraphOptions.allowDuplicateMarkers)->
@@ -667,11 +725,6 @@ void AssemblerOptions::addConfigurableOptions()
"corresponding marker graph edges. A cross edge is defined as an edge v0->v1 "
"with out-degree(v0)>1, in-degree(v1)>1.")
- ("MarkerGraph.reverseTransitiveReduction",
- bool_switch(&markerGraphOptions.reverseTransitiveReduction)->
- default_value(false),
- "Perform approximate reverse transitive reduction of the marker graph.")
-
("MarkerGraph.peakFinder.minAreaFraction",
value<double>(&markerGraphOptions.peakFinderMinAreaFraction)->
default_value(0.08),
@@ -924,6 +977,178 @@ void AssemblerOptions::addConfigurableOptions()
default_value(false),
"Suppress output of haploid representation of the assembly (Mode 2 assembly only).")
+ ("Assembly.mode3.minPrimaryCoverage",
+ value<uint64_t>(&assemblyOptions.mode3Options.minPrimaryCoverage)->
+ default_value(0),
+ "Minimum primary coverage. "
+ "If minPrimaryCoverage and maxPrimaryCoverage are both 0, "
+        "they are set automatically to appropriate values using a simple heuristic. "
+ "Only used with --Assembly.mode 3.")
+
+ ("Assembly.mode3.maxPrimaryCoverage",
+ value<uint64_t>(&assemblyOptions.mode3Options.maxPrimaryCoverage)->
+ default_value(0),
+ "Maximum primary coverage. "
+ "If minPrimaryCoverage and maxPrimaryCoverage are both 0, "
+        "they are set automatically to appropriate values using a simple heuristic. "
+ "Only used with --Assembly.mode 3.")
+
+ ("Assembly.mode3.primaryGraph.maxLoss",
+ value<double>(&assemblyOptions.mode3Options.primaryGraphOptions.maxLoss)->
+ default_value(0.1),
+        "Used for weak edge removal in the primary graph. "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.primaryGraph.crossEdgesLowCoverageThreshold",
+ value<uint64_t>(&assemblyOptions.mode3Options.primaryGraphOptions.crossEdgesLowCoverageThreshold)->
+ default_value(1),
+ "Low coverage threshold for cross edge removal in the primary graph. "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.primaryGraph.crossEdgesHighCoverageThreshold",
+ value<uint64_t>(&assemblyOptions.mode3Options.primaryGraphOptions.crossEdgesHighCoverageThreshold)->
+ default_value(3),
+ "High coverage threshold for cross edge removal in the primary graph. "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.assemblyGraph.detangleToleranceLow",
+ value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.detangleToleranceLow)->
+ default_value(0),
+ "Used for detangling of the assembly graph "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.assemblyGraph.detangleToleranceHigh",
+ value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.detangleToleranceHigh)->
+ default_value(2),
+ "Used for detangling of the assembly graph "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.assemblyGraph.epsilon",
+ value<double>(&assemblyOptions.mode3Options.assemblyGraphOptions.epsilon)->
+ default_value(0.1),
+ "Epsilon value for the Bayesian model used for detangling the assembly graph "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.assemblyGraph.minLogP",
+ value<double>(&assemblyOptions.mode3Options.assemblyGraphOptions.minLogP)->
+ default_value(20.),
+ "MinLogP value (in dB) for the Bayesian model used for detangling the assembly graph "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.assemblyGraph.longBubbleThreshold",
+ value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.longBubbleThreshold)->
+ default_value(5000),
+ "Long bubble threshold "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.assemblyGraph.phaseErrorThreshold",
+ value<double>(&assemblyOptions.mode3Options.assemblyGraphOptions.phaseErrorThreshold)->
+ default_value(0.1),
+ "Phase error threshold for phasing "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.assemblyGraph.bubbleErrorThreshold",
+ value<double>(&assemblyOptions.mode3Options.assemblyGraphOptions.bubbleErrorThreshold)->
+ default_value(0.03),
+ "Bubble error threshold for bubble cleanup "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.assemblyGraph.bubbleCleanupMaxOffset",
+ value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.bubbleCleanupMaxOffset)->
+ default_value(1000),
+ "Maximum bubble offset for bubble cleanup "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.assemblyGraph.chainTerminalCommonThreshold",
+ value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.chainTerminalCommonThreshold)->
+ default_value(3),
+ "Used for bubble cleanup "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.assemblyGraph.superbubbleLengthThreshold1",
+ value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.superbubbleLengthThreshold1)->
+ default_value(30000),
+ "Length threshold used for superbubble cleanup "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.assemblyGraph.superbubbleLengthThreshold2",
+ value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.superbubbleLengthThreshold2)->
+ default_value(10000),
+ "Low length threshold used for superbubble removal "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.assemblyGraph.superbubbleLengthThreshold3",
+ value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.superbubbleLengthThreshold3)->
+ default_value(30000),
+ "High length threshold used for superbubble removal "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.assemblyGraph.superbubbleLengthThreshold4",
+ value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.superbubbleLengthThreshold4)->
+ default_value(30000),
+ "Length threshold used for superbubble detangling "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.localAssembly.estimatedOffsetRatio",
+ value<double>(&assemblyOptions.mode3Options.localAssemblyOptions.estimatedOffsetRatio)->
+ default_value(1.1),
+ "For local assembly, the estimated offset between edgeIdA and edgeIdB gets "
+ "extended by this ratio to decide how much to extend reads that only appear in edgeIdA or edgeIdB. "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.localAssembly.vertexSamplingRate",
+ value<double>(&assemblyOptions.mode3Options.localAssemblyOptions.vertexSamplingRate)->
+ default_value(0.8),
+ "Vertex sampling rate for local assembly, used to set minVertexCoverage. "
+ "Only used if minVertexCoverage is 0 on input to mode3::LocalAssembly constructor. "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.localAssembly.matchScore",
+ value<int64_t>(&assemblyOptions.mode3Options.localAssemblyOptions.matchScore)->
+ default_value(6),
+ "Match score for local assembly. (Mode 3 assembly only).")
+
+ ("Assembly.mode3.localAssembly.mismatchScore",
+ value<int64_t>(&assemblyOptions.mode3Options.localAssemblyOptions.mismatchScore)->
+ default_value(-1),
+ "Mismatch score for local assembly. (Mode 3 assembly only).")
+
+ ("Assembly.mode3.localAssembly.gapScore",
+ value<int64_t>(&assemblyOptions.mode3Options.localAssemblyOptions.gapScore)->
+ default_value(-1),
+ "Gap score for local assembly. (Mode 3 assembly only).")
+
+ ("Assembly.mode3.localAssembly.maxSkipBases",
+ value<uint64_t>(&assemblyOptions.mode3Options.localAssemblyOptions.maxSkipBases)->
+ default_value(500),
+ "Number of bases (not markers) that can be skipped by an alignment in local assembly. "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.localAssembly.maxDrift",
+ value<double>(&assemblyOptions.mode3Options.localAssemblyOptions.maxDrift)->
+ default_value(0.005),
+ "The maximum tolerated length drift of each read. "
+ "Used to compute the band for banded alignments in local assembly. "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.localAssembly.minHalfBand",
+ value<uint64_t>(&assemblyOptions.mode3Options.localAssemblyOptions.minHalfBand)->
+ default_value(100),
+ "Minimum half band, in markers, for local assembly. "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.localAssembly.minScoreRatio",
+ value<double>(&assemblyOptions.mode3Options.localAssemblyOptions.minScoreRatio)->
+ default_value(0.7),
+        "Score threshold for discarding alignments in local assembly. "
+ "(Mode 3 assembly only).")
+
+ ("Assembly.mode3.localAssembly.maxMsaLength",
+ value<uint64_t>(&assemblyOptions.mode3Options.localAssemblyOptions.maxMsaLength)->
+ default_value(5000),
+ "Maximum length of a multiple sequence alignment for local assembly. "
+ "(Mode 3 assembly only).")
+
;
}
@@ -950,6 +1175,7 @@ void ReadsOptions::write(ostream& s) const
s << "desiredCoverage = " << desiredCoverageString << "\n";
s << "noCache = " <<
convertBoolToPythonString(noCache) << "\n";
+ s << "handleDuplicates = " << handleDuplicates << "\n";
palindromicReads.write(s);
}
@@ -1009,6 +1235,8 @@ void AlignOptions::write(ostream& s) const
s << "align4.deltaY = " << align4DeltaY << "\n";
s << "align4.minEntryCountPerCell = " << align4MinEntryCountPerCell << "\n";
s << "align4.maxDistanceFromBoundary = " << align4MaxDistanceFromBoundary << "\n";
+ s << "align5.driftRateTolerance = " << align5DriftRateTolerance << "\n";
+ s << "align5.minBandExtend = " << align5MinBandExtend << "\n";
}
@@ -1060,8 +1288,6 @@ void MarkerGraphOptions::write(ostream& s) const
s << "pruneIterationCount = " << pruneIterationCount << "\n";
s << "simplifyMaxLength = " << simplifyMaxLength << "\n";
s << "crossEdgeCoverageThreshold = " << crossEdgeCoverageThreshold << "\n";
- s << "reverseTransitiveReduction = " <<
- convertBoolToPythonString(reverseTransitiveReduction) << "\n";
s << "peakFinder.minAreaFraction = " << peakFinderMinAreaFraction << "\n";
s << "peakFinder.areaStartIndex = " << peakFinderAreaStartIndex << "\n";
@@ -1105,6 +1331,7 @@ void AssemblyOptions::write(ostream& s) const
s << "iterative.bridgeRemovalMaxDistance = " << iterativeBridgeRemovalMaxDistance << "\n";
mode2Options.write(s);
+ mode3Options.write(s);
}
@@ -1134,6 +1361,68 @@ void Mode2AssemblyOptions::write(ostream& s) const
+void Mode3AssemblyOptions::write(ostream& s) const
+{
+ s << "minPrimaryCoverage = " << minPrimaryCoverage << "\n";
+ s << "maxPrimaryCoverage = " << maxPrimaryCoverage << "\n";
+ primaryGraphOptions.write(s);
+ assemblyGraphOptions.write(s);
+ localAssemblyOptions.write(s);
+}
+
+
+
+void Mode3AssemblyOptions::PrimaryGraphOptions::write(ostream& s) const
+{
+ s << "mode3.primaryGraph.maxLoss = " << maxLoss << "\n";
+ s << "mode3.primaryGraph.crossEdgesLowCoverageThreshold = " << crossEdgesLowCoverageThreshold << "\n";
+ s << "mode3.primaryGraph.crossEdgesHighCoverageThreshold = " << crossEdgesHighCoverageThreshold << "\n";
+
+}
+
+
+
+void Mode3AssemblyOptions::AssemblyGraphOptions::write(ostream& s) const
+{
+ s << "mode3.assemblyGraph.detangleToleranceLow = " << detangleToleranceLow << "\n";
+ s << "mode3.assemblyGraph.detangleToleranceHigh = " << detangleToleranceHigh << "\n";
+ s << "mode3.assemblyGraph.epsilon = " << epsilon << "\n";
+ s << "mode3.assemblyGraph.minLogP = " << minLogP << "\n";
+ s << "mode3.assemblyGraph.longBubbleThreshold = " << longBubbleThreshold << "\n";
+ s << "mode3.assemblyGraph.phaseErrorThreshold = " << phaseErrorThreshold << "\n";
+ s << "mode3.assemblyGraph.bubbleErrorThreshold = " << bubbleErrorThreshold << "\n";
+ s << "mode3.assemblyGraph.bubbleCleanupMaxOffset = " << bubbleCleanupMaxOffset << "\n";
+ s << "mode3.assemblyGraph.chainTerminalCommonThreshold = " << chainTerminalCommonThreshold << "\n";
+ s << "mode3.assemblyGraph.superbubbleLengthThreshold1 = " << superbubbleLengthThreshold1 << "\n";
+ s << "mode3.assemblyGraph.superbubbleLengthThreshold2 = " << superbubbleLengthThreshold2 << "\n";
+ s << "mode3.assemblyGraph.superbubbleLengthThreshold3 = " << superbubbleLengthThreshold3 << "\n";
+ s << "mode3.assemblyGraph.superbubbleLengthThreshold4 = " << superbubbleLengthThreshold4 << "\n";
+}
+
+
+
+void Mode3AssemblyOptions::LocalAssemblyOptions::write(ostream& s) const
+{
+ s << "mode3.localAssembly.estimatedOffsetRatio = " << estimatedOffsetRatio << "\n";
+ s << "mode3.localAssembly.vertexSamplingRate = " << vertexSamplingRate << "\n";
+
+ s << "mode3.localAssembly.matchScore = " << matchScore << "\n";
+ s << "mode3.localAssembly.mismatchScore = " << mismatchScore << "\n";
+ s << "mode3.localAssembly.gapScore = " << gapScore << "\n";
+
+ s << "mode3.localAssembly.maxSkipBases = " << maxSkipBases << "\n";
+
+ s << "mode3.localAssembly.maxDrift = " << maxDrift << "\n";
+
+ s << "mode3.localAssembly.minHalfBand = " << minHalfBand << "\n";
+
+ s << "mode3.localAssembly.minScoreRatio = " << minScoreRatio << "\n";
+
+ s << "mode3.localAssembly.maxMsaLength = " << maxMsaLength << "\n";
+}
+
+
+
void AssemblerOptions::write(ostream& s) const
{
readsOptions.write(s);
diff --git a/src/AssemblerOptions.hpp b/src/AssemblerOptions.hpp
index b8b74e9..e1a2f65 100644
--- a/src/AssemblerOptions.hpp
+++ b/src/AssemblerOptions.hpp
@@ -76,6 +76,7 @@ namespace shasta {
class MarkerGraphOptions;
class MinHashOptions;
class Mode2AssemblyOptions;
+ class Mode3AssemblyOptions;
class PalindromicReadOptions;
class ReadsOptions;
class ReadGraphOptions;
@@ -127,6 +128,16 @@ public:
bool noCache;
string desiredCoverageString;
uint64_t desiredCoverage;
+
+ // String to control handling of duplicate reads.
+ // Can be one of:
+ // useAllCopies
+ // useOneCopy
+ // useNone
+ // forbid
+ // See ReadFlags.hpp for the meaning of each option.
+ string handleDuplicates;
+
PalindromicReadOptions palindromicReads;
void write(ostream&) const;
@@ -195,6 +206,8 @@ public:
uint64_t align4DeltaY;
uint64_t align4MinEntryCountPerCell;
uint64_t align4MaxDistanceFromBoundary;
+ double align5DriftRateTolerance;
+ uint64_t align5MinBandExtend;
void write(ostream&) const;
};
@@ -246,7 +259,6 @@ public:
string simplifyMaxLength;
double crossEdgeCoverageThreshold;
vector<size_t> simplifyMaxLengthVector;
- bool reverseTransitiveReduction;
double peakFinderMinAreaFraction;
uint64_t peakFinderAreaStartIndex;
@@ -307,6 +319,104 @@ public:
+// Assembly options that are specific to Mode 3 assembly.
+// See source code in the mode3 namespace
+// (source files with a mode3-) prefix for more information
+class shasta::Mode3AssemblyOptions {
+public:
+
+ uint64_t minPrimaryCoverage;
+ uint64_t maxPrimaryCoverage;
+
+ // Options used to clean up the PrimaryGraph.
+ class PrimaryGraphOptions {
+ public:
+
+ // Parameter to control removal of weak edges.
+ double maxLoss;
+
+ // Parameters to control removal of cross edges.
+ uint64_t crossEdgesLowCoverageThreshold;
+ uint64_t crossEdgesHighCoverageThreshold;
+
+ void write(ostream&) const;
+ };
+ PrimaryGraphOptions primaryGraphOptions;
+
+
+
+ class AssemblyGraphOptions {
+ public:
+
+ // Detangle tolerances.
+ uint64_t detangleToleranceLow;
+ uint64_t detangleToleranceHigh;
+
+ // Bayesian model.
+ double epsilon;
+ double minLogP;
+
+ // Other thresholds used by the mode3::AssemblyGraph
+ uint64_t longBubbleThreshold;
+ double phaseErrorThreshold;
+ double bubbleErrorThreshold;
+ uint64_t bubbleCleanupMaxOffset;
+ uint64_t chainTerminalCommonThreshold;
+ uint64_t superbubbleLengthThreshold1;
+ uint64_t superbubbleLengthThreshold2;
+ uint64_t superbubbleLengthThreshold3;
+ uint64_t superbubbleLengthThreshold4;
+
+ void write(ostream&) const;
+ };
+ AssemblyGraphOptions assemblyGraphOptions;
+
+
+
+ // Options used by class mode3::LocalAssembly
+ class LocalAssemblyOptions {
+ public:
+
+ // The estimated offset gets extended by this ratio to
+ // decide how much to extend reads that only appear in edgeIdA or edgeIdB.
+ double estimatedOffsetRatio;
+
+ // Vertex sampling rate, used to set minVertexCoverage.
+ // Only used if minVertexCoverage is 0 on input to
+ // mode3::LocalAssembly constructor.
+ double vertexSamplingRate;
+
+ // Alignment parameters.
+ int64_t matchScore;
+ int64_t mismatchScore;
+ int64_t gapScore;
+
+ // Number of bases (not markers) that can be skipped by an alignment.
+ uint64_t maxSkipBases;
+
+ // The maximum tolerated length drift of each read.
+ // Used to compute the band for banded alignments.
+ double maxDrift;
+
+ // Minimum half band, in markers.
+ uint64_t minHalfBand;
+
+        // Minimum ratio of score to best possible score for
+ // an alignment to be used.
+ double minScoreRatio;
+
+ // The maximum length of an MSA alignment we are willing to compute.
+ uint64_t maxMsaLength;
+
+ void write(ostream&) const;
+ };
+ LocalAssemblyOptions localAssemblyOptions;
+
+ void write(ostream&) const;
+};
+
+
+
// Options in the [Assembly] section of the configuration file.
// Can also be entered on the command line with option names
// beginning with "Assembly.".
@@ -343,6 +453,9 @@ public:
// Mode 2 assembly options.
Mode2AssemblyOptions mode2Options;
+ // Mode 3 assembly options.
+ Mode3AssemblyOptions mode3Options;
+
void write(ostream&) const;
// If a relative path is provided for a Bayesian consensus caller
@@ -365,9 +478,19 @@ public:
MarkerGraphOptions markerGraphOptions;
AssemblyOptions assemblyOptions;
- // Constructor.
+ // Constructor from a command line.
+ // If the command line includes a --config option,
+ // the specified built-in configuration or configuration file
+ // is used to fill the AssemblyOptions,
+ // but values specified on the command line take precedence.
AssemblerOptions(int argumentCount, const char** arguments);
+ // Constructor from a configuration file.
+ // This only fills in the configurable options specified in
+ // the given configuration file. Command line only options
+ // are left at their defaults.
+ AssemblerOptions(const string& fileName);
+
// Add configurable options to the Boost option description object.
void addCommandLineOnlyOptions();
void addConfigurableOptions();
diff --git a/src/AssemblerReadGraph.cpp b/src/AssemblerReadGraph.cpp
index 5389a68..70c4125 100644
--- a/src/AssemblerReadGraph.cpp
+++ b/src/AssemblerReadGraph.cpp
@@ -642,8 +642,7 @@ void Assembler::computeReadGraphConnectedComponents() const
componentMap[componentId].push_back(orientedReadId);
}
}
- cout << "The read graph has " << componentMap.size() <<
- " connected components." << endl;
+ // cout << "The read graph has " << componentMap.size() << " connected components." << endl;
@@ -1215,8 +1214,7 @@ void Assembler::flagCrossStrandReadGraphEdges2()
componentMap[componentId].push_back(orientedReadId);
}
}
- cout << "The read graph has " << componentMap.size() <<
- " connected components." << endl;
+ // cout << "The read graph has " << componentMap.size() << " connected components." << endl;
diff --git a/src/AssemblerReads.cpp b/src/AssemblerReads.cpp
index 22cbc79..da78c56 100644
--- a/src/AssemblerReads.cpp
+++ b/src/AssemblerReads.cpp
@@ -297,3 +297,13 @@ void Assembler::computeReadIdsSortedByName()
reads->computeReadIdsSortedByName();
}
+
+
+
+// Find duplicate reads, as determined by name (not sequence).
+// This also sets the isDuplicate and discardDueToDuplicates read flags
+// and summarizes what it found Duplicates.csv.
+void Assembler::findDuplicateReads(const string& handleDuplicates)
+{
+ reads->findDuplicates(handleDuplicates);
+}
diff --git a/src/AssemblyGraph.cpp b/src/AssemblyGraph.cpp
index 38cd478..9bbbae1 100644
--- a/src/AssemblyGraph.cpp
+++ b/src/AssemblyGraph.cpp
@@ -1,6 +1,7 @@
#include "AssemblyGraph.hpp"
#include "deduplicate.hpp"
using namespace shasta;
+using namespace mode0;
#include "fstream.hpp"
#include "iterator.hpp"
diff --git a/src/AssemblyGraph.hpp b/src/AssemblyGraph.hpp
index bffee58..38d7df1 100644
--- a/src/AssemblyGraph.hpp
+++ b/src/AssemblyGraph.hpp
@@ -25,12 +25,14 @@ vertex in the assembly graph.
#include <limits>
namespace shasta {
- class AssemblyGraph;
+ namespace mode0 {
+ class AssemblyGraph;
+ }
}
-class shasta::AssemblyGraph {
+class shasta::mode0::AssemblyGraph {
public:
// Use the same vertex and edge ids of the marker graph.
diff --git a/src/AssemblyGraph2.cpp b/src/AssemblyGraph2.cpp
index 5b41dfa..07b9d89 100644
--- a/src/AssemblyGraph2.cpp
+++ b/src/AssemblyGraph2.cpp
@@ -45,6 +45,7 @@ AssemblyGraph2::AssemblyGraph2(
uint64_t readRepresentation,
uint64_t k, // Marker length
const MemoryMapped::Vector<ReadFlags>& readFlags,
+ const Reads& reads,
const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
MarkerGraph& markerGraph,
uint64_t pruneLength,
@@ -57,6 +58,7 @@ AssemblyGraph2::AssemblyGraph2(
readRepresentation(readRepresentation),
k(k),
readFlags(readFlags),
+ reads(reads),
markers(markers),
markerGraph(markerGraph)
{
@@ -649,7 +651,7 @@ AssemblyGraph2::edge_descriptor AssemblyGraph2::addEdge(
(*this)[e].storeReadInformation(markerGraph);
}
if(assemble) {
- AssemblyGraph2::assemble(e);
+ AssemblyGraph2::assemble(e, reads);
}
return e;
@@ -744,7 +746,7 @@ void AssemblyGraph2::assemble()
// Use assembled sequence from the marker graph to obtain
// assembled sequence for all edges.
BGL_FORALL_EDGES(e, g, G) {
- assemble(e);
+ assemble(e, reads);
}
performanceLog << timestamp << "AssemblyGraph2::assemble ends." << endl;
@@ -785,7 +787,7 @@ void AssemblyGraph2::assembleThreadFunction(size_t threadId)
// Loop over all edges in this batch.
for(uint64_t i=begin; i!=end; i++) {
const edge_descriptor e = assembleParallelData.allEdges[i];
- assemble(e);
+ assemble(e, reads);
}
}
}
@@ -793,7 +795,7 @@ void AssemblyGraph2::assembleThreadFunction(size_t threadId)
// Assemble sequence for every marker graph path of a given edge.
-void AssemblyGraph2::assemble(edge_descriptor e)
+void AssemblyGraph2::assemble(edge_descriptor e, const Reads& reads)
{
G& g = *this;
@@ -807,7 +809,7 @@ void AssemblyGraph2::assemble(edge_descriptor e)
MarkerGraph::EdgeId const * const end = begin + path.size();
const span<const MarkerGraph::EdgeId> pathSpan(begin, end);
assembleMarkerGraphPath(readRepresentation, k,
- markers, markerGraph, pathSpan, false, assembledSegment);
+ reads, markers, markerGraph, pathSpan, false, assembledSegment);
@@ -2058,43 +2060,6 @@ uint64_t AssemblyGraph2Edge::countCommonSuffixBases() const
-
-// Figure out if this is a bubble is caused by copy number
-// differences in repeats of period up to maxPeriod.
-// If this is the case, returns the shortest period for which this is true.
-// Otherwise, returns 0.
-void AssemblyGraph2Edge::computeCopyNumberDifferencePeriod(uint64_t maxPeriod)
-{
- if(not isBubble()) {
- period = 0;
- }
-
- // Check all pairs of branches.
- vector<uint64_t> periods;
- for(uint64_t i=0; i<branches.size()-1; i++) {
- const vector<Base>& iSequence = branches[i].rawSequence;
- for(uint64_t j=i+1; j<branches.size(); j++) {
- const vector<Base>& jSequence = branches[j].rawSequence;
- const uint64_t pairPeriod = shasta::isCopyNumberDifference(iSequence, jSequence, maxPeriod);
- if(pairPeriod == 0) {
- period = 0;
- return;
- }
- periods.push_back(pairPeriod);
- }
- }
- deduplicate(periods);
-
-
- if(periods.size() == 1) {
- period = periods.front();
- } else {
- period = 0;
- }
-}
-
-
-
// Compute the edit distance between the sequences of the two branches.
// This can only be called for a diploid bubble (2 branches).
uint64_t AssemblyGraph2Edge::bubbleEditDistance() const
@@ -2686,7 +2651,7 @@ AssemblyGraph2::edge_descriptor AssemblyGraph2::mergeWithPreviousIfPossible(edge
newBranch.storeReadInformation(markerGraph);
// Compute sequence for the updated edge.
- assemble(eNew);
+ assemble(eNew, reads);
// Remove the edges we are merging.
boost::remove_edge(e, g);
@@ -2754,7 +2719,7 @@ AssemblyGraph2::edge_descriptor AssemblyGraph2::mergeWithFollowingIfPossible(edg
newBranch.storeReadInformation(markerGraph);
// Compute sequence for the updated edge.
- assemble(eNew);
+ assemble(eNew, reads);
// Remove the edges we are merging.
boost::remove_edge(e, g);
@@ -3725,7 +3690,7 @@ void AssemblyGraph2::handleSuperbubble1(
g[eNew].storeReadInformation(markerGraph);
}
if(assemble) {
- AssemblyGraph2::assemble(eNew);
+ AssemblyGraph2::assemble(eNew, reads);
}
}
diff --git a/src/AssemblyGraph2.hpp b/src/AssemblyGraph2.hpp
index d475659..2b372d8 100644
--- a/src/AssemblyGraph2.hpp
+++ b/src/AssemblyGraph2.hpp
@@ -214,11 +214,6 @@ public:
uint64_t backwardTransferCount = 0;
uint64_t forwardTransferCount = 0;
- // Figure out if this is a bubble is caused by copy number
- // differences in repeats of period up to maxPeriod.
- // If this is the case, stores the shortest period for which this is true.
- // Otherwise, stores 0 as the period.
- void computeCopyNumberDifferencePeriod(uint64_t maxPeriod);
uint64_t period = 0;
string color(uint64_t branchId) const;
@@ -258,6 +253,7 @@ public:
uint64_t readRepresentation,
uint64_t k, // Marker length
const MemoryMapped::Vector<ReadFlags>& readFlags,
+ const Reads& reads,
const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
MarkerGraph&,
uint64_t pruneLength,
@@ -314,6 +310,7 @@ private:
uint64_t readRepresentation;
uint64_t k;
const MemoryMapped::Vector<ReadFlags>& readFlags;
+ const Reads& reads;
const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers;
public:
uint64_t getReadCount() const
@@ -447,7 +444,7 @@ private:
AssembleParallelData assembleParallelData;
// Assemble sequence for every marker graph path of a given edge.
- void assemble(edge_descriptor);
+ void assemble(edge_descriptor, const Reads&);
// Store GFA sequence in each edge.
void storeGfaSequence();
diff --git a/src/AssemblyPathGraph.cpp b/src/AssemblyPathGraph.cpp
index c482831..792b586 100644
--- a/src/AssemblyPathGraph.cpp
+++ b/src/AssemblyPathGraph.cpp
@@ -3,6 +3,7 @@
#include "deduplicate.hpp"
#include "html.hpp"
using namespace shasta;
+using namespace mode0;
// Boost libraries.
#include <boost/graph/graphviz.hpp>
diff --git a/src/AssemblyPathGraph.hpp b/src/AssemblyPathGraph.hpp
index 8cc8d56..e863f7f 100644
--- a/src/AssemblyPathGraph.hpp
+++ b/src/AssemblyPathGraph.hpp
@@ -128,15 +128,14 @@ namespace shasta {
ostream&,
const AssemblyPathGraphEdge&);
- class AssemblyGraph;
}
class shasta::AssemblyPathGraphVertex {
public:
- AssemblyGraph::VertexId vertexId;
- AssemblyPathGraphVertex(AssemblyGraph::VertexId vertexId) :
+ mode0::AssemblyGraph::VertexId vertexId;
+ AssemblyPathGraphVertex(mode0::AssemblyGraph::VertexId vertexId) :
vertexId(vertexId) {}
AssemblyPathGraphBaseClass::vertex_descriptor reverseComplementVertex =
@@ -149,7 +148,7 @@ class shasta::AssemblyPathGraphEdge {
public:
// The AsssemblyGraph path corresponding to this edge.
- vector <AssemblyGraph::EdgeId> path;
+ vector <mode0::AssemblyGraph::EdgeId> path;
// The length of the path, as measured on the marker graph.
uint64_t pathLength = 0;
@@ -172,7 +171,7 @@ public:
// Initialize the path to a single AssemblyGraph edge.
- AssemblyPathGraphEdge(AssemblyGraph::EdgeId edgeId) :
+ AssemblyPathGraphEdge(mode0::AssemblyGraph::EdgeId edgeId) :
path(1, edgeId) {}
AssemblyPathGraphEdge() {}
@@ -261,7 +260,7 @@ public:
// The constructor does not fill in the oriented read ids for each edge.
// This must be done separately (see Assembler::detangle).
- AssemblyPathGraph(const AssemblyGraph&);
+ AssemblyPathGraph(const mode0::AssemblyGraph&);
// The tangles currently present in the graph, keyed by their ids.
TangleId nextTangleId = 0;
@@ -273,7 +272,7 @@ public:
void fillReverseComplementNewEdges(
const vector<edge_descriptor>& newEdges,
- const AssemblyGraph&);
+ const mode0::AssemblyGraph&);
// Initial creation of all tangles.
void createTangles();
@@ -302,7 +301,7 @@ public:
// for GFA output.
void detangle(
double basesPerMarker,
- const AssemblyGraph&);
+ const mode0::AssemblyGraph&);
// Detangle a single tangle.
// This does not fill in the reverseComplementEdge of newly created edges,
diff --git a/src/AssemblyPathGraph2.cpp b/src/AssemblyPathGraph2.cpp
index 00c8cd4..497b242 100644
--- a/src/AssemblyPathGraph2.cpp
+++ b/src/AssemblyPathGraph2.cpp
@@ -3,6 +3,7 @@
#include "deduplicate.hpp"
#include "html.hpp"
using namespace shasta;
+using namespace mode0;
// Boost libraries.
#include <boost/graph/graphviz.hpp>
diff --git a/src/AssemblyPathGraph2.hpp b/src/AssemblyPathGraph2.hpp
index 1c8ae26..786c818 100644
--- a/src/AssemblyPathGraph2.hpp
+++ b/src/AssemblyPathGraph2.hpp
@@ -132,15 +132,17 @@ namespace shasta {
ostream&,
const AssemblyPathGraph2Edge&);
- class AssemblyGraph;
+ namespace mode0 {
+ class AssemblyGraph;
+ }
}
class shasta::AssemblyPathGraph2Vertex {
public:
- AssemblyGraph::VertexId vertexId;
- AssemblyPathGraph2Vertex(AssemblyGraph::VertexId vertexId) :
+ mode0::AssemblyGraph::VertexId vertexId;
+ AssemblyPathGraph2Vertex(mode0::AssemblyGraph::VertexId vertexId) :
vertexId(vertexId) {}
AssemblyPathGraph2BaseClass::vertex_descriptor reverseComplementVertex =
@@ -153,7 +155,7 @@ class shasta::AssemblyPathGraph2Edge {
public:
// The AsssemblyGraph path corresponding to this edge.
- vector <AssemblyGraph::EdgeId> path;
+ vector <mode0::AssemblyGraph::EdgeId> path;
// The length of the path, as measured on the marker graph.
uint64_t pathLength = 0;
@@ -176,7 +178,7 @@ public:
// Initialize the path to a single AssemblyGraph edge.
- AssemblyPathGraph2Edge(AssemblyGraph::EdgeId edgeId) :
+ AssemblyPathGraph2Edge(mode0::AssemblyGraph::EdgeId edgeId) :
path(1, edgeId) {}
AssemblyPathGraph2Edge() {}
@@ -282,7 +284,7 @@ public:
// The constructor does not fill in the oriented read ids for each edge.
// This must be done separately (see Assembler::detangle2).
AssemblyPathGraph2(
- const AssemblyGraph&,
+ const mode0::AssemblyGraph&,
uint64_t diagonalReadCountMin,
uint64_t offDiagonalReadCountMax,
double detangleOffDiagonalRatio);
@@ -302,7 +304,7 @@ public:
void fillReverseComplementNewEdges(
const vector<edge_descriptor>& newEdges,
- const AssemblyGraph&);
+ const mode0::AssemblyGraph&);
// Initial creation of all tangles.
void createTangles();
@@ -331,7 +333,7 @@ public:
// for GFA output.
void detangle(
double basesPerMarker,
- const AssemblyGraph&);
+ const mode0::AssemblyGraph&);
// Detangle a single tangle.
// This does not fill in the reverseComplementEdge of newly created edges,
diff --git a/src/CompressedAssemblyGraph.cpp b/src/CompressedAssemblyGraph.cpp
index d120643..5e5767e 100644
--- a/src/CompressedAssemblyGraph.cpp
+++ b/src/CompressedAssemblyGraph.cpp
@@ -10,6 +10,7 @@
#include "runCommandWithTimeout.hpp"
#include "subgraph.hpp"
using namespace shasta;
+using namespace mode0;
// Boost libraries.
#include <boost/algorithm/string.hpp>
diff --git a/src/CompressedAssemblyGraph.hpp b/src/CompressedAssemblyGraph.hpp
index e74a3cf..ca7e6ba 100644
--- a/src/CompressedAssemblyGraph.hpp
+++ b/src/CompressedAssemblyGraph.hpp
@@ -36,9 +36,9 @@ namespace shasta {
class shasta::CompressedAssemblyGraphVertex {
public:
- AssemblyGraph::VertexId vertexId;
+ mode0::AssemblyGraph::VertexId vertexId;
- CompressedAssemblyGraphVertex(AssemblyGraph::VertexId vertexId) :
+ CompressedAssemblyGraphVertex(mode0::AssemblyGraph::VertexId vertexId) :
vertexId(vertexId) {}
};
@@ -51,10 +51,10 @@ public:
// with this edge.
// This includes the assembly graph vertices
// associated with the source and target of this edge.
- vector<AssemblyGraph::VertexId> vertices;
+ vector<mode0::AssemblyGraph::VertexId> vertices;
// The chain of sets of parallel assembly graph edges.
- vector< vector<AssemblyGraph::EdgeId> > edges;
+ vector< vector<mode0::AssemblyGraph::EdgeId> > edges;
// An id assigned to this edge of the compressed assembly graph
// and used in gfa and other output.
@@ -68,7 +68,7 @@ public:
{
return double(minMarkerCount + maxMarkerCount) / 2.;
}
- void fillMarkerCounts(const AssemblyGraph&);
+ void fillMarkerCounts(const mode0::AssemblyGraph&);
// Find the oriented reads that appear in marker graph vertices
// internal to this edge of the compressed assembly graph.
@@ -99,8 +99,8 @@ private:
class shasta::CompressedAssemblyGraph :
public CompressedAssemblyGraphBaseClass {
public:
- using VertexId = AssemblyGraph::VertexId;
- using EdgeId = AssemblyGraph::EdgeId;
+ using VertexId = mode0::AssemblyGraph::VertexId;
+ using EdgeId = mode0::AssemblyGraph::EdgeId;
// Create the CompressedAssemblyGraph from the AssemblyGraph.
@@ -180,7 +180,7 @@ private:
// Create an edge for each set of parallel edges of the assembly graph.
void createEdges(
- const AssemblyGraph&,
+ const mode0::AssemblyGraph&,
const vector<vertex_descriptor>& vertexTable
);
@@ -195,10 +195,10 @@ private:
// Fill in the assembly graph edges that go into each
// edge of the compressed assembly graph.
- void fillContributingEdges(const AssemblyGraph&);
+ void fillContributingEdges(const mode0::AssemblyGraph&);
// Fill in minimum and maximum marker counts for each edge.
- void fillMarkerCounts(const AssemblyGraph&);
+ void fillMarkerCounts(const mode0::AssemblyGraph&);
// Find the oriented reads that appear in marker graph vertices
// internal to each edge of the compressed assembly graph.
diff --git a/src/ConfigurationTable.cpp b/src/ConfigurationTable.cpp
index 8e69180..7e8d064 100644
--- a/src/ConfigurationTable.cpp
+++ b/src/ConfigurationTable.cpp
@@ -1609,6 +1609,51 @@ mode2.bubbleRemoval.minConcordantReadCount = 2
+)zzz"},
+ {"Nanopore-ncm23-May2024", R"zzz(# This assembly configuration is for nanopore reads generated using the
+# "Experimental extremely high-accuracy, ultra-long sequencing kit"
+# from the ONT December 2023 data release:
+# https://labs.epi2me.io/gm24385_ncm23_preview/
+
+# It uses Mode 3 assembly to create a phased assembly.
+# It was only tested for a human genome at coverage 40x to 60x,
+# but it should work at lower or higher coverage,
+# within reasonable limits, because it includes some
+# provisions for coverage adaptivity.
+
+[Reads]
+representation = 0
+minReadLength = 10000
+noCache = True
+palindromicReads.deltaThreshold = 300
+
+[Kmers]
+k = 30
+probability = 0.05
+
+[MinHash]
+minHashIterationCount = 50
+minBucketSize = 0
+maxBucketSize = 0
+minFrequency = 5
+
+[Align]
+alignMethod = 5
+sameChannelReadAlignment.suppressDeltaThreshold = 30
+minAlignedMarkerCount = 1000
+minAlignedFraction = 0.9
+maxSkip = 20
+maxDrift = 10
+maxTrim = 20
+
+[ReadGraph]
+maxAlignmentCount = 20
+strandSeparationMethod = 2
+
+[Assembly]
+mode = 3
+
+
)zzz"}
};
}
diff --git a/src/HashedKmerChecker.cpp b/src/HashedKmerChecker.cpp
new file mode 100644
index 0000000..6fc3c03
--- /dev/null
+++ b/src/HashedKmerChecker.cpp
@@ -0,0 +1,118 @@
+// Shasta.
+#include "HashedKmerChecker.hpp"
+#include "Kmer.hpp"
+#include "MemoryMappedObject.hpp"
+using namespace shasta;
+
+// MurmurHash.
+#include "MurmurHash2.hpp"
+
+// Standard library.
+#include <cmath>
+
+
+
+// We must guarantee that if a KmerId is a marker
+// its reverse complement is also a marker.
+// To do this we check both.
+// This will usually require two calls to MurmurHash2,
+// but this is probably still faster than two cache misses
+// in the old k-mer table.
+bool HashedKmerChecker::isMarker(KmerId kmerId) const
+{
+ // Check the KmerId.
+ if(MurmurHash2(&kmerId, sizeof(kmerId), 267457831) < hashThreshold) {
+ return true;
+ }
+
+ // Check its reverse complement.
+ const Kmer kmer(kmerId, k);
+ const Kmer kmerRc = kmer.reverseComplement(k);
+ const KmerId kmerIdRc = KmerId(kmerRc.id(k));
+ return MurmurHash2(&kmerIdRc, sizeof(kmerId), 267457831) < hashThreshold;
+}
+
+
+
+// Initial creation.
+HashedKmerChecker::HashedKmerChecker(
+ uint64_t k,
+ double markerDensity,
+ const MappedMemoryOwner& mappedMemoryOwner) :
+ MappedMemoryOwner(mappedMemoryOwner),
+ k(k)
+{
+ // Sanity check on the marker density.
+ if(markerDensity<0. || markerDensity>1.) {
+ throw runtime_error("Invalid marker density " +
+ to_string(markerDensity) + " requested.");
+ }
+
+
+
+ // Compute the hash threshold that achieves the required marker density.
+
+ // In this computation, we neglect self-complementary k-mers,
+ // which are a small minority of total.
+
+ // Call:
+ // - hashMax the maximum possible value of a hash
+ // - hashValue the hash value for a given KmerId
+ // - hashValueRc the hash value for its reverse complement (for length k)
+ // - p = hashThreshold / hashMax
+
+ // A KmerId is a marker if
+ // Event A: hashValue < hashThreshold
+ // OR
+ // Event B: hashValueRc < hashThreshold
+ // Event A occurs with probability P(A) = hashThreshold / hashMax = p.
+ // Event B also occurs with probability P(B) = hashThreshold / hashMax = p.
+
+ // If we use a good hash function, we can consider A and B uncorrelated.
+ // Therefore we can use the standard formula:
+ // P(A or B) = 1 - P(not(A or B)) =
+ // 1 - P((not A) and (not B)) =
+ // 1 - (P(not A)) * P(not B)) =
+ // 1 - (1 - P(A)) * (1 - P(B))
+ // It can also be verified by simple algebra that this is equal to the standard formula
+ // P(A or B) =
+ // P(A) + P(B) - P(A and B) =
+ // P(A) + P(B) - P(A) * P(B)
+ // but we don't need this part.
+
+ // Using the above we get:
+ // markerDensity =
+ // P(A or B) =
+ // 1 - (1 - P(A)) * (1 - P(B)) ==
+ // 1 - (1 - p)^2
+ // (Because P(A) = P(B) = p).
+ // From
+ // markerDensity = 1 - (1 - p)^2
+ // we get
+ // p = 1 - sqrt(1 - markerDensity)
+ // And finally hashThreshold = hashMax * p.
+
+ const double p = 1. - std::sqrt(1. - markerDensity);
+ const double hashMax = std::numeric_limits<uint32_t> :: max();
+ hashThreshold = uint32_t(std::round(double(hashMax) * p));
+
+ // Store k and the hash threshold in binary data.
+ MemoryMapped::Object<HashedKmerCheckerData> data;
+ data.createNew(largeDataName("HashedKmerChecker"), largeDataPageSize);
+ data->k = k;
+ data->hashThreshold = hashThreshold;
+
+}
+
+
+
+// Creation from binary data.
+HashedKmerChecker::HashedKmerChecker(
+ const MappedMemoryOwner& mappedMemoryOwner) :
+ MappedMemoryOwner(mappedMemoryOwner)
+{
+ MemoryMapped::Object<HashedKmerCheckerData> data;
+ data.accessExistingReadOnly(largeDataName("HashedKmerChecker"));
+ k = data->k;
+ hashThreshold = data->hashThreshold;
+}
diff --git a/src/HashedKmerChecker.hpp b/src/HashedKmerChecker.hpp
new file mode 100644
index 0000000..36b1f4c
--- /dev/null
+++ b/src/HashedKmerChecker.hpp
@@ -0,0 +1,43 @@
+#ifndef SHASTA_HASHED_KMER_CHECKER_HPP
+#define SHASTA_HASHED_KMER_CHECKER_HPP
+
+#include "KmerChecker.hpp"
+#include "MappedMemoryOwner.hpp"
+
+namespace shasta {
+ class HashedKmerChecker;
+}
+
+
+// The new implementation of the KmerChecker is not table based
+// and uses hashing instead.
+// It only supports marker generation method 0 (random generation)
+// but allows marker lengths k<32.
+class shasta::HashedKmerChecker :
+ public KmerChecker,
+ public MappedMemoryOwner {
+public:
+ bool isMarker(KmerId) const;
+
+ // Initial creation.
+ HashedKmerChecker(uint64_t k, double markerDensity, const MappedMemoryOwner&);
+
+ // Creation from binary data.
+ HashedKmerChecker(const MappedMemoryOwner&);
+
+private:
+ uint64_t k;
+ uint32_t hashThreshold;
+
+ // This is used to store the hashThreshold in binary data.
+ class HashedKmerCheckerData {
+ public:
+ uint64_t k;
+ uint32_t hashThreshold;
+ };
+};
+
+
+
+#endif
+
diff --git a/src/HttpServer.hpp b/src/HttpServer.hpp
index 186d7a8..51dc138 100644
--- a/src/HttpServer.hpp
+++ b/src/HttpServer.hpp
@@ -114,7 +114,14 @@ public:
}
try {
std::istringstream s(next);
- s >> value;
+ if constexpr(std::is_same_v<string, T>) {
+ // For string use getline to process correctly strings containing spaces.
+ // The constexpr is needed to force evaluation at compile time
+ // (otherwise this branch does not compile for types other than string).
+ std::getline(s, value);
+ } else {
+ s >> value;
+ }
} catch (...) {
return false;
}
diff --git a/src/Kmer.hpp b/src/Kmer.hpp
index aae6cfd..a7da37b 100644
--- a/src/Kmer.hpp
+++ b/src/Kmer.hpp
@@ -10,31 +10,12 @@ namespace shasta {
// Type used to represent a k-mer.
// This limits the maximum k-mer length that can be used.
// If this changes, KmerId must also be changed.
- using Kmer = ShortBaseSequence16;
+ using Kmer16 = ShortBaseSequence16;
+ using Kmer32 = ShortBaseSequence32;
+ using Kmer = Kmer32;
static_assert(
std::numeric_limits<KmerId>::digits == 2*Kmer::capacity,
"Kmer and KmerId types are inconsistent.");
-
- class KmerInfo;
}
-
-
-class shasta::KmerInfo {
-public:
-
- // Frequency of this k-mer in input reads.
- // Only filled in if selectKmersBasedOnFrequency
- // is used.
- uint64_t frequency = 0;
-
- KmerId reverseComplementedKmerId;
- bool isMarker;
- bool isRleKmer;
-
- // Hash function of the KmerId, used for downsampling markers
- // for alignments using method 3.
- uint32_t hash;
-};
-
#endif
diff --git a/src/KmerChecker.hpp b/src/KmerChecker.hpp
new file mode 100644
index 0000000..c7b17fd
--- /dev/null
+++ b/src/KmerChecker.hpp
@@ -0,0 +1,23 @@
+#ifndef SHASTA_KMER_CHECKER_HPP
+#define SHASTA_KMER_CHECKER_HPP
+
+// Shasta.
+#include "shastaTypes.hpp"
+
+namespace shasta {
+ class KmerChecker;
+ class HashedKmerChecker;
+}
+
+
+
+// The KmerChecker is an abstract class that knows how to find
+// out if a k-mer is a marker.
+// All implementations must guarantee that if a KmerId is a marker
+// its reverse complement is also a marker.
+class shasta::KmerChecker {
+public:
+ virtual bool isMarker(KmerId) const = 0;
+};
+
+#endif
diff --git a/src/KmerCheckerFactory.cpp b/src/KmerCheckerFactory.cpp
new file mode 100644
index 0000000..ab22c53
--- /dev/null
+++ b/src/KmerCheckerFactory.cpp
@@ -0,0 +1,118 @@
+#include "KmerCheckerFactory.hpp"
+#include "Kmer.hpp"
+#include "KmerTable.hpp"
+#include "HashedKmerChecker.hpp"
+#include "AssemblerOptions.hpp"
+#include "Reads.hpp"
+using namespace shasta;
+
+
+
+std::shared_ptr<KmerChecker> KmerCheckerFactory::createNew(
+ const KmersOptions& kmersOptions,
+ uint64_t threadCount,
+ const Reads& reads,
+ const MappedMemoryOwner& mappedMemoryOwner)
+{
+ // For generation method 0, always use the HashedKmerChecker.
+ if(kmersOptions.generationMethod == 0) {
+ return make_shared<HashedKmerChecker>(
+ kmersOptions.k,
+ kmersOptions.probability,
+ mappedMemoryOwner);
+ }
+
+ // In all other cases, we are limited to k<=16.
+ if(kmersOptions.k > int(Kmer16::capacity)) {
+ throw runtime_error("Kmer generation method " +
+ to_string(kmersOptions.generationMethod) +
+ " is only supported for a maximum marker length of 15.");
+ }
+
+ const int seed = 231;
+ switch(kmersOptions.generationMethod) {
+ case 0:
+ return make_shared<KmerTable0>(
+ kmersOptions.k,
+ kmersOptions.probability,
+ seed,
+ mappedMemoryOwner);
+
+ case 1:
+ return make_shared<KmerTable1>(
+ kmersOptions.k,
+ kmersOptions.probability,
+ seed,
+ kmersOptions.enrichmentThreshold,
+ reads,
+ threadCount,
+ mappedMemoryOwner);
+
+ case 2:
+ return make_shared<KmerTable2>(
+ kmersOptions.k,
+ kmersOptions.probability,
+ seed,
+ kmersOptions.enrichmentThreshold,
+ reads,
+ threadCount,
+ mappedMemoryOwner);
+
+ case 3:
+ return make_shared<KmerTable3>(
+ kmersOptions.k,
+ reads.representation,
+ kmersOptions.file,
+ mappedMemoryOwner);
+
+ case 4:
+ return make_shared<KmerTable4>(
+ kmersOptions.k,
+ kmersOptions.probability,
+ seed,
+ kmersOptions.distanceThreshold,
+ reads,
+ threadCount,
+ mappedMemoryOwner);
+
+ default:
+ throw runtime_error("Invalid --Kmers generationMethod. "
+ "Specify a value between 0 and 4, inclusive.");
+ }
+}
+
+
+
+std::shared_ptr<shasta::KmerChecker> KmerCheckerFactory::createFromBinaryData(
+ uint64_t k,
+ uint64_t generationMethod,
+ const Reads& reads,
+ const MappedMemoryOwner& mappedMemoryOwner)
+{
+ // For generation method 0, always use the HashedKmerChecker.
+ if(generationMethod == 0) {
+ return make_shared<HashedKmerChecker>(mappedMemoryOwner);
+ }
+
+ switch(generationMethod) {
+ case 0:
+ return make_shared<KmerTable0>(k, mappedMemoryOwner);
+
+ case 1:
+ return make_shared<KmerTable1>(k, reads, mappedMemoryOwner);
+
+ case 2:
+ return make_shared<KmerTable2>(k, reads, mappedMemoryOwner);
+
+ case 3:
+ return make_shared<KmerTable3>(k, mappedMemoryOwner);
+
+ case 4:
+ return make_shared<KmerTable4>(k, reads, mappedMemoryOwner);
+
+
+ default:
+ throw runtime_error("Invalid --Kmers generationMethod. "
+ "Specify a value between 0 and 4, inclusive.");
+ }
+}
diff --git a/src/KmerCheckerFactory.hpp b/src/KmerCheckerFactory.hpp
new file mode 100644
index 0000000..b934727
--- /dev/null
+++ b/src/KmerCheckerFactory.hpp
@@ -0,0 +1,39 @@
+#ifndef SHASTA_KMER_CHECKER_FACTORY_HPP
+#define SHASTA_KMER_CHECKER_FACTORY_HPP
+
+// Shasta.
+#include "KmerChecker.hpp"
+#include "memory.hpp"
+
+namespace shasta {
+ class KmerCheckerFactory;
+
+ class KmerChecker;
+ class KmersOptions;
+ class Reads;
+ class MappedMemoryOwner;
+
+}
+
+
+
+// The KmerCheckerFactory knows how to create the appropriate
+// type of KmerChecker for the options used.
+class shasta::KmerCheckerFactory {
+public:
+
+ static shared_ptr<KmerChecker> createNew(
+ const KmersOptions&,
+ uint64_t threadCount,
+ const Reads&,
+ const MappedMemoryOwner&);
+
+ static shared_ptr<KmerChecker> createFromBinaryData(
+ uint64_t k,
+ uint64_t generationMethod,
+ const Reads&,
+ const MappedMemoryOwner&);
+};
+
+#endif
+
diff --git a/src/AssemblerKmers.cpp b/src/KmerTable.cpp
index 37ce89f..01fd0ad 100644
--- a/src/AssemblerKmers.cpp
+++ b/src/KmerTable.cpp
@@ -1,46 +1,31 @@
// Shasta.
-#include "Assembler.hpp"
+#include "KmerTable.hpp"
+#include "AssemblerOptions.hpp"
#include "deduplicate.hpp"
-#include "MurmurHash2.hpp"
+#include "Kmer.hpp"
#include "Reads.hpp"
-#include "timestamp.hpp"
using namespace shasta;
// Standard library.
#include "fstream.hpp"
#include <random>
-
-
-void Assembler::accessKmers()
-{
- kmerTable.accessExistingReadOnly(largeDataName("Kmers"));
- if(kmerTable.size() != (1ULL<< (2*assemblerInfo->k))) {
- throw runtime_error("Size of k-mer vector is inconsistent with stored value of k.");
- }
-}
-
-void Assembler::checkKmersAreOpen()const
-{
- if(!kmerTable.isOpen) {
- throw runtime_error("Kmers are not accessible.");
- }
-}
+// Explicit template instantiations.
+#include "MultithreadedObject.tpp"
+template class MultithreadedObject<KmerTable1>;
+template class MultithreadedObject<KmerTable2>;
+template class MultithreadedObject<KmerTable4>;
// Randomly select the k-mers to be used as markers.
-void Assembler::randomlySelectKmers(
- size_t k, // k-mer length.
+KmerTable0::KmerTable0(
+ uint64_t k,
double probability, // The probability that a k-mer is selected as a marker.
- int seed // For random number generator.
-)
+ int seed , // For random number generator.
+ const MappedMemoryOwner& mappedMemoryOwner
+ ) : KmerTable(k, true, mappedMemoryOwner)
{
- // Sanity check on the value of k, then store it.
- if(k > Kmer::capacity) {
- throw runtime_error("K-mer capacity exceeded.");
- }
- assemblerInfo->k = k;
// The total number of k-mers of this length.
// This includes both RLE and non-RLE k-mers.
@@ -57,12 +42,6 @@ void Assembler::randomlySelectKmers(
- // Fill in the fields of the k-mer table
- // that depends only on k.
- initializeKmerTable();
-
-
-
// Compute the probability p with which we select
// each k-mer and its reverse complement
// in order to achieve the required k-mer fraction.
@@ -98,57 +77,31 @@ void Assembler::randomlySelectKmers(
}
- // Do some counting.
- uint64_t rleKmerCount = 0;
- uint64_t markerKmerCount = 0;
- uint64_t markerRleKmerCount = 0;
- for(uint64_t kmerId=0; kmerId<kmerCount; kmerId++) {
- const KmerInfo& kmerInfo = kmerTable[kmerId];
- if(kmerInfo.isRleKmer) {
- ++rleKmerCount;
- }
- if(kmerInfo.isMarker) {
- ++markerKmerCount;
- }
- if(kmerInfo.isRleKmer and kmerInfo.isMarker) {
- ++markerRleKmerCount;
- }
- }
-
-
-
- // Summary messages.
- if(assemblerInfo->readRepresentation == 0) {
+}
- // We are using the raw representation of the reads.
- cout << "Total number of k-mers of length " << k << " is " << kmerCount << endl;
- cout << "Of those, " << markerKmerCount << " will be used as markers." << endl;
- cout << "Fraction of k-mers used as markers: requested " <<
- probability << ", actual " <<
- double(markerKmerCount)/double(kmerCount) << "." << endl;
+KmerTable::KmerTable(
+ uint64_t k,
+ bool createNew,
+ const MappedMemoryOwner& mappedMemoryOwner) :
+ MappedMemoryOwner(mappedMemoryOwner), k(k)
+{
+ if(createNew) {
+ createKmerTable();
} else {
-
- // We are using the RLE representation of the reads.
- cout << "Total number of k-mers of length " << k << " is " << kmerCount << endl;
- cout << "Number of RLE k-mers of length " << k << " is " << rleKmerCount << endl;
- cout << "Of those, " << markerRleKmerCount << " will be used as markers." << endl;
- cout << "Fraction of k-mers used as markers: requested " <<
- probability << ", actual " <<
- double(markerRleKmerCount)/double(rleKmerCount) << "." << endl;
-
+ accessKmerTable();
}
-
}
-void Assembler::initializeKmerTable()
+void KmerTable::createKmerTable()
{
+ SHASTA_ASSERT(k <= Kmer16::capacity);
+
// Create the kmer table with the necessary size.
kmerTable.createNew(largeDataName("Kmers"), largeDataPageSize);
- const size_t k = assemblerInfo->k;
const size_t kmerCount = 1ULL << (2ULL*k);
kmerTable.resize(kmerCount);
@@ -156,7 +109,7 @@ void Assembler::initializeKmerTable()
for(uint64_t kmerId=0; kmerId<kmerCount; kmerId++) {
const Kmer kmer(kmerId, k);
const Kmer reverseComplementedKmer = kmer.reverseComplement(k);
- kmerTable[kmerId].reverseComplementedKmerId = KmerId(reverseComplementedKmer.id(k));
+ kmerTable[kmerId].reverseComplementedKmerId = KmerId16(reverseComplementedKmer.id(k));
}
for(uint64_t kmerId=0; kmerId<kmerCount; kmerId++) {
const uint64_t reverseComplementedKmerId = kmerTable[kmerId].reverseComplementedKmerId;
@@ -178,48 +131,23 @@ void Assembler::initializeKmerTable()
}
}
-
- // Fill in hash values used for downsampling.
- for(uint64_t kmerId=0; kmerId<kmerCount; kmerId++) {
- const uint64_t n = kmerId + kmerTable[kmerId].reverseComplementedKmerId;
- kmerTable[kmerId].hash = MurmurHash2(&n, sizeof(n), 13477);
- }
-
}
-void Assembler::writeKmers(const string& fileName) const
+void KmerTable::accessKmerTable()
{
- checkKmersAreOpen();
-
- // Get the k-mer length.
- const size_t k = assemblerInfo->k;
- const size_t kmerCount = 1ULL << (2ULL*k);
- SHASTA_ASSERT(kmerTable.size() == kmerCount);
-
- // Open the output file and write the header line.
- ofstream file(fileName);
- file << "KmerId,Kmer,IsMarker,ReverseComplementedKmerId,ReverseComplementedKmer\n";
-
- // Write a line for each k-mer.
- for(uint64_t kmerId=0; kmerId<kmerCount; kmerId++) {
- file << kmerId << ",";
- file << Kmer(kmerId, k) << ",";
- file << int(kmerTable[kmerId].isMarker) << ",";
- file << kmerTable[kmerId].reverseComplementedKmerId << ",";
- file << Kmer(kmerTable[kmerId].reverseComplementedKmerId, k) << "\n";
- }
+ kmerTable.accessExistingReadOnly(largeDataName("Kmers"));
+ SHASTA_ASSERT(kmerTable.size() == 1ULL << (2ULL*k));
}
// Select marker k-mers randomly, but excluding
// the ones that have high frequency in the reads.
-void Assembler::selectKmersBasedOnFrequency(
+KmerTable1::KmerTable1(
- // k-mer length.
- size_t k,
+ uint64_t k,
// The desired marker density
double markerDensity,
@@ -232,15 +160,16 @@ void Assembler::selectKmersBasedOnFrequency(
// over what a random distribution would give.
double enrichmentThreshold,
- size_t threadCount
-)
-{
+ const Reads& reads,
- // Sanity check on the value of k, then store it.
- if(k > Kmer::capacity) {
- throw runtime_error("K-mer capacity exceeded.");
- }
- assemblerInfo->k = k;
+ size_t threadCount,
+
+ const MappedMemoryOwner& mappedMemoryOwner) :
+
+ KmerTable(k, true, mappedMemoryOwner),
+ MultithreadedObject<KmerTable1>(*this),
+ reads(reads)
+{
// Sanity check.
if(markerDensity<0. || markerDensity>1.) {
@@ -253,13 +182,9 @@ void Assembler::selectKmersBasedOnFrequency(
threadCount = std::thread::hardware_concurrency();
}
- // Fill in the fields of the k-mer table
- // that depends only on k.
- initializeKmerTable();
-
// Compute the frequency of all k-mers in oriented reads.
- setupLoadBalancing(reads->readCount(), 1000);
- runThreads(&Assembler::computeKmerFrequency, threadCount);
+ setupLoadBalancing(reads.readCount(), 1000);
+ runThreads(&KmerTable1::computeKmerFrequency, threadCount);
// Compute the total number of k-mer occurrences in reads
// and the number of k-mers that can possibly occur.
@@ -271,7 +196,7 @@ void Assembler::selectKmersBasedOnFrequency(
for(uint64_t kmerId=0; kmerId!=kmerTable.size(); kmerId++) {
const KmerInfo& info = kmerTable[kmerId];
totalKmerOccurrences += info.frequency;
- if(assemblerInfo->readRepresentation == 0) {
+ if(reads.representation == 0) {
++possibleKmerCount;
} else {
if(info.isRleKmer) {
@@ -284,7 +209,7 @@ void Assembler::selectKmersBasedOnFrequency(
- if(assemblerInfo->readRepresentation == 0) {
+ if(reads.representation == 0) {
// We are using raw read representation.
cout <<
@@ -345,7 +270,7 @@ void Assembler::selectKmersBasedOnFrequency(
vector<KmerId> candidateKmers;
for(uint64_t kmerId=0; kmerId<kmerTable.size(); kmerId++) {
const KmerInfo& info = kmerTable[kmerId];
- if((assemblerInfo->readRepresentation==1) and (not info.isRleKmer)) {
+ if((reads.representation==1) and (not info.isRleKmer)) {
continue;
}
const uint64_t frequency = info.frequency;
@@ -376,6 +301,7 @@ void Assembler::selectKmersBasedOnFrequency(
// until we have enough.
uint64_t kmerOccurrencesCount = 0;
uint64_t kmerCount = 0;
+ const uint64_t giveUpCount = uint64_t(0.9 * double(candidateKmers.size()));
const uint64_t desiredKmerOccurrencesCount =
uint64_t(markerDensity * double(totalKmerOccurrences));
while(kmerOccurrencesCount < desiredKmerOccurrencesCount) {
@@ -408,6 +334,10 @@ void Assembler::selectKmersBasedOnFrequency(
reverseComplementedInfo.isMarker = true;
kmerOccurrencesCount += reverseComplementedInfo.frequency;
++kmerCount;
+
+ if(kmerCount >= giveUpCount) {
+ throw runtime_error("Giving up after selecting as markers 90% of the candidate kmers.");
+ }
}
cout << "Selected " << kmerCount << " k-mers as markers." << endl;
@@ -415,7 +345,8 @@ void Assembler::selectKmersBasedOnFrequency(
}
-void Assembler::computeKmerFrequency(size_t threadId)
+
+void KmerTable1::computeKmerFrequency(size_t threadId)
{
// Create a frequency vector for this thread.
MemoryMapped::Vector<uint64_t> frequency;
@@ -428,7 +359,6 @@ void Assembler::computeKmerFrequency(size_t threadId)
// Loop over all batches assigned to this thread.
- const size_t k = assemblerInfo->k;
uint64_t begin, end;
while(getNextBatch(begin, end)) {
@@ -436,7 +366,7 @@ void Assembler::computeKmerFrequency(size_t threadId)
for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) {
// Access the sequence of this read.
- const LongBaseSequenceView read = reads->getRead(readId);
+ const LongBaseSequenceView read = reads.getRead(readId);
// If the read is pathologically short, it has no k-mers.
if(read.baseCount < k) {
@@ -489,17 +419,18 @@ void Assembler::computeKmerFrequency(size_t threadId)
// Read the k-mers from file.
-void Assembler::readKmersFromFile(uint64_t k, const string& fileName)
+KmerTable3::KmerTable3(
+ uint64_t k,
+ uint64_t readRepresentation,
+ const string& fileName,
+ const MappedMemoryOwner& mappedMemoryOwner) :
+ KmerTable(k, true, mappedMemoryOwner)
{
- // Sanity check on the value of k, then store it.
- if(k > Kmer::capacity) {
- throw runtime_error("K-mer capacity exceeded.");
+ if(fileName.empty() or
+ fileName[0] != '/') {
+ throw runtime_error("Option --Kmers.file must specify an absolute path. "
+ "A relative path is not accepted.");
}
- assemblerInfo->k = k;
-
- // Fill in the fields of the k-mer table
- // that depends only on k.
- initializeKmerTable();
// Open the file.
ifstream file(fileName);
@@ -541,7 +472,7 @@ void Assembler::readKmersFromFile(uint64_t k, const string& fileName)
const KmerId kmerId = KmerId(kmer.id(k));
SHASTA_ASSERT(kmerId < kmerTable.size());
KmerInfo& kmerInfo = kmerTable[kmerId];
- if((assemblerInfo->readRepresentation==1) and (not kmerInfo.isRleKmer)) {
+ if((readRepresentation==1) and (not kmerInfo.isRleKmer)) {
throw runtime_error("Non-RLE k-mer (duplicate consecutive bases) in " +
fileName + ":\n" + line);
}
@@ -561,7 +492,7 @@ void Assembler::readKmersFromFile(uint64_t k, const string& fileName)
if(kmerInfo.isMarker) {
++usedKmerCount;
}
- if(assemblerInfo->readRepresentation == 0) {
+ if(readRepresentation == 0) {
++possibleKmerCount;
} else {
if(kmerInfo.isRleKmer) {
@@ -577,31 +508,19 @@ void Assembler::readKmersFromFile(uint64_t k, const string& fileName)
// In this version, marker k-mers are selected randomly, but excluding
// any k-mer that is over-enriched even in a single oriented read.
-void Assembler::selectKmers2(
-
- // k-mer length.
- size_t k,
-
- // The desired marker density
+KmerTable2::KmerTable2(
+ uint64_t k,
double markerDensity,
-
- // Seed for random number generator.
int seed,
-
- // Exclude k-mers enriched by more than this amount,
- // even in a single oriented read.
- // Enrichment is the ratio of k-mer frequency in reads
- // over what a random distribution would give.
double enrichmentThreshold,
-
- size_t threadCount
-)
+ const Reads& reads,
+ uint64_t threadCount,
+ const MappedMemoryOwner& mappedMemoryOwner) :
+ KmerTable(k, true, mappedMemoryOwner),
+ MultithreadedObject<KmerTable2>(*this),
+ reads(reads),
+ enrichmentThreshold(enrichmentThreshold)
{
- // Sanity check on the value of k, then store it.
- if(k > Kmer::capacity) {
- throw runtime_error("K-mer capacity exceeded.");
- }
- assemblerInfo->k = k;
// Sanity check.
if(markerDensity<0. || markerDensity>1.) {
@@ -614,31 +533,24 @@ void Assembler::selectKmers2(
threadCount = std::thread::hardware_concurrency();
}
- // Fill in the fields of the k-mer table
- // that depends only on k.
- initializeKmerTable();
-
- // Store the enrichmentThreshold so all threads can see it.
- selectKmers2Data.enrichmentThreshold = enrichmentThreshold;
-
// For each KmerId that is an RLE k-mer, compute the
// global frequency (total number of occurrences in all
// oriented reads) and the number of reads in
// which the k-mer is over-enriched.
- selectKmers2Data.globalFrequency.createNew(
+ globalFrequency.createNew(
largeDataName("tmp-SelectKmers2-GlobalFrequency"), largeDataPageSize);
- selectKmers2Data.overenrichedReadCount.createNew(
+ overenrichedReadCount.createNew(
largeDataName("tmp-SelectKmers2-OverenrichedReadCount"), largeDataPageSize);
- selectKmers2Data.globalFrequency.resize(kmerTable.size());
- selectKmers2Data.overenrichedReadCount.resize(kmerTable.size());
+ globalFrequency.resize(kmerTable.size());
+ overenrichedReadCount.resize(kmerTable.size());
fill(
- selectKmers2Data.globalFrequency.begin(),
- selectKmers2Data.globalFrequency.end(), 0);
+ globalFrequency.begin(),
+ globalFrequency.end(), 0);
fill(
- selectKmers2Data.overenrichedReadCount.begin(),
- selectKmers2Data.overenrichedReadCount.end(), 0);
- setupLoadBalancing(reads->readCount(), 100);
- runThreads(&Assembler::selectKmers2ThreadFunction, threadCount);
+ overenrichedReadCount.begin(),
+ overenrichedReadCount.end(), 0);
+ setupLoadBalancing(reads.readCount(), 100);
+ runThreads(&KmerTable2::threadFunction, threadCount);
@@ -647,8 +559,8 @@ void Assembler::selectKmers2(
uint64_t totalKmerOccurrences = 0;
uint64_t possibleKmerCount = 0;
for(uint64_t kmerId=0; kmerId!=kmerTable.size(); kmerId++) {
- totalKmerOccurrences += selectKmers2Data.globalFrequency[kmerId];
- if(assemblerInfo->readRepresentation == 0) {
+ totalKmerOccurrences += globalFrequency[kmerId];
+ if(reads.representation == 0) {
++ possibleKmerCount;
} else {
if(kmerTable[kmerId].isRleKmer) {
@@ -667,7 +579,7 @@ void Assembler::selectKmers2(
"GlobalFrequency,GlobalEnrichment,NumberOfReadsOverenriched\n";
for(uint64_t kmerId=0; kmerId<kmerTable.size(); kmerId++) {
const KmerInfo& info = kmerTable[kmerId];
- const uint64_t frequency = selectKmers2Data.globalFrequency[kmerId];
+ const uint64_t frequency = globalFrequency[kmerId];
const Kmer kmer(kmerId, k);
const Kmer reverseComplementedKmer(info.reverseComplementedKmerId, k);
@@ -680,17 +592,20 @@ void Assembler::selectKmers2(
csv << frequency << ",";
csv << double(frequency) / averageOccurrenceCount;
csv << ",";
- csv << selectKmers2Data.overenrichedReadCount[kmerId];
+ csv << overenrichedReadCount[kmerId];
csv << "\n";
}
+ csv.close();
+
// Gather k-mers that are not overenriched in any read and therefore
// can be used as markers.
vector<KmerId> candidateKmers;
for(uint64_t kmerId=0; kmerId<kmerTable.size(); kmerId++) {
- if(kmerTable[kmerId].isRleKmer and selectKmers2Data.overenrichedReadCount[kmerId] == 0) {
+ const bool readIsUsable = (reads.representation==0) ? true : kmerTable[kmerId].isRleKmer;
+ if(readIsUsable and overenrichedReadCount[kmerId] == 0) {
candidateKmers.push_back(KmerId(kmerId));
}
}
@@ -705,7 +620,7 @@ void Assembler::selectKmers2(
" corresponds to one occurrence every " <<
double(possibleKmerCount) / enrichmentThreshold <<
" bases";
- if(assemblerInfo->readRepresentation == 1) {
+ if(reads.representation == 1) {
cout << " (in RLE representation)";
}
cout << "." << endl;
@@ -728,6 +643,7 @@ void Assembler::selectKmers2(
uint64_t kmerCount = 0;
const uint64_t desiredKmerOccurrencesCount =
uint64_t(markerDensity * double(totalKmerOccurrences));
+ const uint64_t giveUpCount = uint64_t(0.9 * double(candidateKmers.size()));
while(kmerOccurrencesCount < desiredKmerOccurrencesCount) {
// Generate a random index into the candidateKmers vector.
@@ -743,7 +659,7 @@ void Assembler::selectKmers2(
// This k-mer is not already selected as a marker.
// Let's add it.
info.isMarker = true;
- kmerOccurrencesCount += selectKmers2Data.globalFrequency[kmerId];
+ kmerOccurrencesCount += globalFrequency[kmerId];
++kmerCount;
// If this k-mer is palindromic, we are done.
@@ -756,8 +672,12 @@ void Assembler::selectKmers2(
SHASTA_ASSERT(!reverseComplementedInfo.isMarker);
SHASTA_ASSERT(reverseComplementedInfo.frequency == info.frequency);
reverseComplementedInfo.isMarker = true;
- kmerOccurrencesCount += selectKmers2Data.globalFrequency[info.reverseComplementedKmerId];
+ kmerOccurrencesCount += globalFrequency[info.reverseComplementedKmerId];
++kmerCount;
+
+ if(kmerCount >= giveUpCount) {
+ throw runtime_error("Giving up after selecting as markers 90% of the candidate kmers.");
+ }
}
cout << "Selected " << kmerCount << " k-mers as markers." << endl;
cout << "These k-mers have a total " << kmerOccurrencesCount <<
@@ -768,36 +688,33 @@ void Assembler::selectKmers2(
-void Assembler::selectKmers2ThreadFunction(size_t threadId)
+void KmerTable2::threadFunction(size_t threadId)
{
// Initialize globalFrequency for this thread.
- MemoryMapped::Vector<uint64_t> globalFrequency;
- globalFrequency.createNew(
- largeDataName("tmp-SelectKmers2-GlobalFrequency-" + to_string(threadId)),
+ MemoryMapped::Vector<uint64_t> threadGlobalFrequency;
+ threadGlobalFrequency.createNew(
+ largeDataName("tmp-KmerTable2-GlobalFrequency-" + to_string(threadId)),
largeDataPageSize);
- globalFrequency.resize(kmerTable.size());
- fill(globalFrequency.begin(), globalFrequency.end(), 0);
+ threadGlobalFrequency.resize(kmerTable.size());
+ fill(threadGlobalFrequency.begin(), threadGlobalFrequency.end(), 0);
// Initialize overenrichedReadCount for this thread.
- MemoryMapped::Vector<ReadId> overenrichedReadCount;
- overenrichedReadCount.createNew(
- largeDataName("tmp-SelectKmers2-OverenrichedReadCount-" + to_string(threadId)),
+ MemoryMapped::Vector<ReadId> threadOverenrichedReadCount;
+ threadOverenrichedReadCount.createNew(
+ largeDataName("tmp-KmerTable2-OverenrichedReadCount-" + to_string(threadId)),
largeDataPageSize);
- overenrichedReadCount.resize(kmerTable.size());
- fill(overenrichedReadCount.begin(), overenrichedReadCount.end(), 0);
+ threadOverenrichedReadCount.resize(kmerTable.size());
+ fill(threadOverenrichedReadCount.begin(), threadOverenrichedReadCount.end(), 0);
// Vectors to hold KmerIds and their frequencies for a single read.
vector<KmerId> readKmerIds;
vector<uint32_t> readKmerIdFrequencies;
- // Access the enrichmentThreshold.
- const double enrichmentThreshold = selectKmers2Data.enrichmentThreshold;
-
// Compute the total number of possible k-mers.
// It is needed below for overenrichment computations.
uint64_t possibleKmerCount = 0;
for(const KmerInfo& kmerInfo: kmerTable) {
- if(assemblerInfo->readRepresentation == 0) {
+ if(reads.representation == 0) {
++possibleKmerCount;
} else {
if(kmerInfo.isRleKmer) {
@@ -808,7 +725,6 @@ void Assembler::selectKmers2ThreadFunction(size_t threadId)
// Loop over all batches assigned to this thread.
- const size_t k = assemblerInfo->k;
uint64_t begin, end;
while(getNextBatch(begin, end)) {
@@ -816,7 +732,7 @@ void Assembler::selectKmers2ThreadFunction(size_t threadId)
for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) {
// Access the sequence of this read.
- const LongBaseSequenceView read = reads->getRead(readId);
+ const LongBaseSequenceView read = reads.getRead(readId);
// If the read is pathologically short, it has no k-mers.
if(read.baseCount < k) {
@@ -836,10 +752,10 @@ void Assembler::selectKmers2ThreadFunction(size_t threadId)
readKmerIds.push_back(kmerId);
// Increment its global frequency.
- ++globalFrequency[kmerId];
+ ++threadGlobalFrequency[kmerId];
// Also increment the frequency of the reverse complemented k-mer.
- ++globalFrequency[kmerTable[kmerId].reverseComplementedKmerId];
+ ++threadGlobalFrequency[kmerTable[kmerId].reverseComplementedKmerId];
// Check if we reached the end of the read.
if(position+k == read.baseCount) {
@@ -866,30 +782,30 @@ void Assembler::selectKmers2ThreadFunction(size_t threadId)
const KmerId kmerId = readKmerIds[i];
const uint32_t frequency = readKmerIdFrequencies[i];
if(frequency > frequencyThreshold) {
- ++overenrichedReadCount[kmerId];
- ++overenrichedReadCount[kmerTable[kmerId].reverseComplementedKmerId];
+ ++threadOverenrichedReadCount[kmerId];
+ ++threadOverenrichedReadCount[kmerTable[kmerId].reverseComplementedKmerId];
}
}
}
}
+
// Add our globalFrequency and overenrichedReadCount
// to the values computer by the other threads.
{
std::lock_guard<std::mutex> lock(mutex);
for(uint64_t kmerId=0; kmerId!=globalFrequency.size(); kmerId++) {
- selectKmers2Data.globalFrequency[kmerId] += globalFrequency[kmerId];
- selectKmers2Data.overenrichedReadCount[kmerId] += overenrichedReadCount[kmerId];
+ globalFrequency[kmerId] += threadGlobalFrequency[kmerId];
+ overenrichedReadCount[kmerId] += threadOverenrichedReadCount[kmerId];
}
}
- globalFrequency.remove();
- overenrichedReadCount.remove();
+ threadGlobalFrequency.remove();
+ threadOverenrichedReadCount.remove();
}
-
// In this version, marker k-mers are selected randomly, but excluding
// k-mers that appear repeated at short distances in any oriented read.
// More precisely, for each k-mer we compute the minimum distance
@@ -897,32 +813,19 @@ void Assembler::selectKmers2ThreadFunction(size_t threadId)
// K-mers for which this minimum distance is less than distanceThreshold
// are not used as markers. Marker k-mers are selected randomly among the
// remaining k-mers, until the desired marker density is achieved.
-void Assembler::selectKmers4(
-
- // k-mer length.
+KmerTable4::KmerTable4(
uint64_t k,
-
- // The desired marker density
double markerDensity,
-
- // Seed for random number generator.
- uint64_t seed,
-
- // Exclude k-mers that appear in any read in two copies,
- // with the two copies closer than this distance (in RLE bases).
+ int seed,
uint64_t distanceThreshold,
-
- size_t threadCount
-)
+ const Reads& reads,
+ uint64_t threadCount,
+ const MappedMemoryOwner& mappedMemoryOwner) :
+ KmerTable(k, true, mappedMemoryOwner),
+ MultithreadedObject<KmerTable4>(*this),
+ reads(reads)
{
const bool debug = false;
- cout << timestamp << "Begin selectKmers4." << endl;
-
- // Sanity check on the value of k, then store it.
- if(k > Kmer::capacity) {
- throw runtime_error("K-mer capacity exceeded.");
- }
- assemblerInfo->k = k;
// Sanity check.
if(markerDensity<0. || markerDensity>1.) {
@@ -935,41 +838,37 @@ void Assembler::selectKmers4(
threadCount = std::thread::hardware_concurrency();
}
- // Fill in the fields of the k-mer table
- // that depends only on k.
- initializeKmerTable();
-
// Initialize the global frequency of all k-mers.
- selectKmers4Data.globalFrequency.createNew(
- largeDataName("tmp-SelectKmers4-GlobalFrequency"), largeDataPageSize);
- selectKmers4Data.globalFrequency.resize(kmerTable.size());
+ globalFrequency.createNew(
+ largeDataName("tmp-KmerTable44-GlobalFrequency"), largeDataPageSize);
+ globalFrequency.resize(kmerTable.size());
fill(
- selectKmers4Data.globalFrequency.begin(),
- selectKmers4Data.globalFrequency.end(), 0);
+ globalFrequency.begin(),
+ globalFrequency.end(), 0);
// Initialize the minimumDistance vector, which stores
// the minimum RLE distance between any two copies of each k-mer
// in any oriented read.
- selectKmers4Data.minimumDistance.createNew(
- largeDataName("tmp-selectKmers4-minimumDistance"), largeDataPageSize);
+ minimumDistance.createNew(
+ largeDataName("tmp-KmerTable4-minimumDistance"), largeDataPageSize);
const uint64_t kmerCount = kmerTable.size();
- selectKmers4Data.minimumDistance.resize(kmerCount);
+ minimumDistance.resize(kmerCount);
for(uint64_t i=0; i<kmerCount; i++) {
- selectKmers4Data.minimumDistance[i].second = std::numeric_limits<uint32_t>::max();
+ minimumDistance[i].second = std::numeric_limits<uint32_t>::max();
}
// Compute the minimumDistance vector.
- setupLoadBalancing(reads->readCount(), 100);
- runThreads(&Assembler::selectKmers4ThreadFunction, threadCount);
+ setupLoadBalancing(reads.readCount(), 100);
+ runThreads(&KmerTable4::threadFunction, threadCount);
// Write out what we found.
if(debug) {
const uint64_t totalFrequency = std::accumulate(
- selectKmers4Data.globalFrequency.begin(),
- selectKmers4Data.globalFrequency.end(), 0ULL);
+ globalFrequency.begin(),
+ globalFrequency.end(), 0ULL);
cout << "Total number of k-mer occurrences in all oriented reads is " << totalFrequency << endl;
ofstream csv("KmerInfo.csv");
csv << "KmerId,Kmer,KmerIdRc,KmerRc,Frequency,FrequencyRc,TotalFrequency,"
@@ -980,13 +879,13 @@ void Assembler::selectKmers4(
continue;
}
- const uint64_t frequency = selectKmers4Data.globalFrequency[kmerId];
- const uint64_t frequencyReverseComplement = selectKmers4Data.globalFrequency[info.reverseComplementedKmerId];
+ const uint64_t frequency = globalFrequency[kmerId];
+ const uint64_t frequencyReverseComplement = globalFrequency[info.reverseComplementedKmerId];
const uint64_t totalFrequency = frequency + frequencyReverseComplement;
- const uint32_t minimumDistance = selectKmers4Data.minimumDistance[kmerId].second;
- const uint32_t minimumDistanceReverseComplement =
- selectKmers4Data.minimumDistance[info.reverseComplementedKmerId].second;
+ const uint32_t kmerMinimumDistance = minimumDistance[kmerId].second;
+ const uint32_t kmerMinimumDistanceReverseComplement =
+ minimumDistance[info.reverseComplementedKmerId].second;
const Kmer kmer(kmerId, k);
const Kmer reverseComplementedKmer(info.reverseComplementedKmerId, k);
@@ -999,9 +898,9 @@ void Assembler::selectKmers4(
csv << frequency << ",";
csv << frequencyReverseComplement << ",";
csv << totalFrequency << ",";
- csv << minimumDistance << ",";
- csv << minimumDistanceReverseComplement << ",";
- csv << min(minimumDistance, minimumDistanceReverseComplement) << "\n";
+ csv << kmerMinimumDistance << ",";
+ csv << kmerMinimumDistanceReverseComplement << ",";
+ csv << min(kmerMinimumDistance, kmerMinimumDistanceReverseComplement) << "\n";
}
}
@@ -1013,11 +912,11 @@ void Assembler::selectKmers4(
uint64_t rleKmerCount = 0;
for(uint64_t kmerId=0; kmerId!=kmerTable.size(); kmerId++) {
const KmerInfo& info = kmerTable[kmerId];
- if(not info.isRleKmer) {
- SHASTA_ASSERT(selectKmers4Data.globalFrequency[kmerId] == 0);
+ if((reads.representation==1) and (not info.isRleKmer)) {
+ SHASTA_ASSERT(globalFrequency[kmerId] == 0);
continue;
}
- totalKmerOccurrences += selectKmers4Data.globalFrequency[kmerId];
+ totalKmerOccurrences += globalFrequency[kmerId];
if(kmerTable[kmerId].isRleKmer) {
++rleKmerCount;
}
@@ -1039,7 +938,7 @@ void Assembler::selectKmers4(
for(uint64_t kmerId=0; kmerId<kmerTable.size(); kmerId++) {
const KmerInfo& info = kmerTable[kmerId];
const KmerId kmerIdRc = info.reverseComplementedKmerId;
- if(not info.isRleKmer) {
+ if((reads.representation==1) and (not info.isRleKmer)) {
continue;
}
if(kmerIdRc == kmerId) {
@@ -1050,18 +949,18 @@ void Assembler::selectKmers4(
if(kmerId > kmerIdRc) {
continue;
}
- if(selectKmers4Data.minimumDistance[kmerId].second < distanceThreshold) {
+ if(minimumDistance[kmerId].second < distanceThreshold) {
// Too close. skip.
continue;
}
- if(selectKmers4Data.minimumDistance[kmerIdRc].second < distanceThreshold) {
+ if(minimumDistance[kmerIdRc].second < distanceThreshold) {
// Too close. Skip.
continue;
}
candidateKmers.push_back(KmerId(kmerId));
- candidateFrequency += selectKmers4Data.globalFrequency[kmerId];
- candidateFrequency += selectKmers4Data.globalFrequency[kmerIdRc];
+ candidateFrequency += globalFrequency[kmerId];
+ candidateFrequency += globalFrequency[kmerIdRc];
}
cout << "Markers will be chosen randomly from the a pool of " <<
2*candidateKmers.size() << " RLE k-mers." << endl;
@@ -1105,8 +1004,8 @@ void Assembler::selectKmers4(
// Increment counters.
markerCount += 2;
- markerOccurrencesCount += selectKmers4Data.globalFrequency[kmerId];
- markerOccurrencesCount += selectKmers4Data.globalFrequency[kmerIdRc];
+ markerOccurrencesCount += globalFrequency[kmerId];
+ markerOccurrencesCount += globalFrequency[kmerIdRc];
// Remove kmerId from the vector of candidates.
if(i != candidateKmers.size()-1) {
@@ -1124,28 +1023,23 @@ void Assembler::selectKmers4(
// Clean up.
- selectKmers4Data.minimumDistance.remove();
- selectKmers4Data.globalFrequency.remove();
+ minimumDistance.remove();
+ globalFrequency.remove();
- // Done.
- cout << timestamp << "End selectKmers4." << endl;
}
-void Assembler::selectKmers4ThreadFunction(size_t threadId)
+void KmerTable4::threadFunction(size_t threadId)
{
- // K-mer length.
- const size_t k = assemblerInfo->k;
-
// Initialize globalFrequency for this thread.
// Having all threads accumulate atomically on the global frequency vector is too slow.
- MemoryMapped::Vector<uint64_t> globalFrequency;
- globalFrequency.createNew(
- largeDataName("tmp-SelectKmers4-GlobalFrequency-" + to_string(threadId)),
+ MemoryMapped::Vector<uint64_t> threadGlobalFrequency;
+ threadGlobalFrequency.createNew(
+ largeDataName("tmp-KmerTable4-GlobalFrequency-" + to_string(threadId)),
largeDataPageSize);
- globalFrequency.resize(kmerTable.size());
- fill(globalFrequency.begin(), globalFrequency.end(), 0);
+ threadGlobalFrequency.resize(kmerTable.size());
+ fill(threadGlobalFrequency.begin(), threadGlobalFrequency.end(), 0);
// Vector to hold pairs(KmerId, RLE position) for one read.
vector< pair<KmerId, uint32_t> > readKmers;
@@ -1158,7 +1052,7 @@ void Assembler::selectKmers4ThreadFunction(size_t threadId)
for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) {
// Access the sequence of this read.
- const LongBaseSequenceView read = reads->getRead(readId);
+ const LongBaseSequenceView read = reads.getRead(readId);
// If the read is pathologically short, it has no k-mers.
if(read.baseCount < k) {
@@ -1178,8 +1072,8 @@ void Assembler::selectKmers4ThreadFunction(size_t threadId)
readKmers.push_back(make_pair(kmerId, position));
// Update the frequency of this k-mer.
- ++globalFrequency[kmerId];
- ++globalFrequency[kmerTable[kmerId].reverseComplementedKmerId];
+ ++threadGlobalFrequency[kmerId];
+ ++threadGlobalFrequency[kmerTable[kmerId].reverseComplementedKmerId];
// Check if we reached the end of the read.
if(position+k == read.baseCount) {
@@ -1205,7 +1099,7 @@ void Assembler::selectKmers4ThreadFunction(size_t threadId)
}
const uint32_t distance = p1.second - p0.second;
- pair<std::mutex, uint32_t>& p = selectKmers4Data.minimumDistance[kmerId0];
+ pair<std::mutex, uint32_t>& p = minimumDistance[kmerId0];
std::lock_guard<std::mutex> lock(p.first);;
p.second = min(p.second, distance);
}
@@ -1216,9 +1110,63 @@ void Assembler::selectKmers4ThreadFunction(size_t threadId)
{
std::lock_guard<std::mutex> lock(mutex);
for(uint64_t kmerId=0; kmerId!=globalFrequency.size(); kmerId++) {
- selectKmers4Data.globalFrequency[kmerId] += globalFrequency[kmerId];
+ globalFrequency[kmerId] += threadGlobalFrequency[kmerId];
}
}
- globalFrequency.remove();
+ threadGlobalFrequency.remove();
+}
+
+
+
+KmerTable0::KmerTable0(
+ uint64_t k,
+ const MappedMemoryOwner& mappedMemoryOwner) :
+ KmerTable(k, false, mappedMemoryOwner)
+{
+}
+
+
+
+KmerTable1::KmerTable1(
+ uint64_t k,
+ const Reads& reads,
+ const MappedMemoryOwner& mappedMemoryOwner) :
+ KmerTable(k, false, mappedMemoryOwner),
+ MultithreadedObject<KmerTable1>(*this),
+ reads(reads)
+{
+}
+
+
+
+KmerTable2::KmerTable2(
+ uint64_t k,
+ const Reads& reads,
+ const MappedMemoryOwner& mappedMemoryOwner) :
+ KmerTable(k, false, mappedMemoryOwner),
+ MultithreadedObject<KmerTable2>(*this),
+ reads(reads)
+{
+}
+
+
+
+KmerTable3::KmerTable3(
+ uint64_t k,
+ const MappedMemoryOwner& mappedMemoryOwner) :
+ KmerTable(k, false, mappedMemoryOwner)
+{
+}
+
+
+
+KmerTable4::KmerTable4(
+ uint64_t k,
+ const Reads& reads,
+ const MappedMemoryOwner& mappedMemoryOwner) :
+ KmerTable(k, false, mappedMemoryOwner),
+ MultithreadedObject<KmerTable4>(*this),
+ reads(reads)
+{
}
diff --git a/src/KmerTable.hpp b/src/KmerTable.hpp
new file mode 100644
index 0000000..2b134c1
--- /dev/null
+++ b/src/KmerTable.hpp
@@ -0,0 +1,215 @@
+#ifndef SHASTA_KMER_TABLE_HPP
+#define SHASTA_KMER_TABLE_HPP
+
+// Shasta.
+#include "KmerChecker.hpp"
+#include "MappedMemoryOwner.hpp"
+#include "MemoryMappedVector.hpp"
+#include "MultithreadedObject.hpp"
+
+// Standard library.
+#include "utility.hpp"
+
+namespace shasta {
+
+ class KmerTable;
+ class KmerTable0;
+ class KmerTable1;
+ class KmerTable2;
+ class KmerTable3;
+ class KmerTable4;
+
+ class KmersOptions;
+ class Reads;
+}
+
+
+
+// Old implementations of KmerChecker are table based.
+// There are derived classes to support all 5 marker generation methods
+// but they are limited to k-mer lengths k<16.
+class shasta::KmerTable :
+ public KmerChecker,
+ public MappedMemoryOwner {
+public:
+
+ bool isMarker(KmerId kmerId) const
+ {
+ return kmerTable[kmerId].isMarker;
+ }
+
+ KmerTable(uint64_t k, bool createNew, const MappedMemoryOwner&);
+
+protected:
+
+ class KmerInfo {
+ public:
+
+ // Frequency of this k-mer in input reads.
+ // This is only used in some of the derived classes and
+ // so there is opportunity for some cleanup here.
+ uint64_t frequency = 0;
+
+ KmerId16 reverseComplementedKmerId;
+ bool isMarker;
+ bool isRleKmer;
+ };
+
+ uint64_t k;
+ MemoryMapped::Vector<KmerInfo> kmerTable;
+
+private:
+ void createKmerTable();
+ void accessKmerTable();
+
+};
+
+
+
+// Marker k-mer generation method 0 (used when --Kmers.generationMethod 0).
+class shasta::KmerTable0 : public KmerTable {
+public:
+
+ // Construct from scratch.
+ KmerTable0(
+ uint64_t k,
+ double probability,
+ int seed,
+ const MappedMemoryOwner&);
+
+ // Construct from binary data.
+ KmerTable0(
+ uint64_t k,
+ const MappedMemoryOwner&);
+
+};
+
+
+
+// Marker k-mer generation method 1 (used when --Kmers.generationMethod 1).
+class shasta::KmerTable1 :
+ public KmerTable,
+ public MultithreadedObject<KmerTable1> {
+public:
+
+ // Construct from scratch.
+ KmerTable1(
+ uint64_t k,
+ double probability,
+ int seed,
+ double enrichmentThreshold,
+ const Reads&,
+ uint64_t threadCount,
+ const MappedMemoryOwner&);
+
+ // Construct from binary data.
+ KmerTable1(
+ uint64_t k,
+ const Reads&,
+ const MappedMemoryOwner&);
+
+private:
+ const Reads& reads;
+ void computeKmerFrequency(size_t threadId);
+};
+
+
+
+// Marker k-mer generation method 2 (used when --Kmers.generationMethod 2).
+class shasta::KmerTable2 :
+ public KmerTable,
+ public MultithreadedObject<KmerTable2> {
+public:
+
+ // Construct from scratch.
+ KmerTable2(
+ uint64_t k,
+ double probability,
+ int seed,
+ double enrichmentThreshold,
+ const Reads&,
+ uint64_t threadCount,
+ const MappedMemoryOwner&);
+
+ // Construct from binary data.
+ KmerTable2(
+ uint64_t k,
+ const Reads&,
+ const MappedMemoryOwner&);
+
+private:
+ const Reads& reads;
+ double enrichmentThreshold;
+
+ // The number of times each k-mer appears in an oriented read.
+ // Indexed by KmerId.
+ MemoryMapped::Vector<uint64_t> globalFrequency;
+
+ // The number of oriented reads that each k-mer is
+ // over-enriched in by more than a factor enrichmentThreshold.
+ // Indexed by KmerId.
+ MemoryMapped::Vector<ReadId> overenrichedReadCount;
+
+ void threadFunction(size_t threadId);
+};
+
+
+
+// Marker k-mer generation method 3 (used when --Kmers.generationMethod 3).
+class shasta::KmerTable3: public KmerTable {
+public:
+
+ // Construct from scratch.
+ KmerTable3(
+ uint64_t k,
+ uint64_t readRepresentation,
+ const string& fileName,
+ const MappedMemoryOwner&);
+
+ // Construct from binary data.
+ KmerTable3(
+ uint64_t k,
+ const MappedMemoryOwner&);
+
+};
+
+
+
+// Marker k-mer generation method 4 (used when --Kmers.generationMethod 4).
+class shasta::KmerTable4 :
+ public KmerTable,
+ public MultithreadedObject<KmerTable4> {
+public:
+
+ // Construct from scratch.
+ KmerTable4(
+ uint64_t k,
+ double probability,
+ int seed,
+ uint64_t distanceThreshold,
+ const Reads&,
+ uint64_t threadCount,
+ const MappedMemoryOwner&);
+
+ // Construct from binary data.
+ KmerTable4(
+ uint64_t k,
+ const Reads&,
+ const MappedMemoryOwner&);
+
+public:
+ const Reads& reads;
+
+ void threadFunction(size_t threadId);
+
+ // The number of times each k-mer appears in an oriented read.
+ // Indexed by KmerId.
+ MemoryMapped::Vector<uint64_t> globalFrequency;
+
+ // The minimum distance at which two copies of each k-mer
+ // appear in any oriented read.
+ // Indexed by KmerId.
+ MemoryMapped::Vector< pair<std::mutex, uint32_t> > minimumDistance;
+};
+
+#endif
diff --git a/src/LocalAssemblyGraph.cpp b/src/LocalAssemblyGraph.cpp
index 573b4f7..95dc083 100644
--- a/src/LocalAssemblyGraph.cpp
+++ b/src/LocalAssemblyGraph.cpp
@@ -2,6 +2,7 @@
#include "LocalAssemblyGraph.hpp"
#include "approximateTopologicalSort.hpp"
using namespace shasta;
+using namespace mode0;
// Boost libraries.
#include <boost/graph/graphviz.hpp>
@@ -214,7 +215,7 @@ void LocalAssemblyGraph::Writer::operator()(std::ostream& s, vertex_descriptor v
"\"";
// URL.
- s << " URL=\"exploreMarkerGraph?"
+ s << " URL=\"exploreMarkerGraph0?"
"?vertexId=" << vertex.markerGraphVertexId <<
"&maxDistance=10"
"&timeout=30"
diff --git a/src/LocalAssemblyGraph.hpp b/src/LocalAssemblyGraph.hpp
index ab0be79..cefd969 100644
--- a/src/LocalAssemblyGraph.hpp
+++ b/src/LocalAssemblyGraph.hpp
@@ -4,7 +4,7 @@
/*******************************************************************************
-The local marker graph created by class LocalMarkerGraph is a subgraph
+The local marker graph created by class LocalAssemblyGraph is a subgraph
of the global assembly graph, created by starting at a given vertex,
and extending out to a specified distance in both directions.
Distance is number of edges on the global assembly graph.
@@ -40,7 +40,7 @@ public:
// The vertex id of the vertex of the global assembly
// graph that corresponds to this vertex.
- AssemblyGraph::VertexId assemblyGraphVertexId;
+ mode0::AssemblyGraph::VertexId assemblyGraphVertexId;
// The vertex id of the vertex of the global marker
// graph that corresponds to this vertex.
@@ -54,7 +54,7 @@ public:
size_t rank = 0;
LocalAssemblyGraphVertex(
- AssemblyGraph::VertexId assemblyGraphVertexId,
+ mode0::AssemblyGraph::VertexId assemblyGraphVertexId,
MarkerGraph::VertexId markerGraphVertexId,
int distance) :
assemblyGraphVertexId(assemblyGraphVertexId),
@@ -70,7 +70,7 @@ class shasta::LocalAssemblyGraphEdge {
public:
// The global edge id of the edge of the global assembly
// graph that corresponds to this edge.
- AssemblyGraph::EdgeId edgeId;
+ mode0::AssemblyGraph::EdgeId edgeId;
// Field used by approximateTopologicalSort.
bool isDagEdge = true;
@@ -87,11 +87,11 @@ class shasta::LocalAssemblyGraph :
public:
LocalAssemblyGraph(
- AssemblyGraph&
+ mode0::AssemblyGraph&
);
- using VertexId = AssemblyGraph::VertexId;
- using EdgeId = AssemblyGraph::EdgeId;
+ using VertexId = mode0::AssemblyGraph::VertexId;
+ using EdgeId = mode0::AssemblyGraph::EdgeId;
// Add a vertex with the given vertex ids
// and return its vertex descriptor.
@@ -136,7 +136,7 @@ private:
std::map<VertexId, vertex_descriptor> vertexMap;
// Reference to the global assembly graph.
- AssemblyGraph& globalAssemblyGraph;
+ mode0::AssemblyGraph& globalAssemblyGraph;
// Writer class used for Graphviz output.
class Writer {
diff --git a/src/LocalMarkerGraph-Write.cpp b/src/LocalMarkerGraph0-Write.cpp
index 689f1a2..1bbce65 100644
--- a/src/LocalMarkerGraph-Write.cpp
+++ b/src/LocalMarkerGraph0-Write.cpp
@@ -1,10 +1,11 @@
// Shasta.
-#include "LocalMarkerGraph.hpp"
+#include "LocalMarkerGraph0.hpp"
#include "ConsensusCaller.hpp"
#include "Marker.hpp"
#include "MemoryMappedVectorOfVectors.hpp"
#include "orderPairs.hpp"
using namespace shasta;
+using namespace mode0;
// Boost libraries.
#include <boost/graph/graphviz.hpp>
@@ -15,9 +16,9 @@ using namespace shasta;
// Write the graph in Graphviz format.
-void LocalMarkerGraph::write(
+void LocalMarkerGraph0::write(
const string& fileName,
- const LocalMarkerGraphRequestParameters& localMarkerGraphRequestParameters) const
+ const LocalMarkerGraph0RequestParameters& localMarkerGraphRequestParameters) const
{
ofstream outputFileStream(fileName);
if(!outputFileStream) {
@@ -25,19 +26,19 @@ void LocalMarkerGraph::write(
}
write(outputFileStream, localMarkerGraphRequestParameters);
}
-void LocalMarkerGraph::write(
+void LocalMarkerGraph0::write(
ostream& s,
- const LocalMarkerGraphRequestParameters& localMarkerGraphRequestParameters) const
+ const LocalMarkerGraph0RequestParameters& localMarkerGraphRequestParameters) const
{
Writer writer(*this, localMarkerGraphRequestParameters);
boost::write_graphviz(s, *this, writer, writer, writer,
- boost::get(&LocalMarkerGraphVertex::vertexId, *this));
+ boost::get(&LocalMarkerGraph0Vertex::vertexId, *this));
}
-LocalMarkerGraph::Writer::Writer(
- const LocalMarkerGraph& graph,
- const LocalMarkerGraphRequestParameters& parameters) :
- LocalMarkerGraphRequestParameters(parameters),
+LocalMarkerGraph0::Writer::Writer(
+ const LocalMarkerGraph0& graph,
+ const LocalMarkerGraph0RequestParameters& parameters) :
+ LocalMarkerGraph0RequestParameters(parameters),
graph(graph)
{
}
@@ -45,26 +46,26 @@ LocalMarkerGraph::Writer::Writer(
// Vertex and edge colors.
-const string LocalMarkerGraph::Writer::vertexColorZeroDistance = "#6666ff";
-const string LocalMarkerGraph::Writer::vertexColorIntermediateDistance = "#00ccff";
-const string LocalMarkerGraph::Writer::vertexColorMaxDistance = "#66ffff";
-const string LocalMarkerGraph::Writer::edgeArrowColorRemovedDuringTransitiveReduction = "#ff0000";
-const string LocalMarkerGraph::Writer::edgeArrowColorRemovedDuringPruning = "#ff00ff";
-const string LocalMarkerGraph::Writer::edgeArrowColorRemovedDuringSuperBubbleRemoval = "#009900";
-const string LocalMarkerGraph::Writer::edgeArrowColorRemovedAsLowCoverageCrossEdge = "#c0c000";
-const string LocalMarkerGraph::Writer::edgeArrowColorRemovedWhileSplittingSecondaryEdges = "#ff0000";
-const string LocalMarkerGraph::Writer::edgeArrowColorNotRemovedNotAssembled = "#fcba03";
-const string LocalMarkerGraph::Writer::edgeArrowColorNotRemovedAssembled = "#000000";
-const string LocalMarkerGraph::Writer::edgeLabelColorRemovedDuringTransitiveReduction = "#ff9999";
-const string LocalMarkerGraph::Writer::edgeLabelColorRemovedDuringPruning = "#c03280";
-const string LocalMarkerGraph::Writer::edgeLabelColorRemovedDuringSuperBubbleRemoval = "#99ff99";
-const string LocalMarkerGraph::Writer::edgeLabelColorRemovedAsLowCoverageCrossEdge = "#e0e000";
-const string LocalMarkerGraph::Writer::edgeLabelColorNotRemovedNotAssembled = "#996600";
-const string LocalMarkerGraph::Writer::edgeLabelColorNotRemovedAssembled = "#999999";
-
-
-
-string LocalMarkerGraph::Writer::vertexColor(const LocalMarkerGraphVertex& vertex) const
+const string LocalMarkerGraph0::Writer::vertexColorZeroDistance = "#6666ff";
+const string LocalMarkerGraph0::Writer::vertexColorIntermediateDistance = "#00ccff";
+const string LocalMarkerGraph0::Writer::vertexColorMaxDistance = "#66ffff";
+const string LocalMarkerGraph0::Writer::edgeArrowColorRemovedDuringTransitiveReduction = "#ff0000";
+const string LocalMarkerGraph0::Writer::edgeArrowColorRemovedDuringPruning = "#ff00ff";
+const string LocalMarkerGraph0::Writer::edgeArrowColorRemovedDuringSuperBubbleRemoval = "#009900";
+const string LocalMarkerGraph0::Writer::edgeArrowColorRemovedAsLowCoverageCrossEdge = "#c0c000";
+const string LocalMarkerGraph0::Writer::edgeArrowColorRemovedWhileSplittingSecondaryEdges = "#ff0000";
+const string LocalMarkerGraph0::Writer::edgeArrowColorNotRemovedNotAssembled = "#fcba03";
+const string LocalMarkerGraph0::Writer::edgeArrowColorNotRemovedAssembled = "#000000";
+const string LocalMarkerGraph0::Writer::edgeLabelColorRemovedDuringTransitiveReduction = "#ff9999";
+const string LocalMarkerGraph0::Writer::edgeLabelColorRemovedDuringPruning = "#c03280";
+const string LocalMarkerGraph0::Writer::edgeLabelColorRemovedDuringSuperBubbleRemoval = "#99ff99";
+const string LocalMarkerGraph0::Writer::edgeLabelColorRemovedAsLowCoverageCrossEdge = "#e0e000";
+const string LocalMarkerGraph0::Writer::edgeLabelColorNotRemovedNotAssembled = "#996600";
+const string LocalMarkerGraph0::Writer::edgeLabelColorNotRemovedAssembled = "#999999";
+
+
+
+string LocalMarkerGraph0::Writer::vertexColor(const LocalMarkerGraph0Vertex& vertex) const
{
if(vertexColoring == "none") {
return "black";
@@ -103,7 +104,7 @@ string LocalMarkerGraph::Writer::vertexColor(const LocalMarkerGraphVertex& verte
-string LocalMarkerGraph::Writer::edgeArrowColor(const LocalMarkerGraphEdge& edge) const
+string LocalMarkerGraph0::Writer::edgeArrowColor(const LocalMarkerGraph0Edge& edge) const
{
if(edgeColoring == "none") {
@@ -164,7 +165,7 @@ string LocalMarkerGraph::Writer::edgeArrowColor(const LocalMarkerGraphEdge& edge
-string LocalMarkerGraph::Writer::edgeLabelColor(const LocalMarkerGraphEdge& edge) const
+string LocalMarkerGraph0::Writer::edgeLabelColor(const LocalMarkerGraph0Edge& edge) const
{
if(edgeColoring == "none") {
return "white";
@@ -211,7 +212,7 @@ string LocalMarkerGraph::Writer::edgeLabelColor(const LocalMarkerGraphEdge& edge
-void LocalMarkerGraph::writeColorLegendVerticesByDistance(ostream& html)
+void LocalMarkerGraph0::writeColorLegendVerticesByDistance(ostream& html)
{
html <<
"<table>"
@@ -226,7 +227,7 @@ void LocalMarkerGraph::writeColorLegendVerticesByDistance(ostream& html)
-void LocalMarkerGraph::writeColorLegendEdgeArrowsByFlags(ostream& html)
+void LocalMarkerGraph0::writeColorLegendEdgeArrowsByFlags(ostream& html)
{
if(assemblyMode == 2) {
html <<
@@ -261,7 +262,7 @@ void LocalMarkerGraph::writeColorLegendEdgeArrowsByFlags(ostream& html)
-void LocalMarkerGraph::writeColorLegendEdgeLabelsByFlags(ostream& html)
+void LocalMarkerGraph0::writeColorLegendEdgeLabelsByFlags(ostream& html)
{
html <<
"<table>"
@@ -283,7 +284,7 @@ void LocalMarkerGraph::writeColorLegendEdgeLabelsByFlags(ostream& html)
-void LocalMarkerGraph::Writer::operator()(std::ostream& s) const
+void LocalMarkerGraph0::Writer::operator()(std::ostream& s) const
{
// This turns off the tooltip on the graph and the edges.
s << "tooltip = \" \";\n";
@@ -316,9 +317,9 @@ void LocalMarkerGraph::Writer::operator()(std::ostream& s) const
-void LocalMarkerGraph::Writer::operator()(std::ostream& s, vertex_descriptor v) const
+void LocalMarkerGraph0::Writer::operator()(std::ostream& s, vertex_descriptor v) const
{
- const LocalMarkerGraphVertex& vertex = graph[v];
+ const LocalMarkerGraph0Vertex& vertex = graph[v];
const auto coverage = vertex.markerInfos.size();
const string color = vertexColor(vertex);
SHASTA_ASSERT(coverage > 0);
@@ -464,10 +465,10 @@ void LocalMarkerGraph::Writer::operator()(std::ostream& s, vertex_descriptor v)
-void LocalMarkerGraph::Writer::operator()(std::ostream& s, edge_descriptor e) const
+void LocalMarkerGraph0::Writer::operator()(std::ostream& s, edge_descriptor e) const
{
- const LocalMarkerGraphEdge& edge = graph[e];
+ const LocalMarkerGraph0Edge& edge = graph[e];
const size_t coverage = edge.coverage();
const string arrowColor = edgeArrowColor(edge);
const string labelColor = edgeLabelColor(edge);
@@ -607,7 +608,7 @@ void LocalMarkerGraph::Writer::operator()(std::ostream& s, edge_descriptor e) co
// Verbose labels include the detail of all oriented read ids on this edge.
if(edgeLabels == 2) {
- vector< pair<OrientedReadId, LocalMarkerGraphEdge::Sequence> > table;
+ vector< pair<OrientedReadId, LocalMarkerGraph0Edge::Sequence> > table;
for(const auto& info: edge.infos) {
const auto& sequence = info.first;
const auto& intervals = info.second;
@@ -616,7 +617,7 @@ void LocalMarkerGraph::Writer::operator()(std::ostream& s, edge_descriptor e) co
}
}
sort(table.begin(), table.end(),
- OrderPairsByFirstOnly<OrientedReadId, LocalMarkerGraphEdge::Sequence>());
+ OrderPairsByFirstOnly<OrientedReadId, LocalMarkerGraph0Edge::Sequence>());
s << "<hr/>";
for(const auto& p: table) {
diff --git a/src/LocalMarkerGraph.cpp b/src/LocalMarkerGraph0.cpp
index b9c9262..a7ac4b2 100644
--- a/src/LocalMarkerGraph.cpp
+++ b/src/LocalMarkerGraph0.cpp
@@ -1,5 +1,5 @@
// Shasta.
-#include "LocalMarkerGraph.hpp"
+#include "LocalMarkerGraph0.hpp"
#include "approximateTopologicalSort.hpp"
#include "findMarkerId.hpp"
#include "orderPairs.hpp"
@@ -10,12 +10,13 @@ using namespace shasta;
-LocalMarkerGraph::LocalMarkerGraph(
+LocalMarkerGraph0::LocalMarkerGraph0(
uint64_t readRepresentation,
uint32_t k,
uint64_t assemblyMode,
const Reads& reads,
const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
+ const MarkerGraph& markerGraph,
const MemoryMapped::Vector<MarkerGraph::CompressedVertexId>& globalMarkerGraphVertex,
const ConsensusCaller& consensusCaller
) :
@@ -24,6 +25,7 @@ LocalMarkerGraph::LocalMarkerGraph(
assemblyMode(assemblyMode),
reads(reads),
markers(markers),
+ markerGraph(markerGraph),
globalMarkerGraphVertex(globalMarkerGraphVertex),
consensusCaller(consensusCaller)
{
@@ -34,8 +36,8 @@ LocalMarkerGraph::LocalMarkerGraph(
// Find out if a vertex with the given MarkerGraph::VertexId exists.
// If it exists, return make_pair(true, v).
// Otherwise, return make_pair(false, null_vertex());
-std::pair<bool, LocalMarkerGraph::vertex_descriptor>
- LocalMarkerGraph::findVertex(MarkerGraph::VertexId vertexId) const
+std::pair<bool, LocalMarkerGraph0::vertex_descriptor>
+ LocalMarkerGraph0::findVertex(MarkerGraph::VertexId vertexId) const
{
const auto it = vertexMap.find(vertexId);
if(it == vertexMap.end()) {
@@ -50,8 +52,8 @@ std::pair<bool, LocalMarkerGraph::vertex_descriptor>
// Add a vertex with the given MarkerGraph::VertexId
// and return its vertex descriptor.
// A vertex with this MarkerGraph::VertexId must not exist.
-LocalMarkerGraph::vertex_descriptor
- LocalMarkerGraph::addVertex(
+LocalMarkerGraph0::vertex_descriptor
+ LocalMarkerGraph0::addVertex(
MarkerGraph::VertexId vertexId,
uint64_t distance,
span<MarkerId> vertexMarkers)
@@ -60,14 +62,14 @@ LocalMarkerGraph::vertex_descriptor
SHASTA_ASSERT(vertexMap.find(vertexId) == vertexMap.end());
// Add the vertex and store it in the vertex map.
- const vertex_descriptor v = add_vertex(LocalMarkerGraphVertex(vertexId, distance), *this);
+ const vertex_descriptor v = add_vertex(LocalMarkerGraph0Vertex(vertexId, distance), *this);
vertexMap.insert(make_pair(vertexId, v));
// Fill in the marker information for this vertex.
- LocalMarkerGraphVertex& vertex = (*this)[v];
+ LocalMarkerGraph0Vertex& vertex = (*this)[v];
vertex.markerInfos.reserve(vertexMarkers.size());
for(const MarkerId markerId: vertexMarkers) {
- LocalMarkerGraphVertex::MarkerInfo markerInfo;
+ LocalMarkerGraph0Vertex::MarkerInfo markerInfo;
markerInfo.markerId = markerId;
tie(markerInfo.orientedReadId, markerInfo.ordinal) =
findMarkerId(markerId, markers);
@@ -80,29 +82,17 @@ LocalMarkerGraph::vertex_descriptor
// Get the KmerId for a vertex.
-KmerId LocalMarkerGraph::getKmerId(vertex_descriptor v) const
+KmerId LocalMarkerGraph0::getKmerId(vertex_descriptor v) const
{
- const LocalMarkerGraphVertex& vertex = (*this)[v];
- SHASTA_ASSERT(!vertex.markerInfos.empty());
- const MarkerId firstMarkerId = vertex.markerInfos.front().markerId;
- const CompressedMarker& firstMarker = markers.begin()[firstMarkerId];
- const KmerId kmerId = firstMarker.kmerId;
-
- // Sanity check that all markers have the same kmerId.
- // At some point this can be removed.
- for(const auto& markerInfo: vertex.markerInfos){
- const CompressedMarker& marker = markers.begin()[markerInfo.markerId];
- SHASTA_ASSERT(marker.kmerId == kmerId);
- }
-
- return kmerId;
+ const LocalMarkerGraph0Vertex& vertex = (*this)[v];
+ return markerGraph.getVertexKmerId(vertex.vertexId, k, reads, markers);
}
// Get the repeat counts for a MarkerInfo of a vertex.
-vector<uint8_t> LocalMarkerGraph::getRepeatCounts(
- const LocalMarkerGraphVertex::MarkerInfo& markerInfo) const
+vector<uint8_t> LocalMarkerGraph0::getRepeatCounts(
+ const LocalMarkerGraph0Vertex::MarkerInfo& markerInfo) const
{
if(readRepresentation == 1) {
@@ -132,20 +122,20 @@ vector<uint8_t> LocalMarkerGraph::getRepeatCounts(
// Fill in the ConsensusInfo's for each vertex.
-void LocalMarkerGraph::computeVertexConsensusInfo()
+void LocalMarkerGraph0::computeVertexConsensusInfo()
{
- LocalMarkerGraph& graph = *this;
- BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph) {
+ LocalMarkerGraph0& graph = *this;
+ BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph0) {
computeVertexConsensusInfo(v);
}
}
-void LocalMarkerGraph::computeVertexConsensusInfo( vertex_descriptor v)
+void LocalMarkerGraph0::computeVertexConsensusInfo( vertex_descriptor v)
{
// Short-hands for the graph and the vertex.
- LocalMarkerGraph& graph = *this;
- LocalMarkerGraphVertex& vertex = graph[v];
+ LocalMarkerGraph0& graph = *this;
+ LocalMarkerGraph0Vertex& vertex = graph[v];
// Get the marker k-mer of this vertex.
const KmerId kmerId = graph.getKmerId(v);
@@ -174,22 +164,22 @@ void LocalMarkerGraph::computeVertexConsensusInfo( vertex_descriptor v)
// Store sequence information in the edge.
// This version takes as input a vector of the
-// LocalMarkerGraphEdge::Info that caused the edge to be created.
-void LocalMarkerGraph::storeEdgeInfo(
+// LocalMarkerGraph0Edge::Info that caused the edge to be created.
+void LocalMarkerGraph0::storeEdgeInfo(
edge_descriptor e,
const vector<MarkerInterval>& intervals)
{
- LocalMarkerGraph& graph = *this;
- LocalMarkerGraphEdge& edge = graph[e];
+ LocalMarkerGraph0& graph = *this;
+ LocalMarkerGraph0Edge& edge = graph[e];
// Map to store the oriented read ids and ordinals, grouped by sequence.
- std::map<LocalMarkerGraphEdge::Sequence, vector<MarkerIntervalWithRepeatCounts> > sequenceTable;
+ std::map<LocalMarkerGraph0Edge::Sequence, vector<MarkerIntervalWithRepeatCounts> > sequenceTable;
for(const MarkerInterval& interval: intervals) {
const CompressedMarker& marker0 = markers.begin(interval.orientedReadId.getValue())[interval.ordinals[0]];
const CompressedMarker& marker1 = markers.begin(interval.orientedReadId.getValue())[interval.ordinals[1]];
// Fill in the sequence information and, if necessary, the base repeat counts.
- LocalMarkerGraphEdge::Sequence sequence;
+ LocalMarkerGraph0Edge::Sequence sequence;
MarkerIntervalWithRepeatCounts intervalWithRepeatCounts(interval);
if(marker1.position <= marker0.position + k) {
@@ -265,7 +255,7 @@ void LocalMarkerGraph::storeEdgeInfo(
// Sort by decreasing size of the infos vector.
sort(edge.infos.begin(), edge.infos.end(),
OrderPairsBySizeOfSecondGreater<
- LocalMarkerGraphEdge::Sequence,
+ LocalMarkerGraph0Edge::Sequence,
vector<MarkerIntervalWithRepeatCounts> >());
}
@@ -276,7 +266,7 @@ void LocalMarkerGraph::storeEdgeInfo(
// If found, returns pair(true, ordinal).
// Otherwise, returns pair(false, don't care).
// If more than an ordinal is found, the first one is returned.
-pair<bool, uint32_t> LocalMarkerGraphVertex::getOrdinal(
+pair<bool, uint32_t> LocalMarkerGraph0Vertex::getOrdinal(
OrientedReadId orientedReadId) const
{
for(const MarkerInfo& markerInfo: markerInfos) {
@@ -292,7 +282,7 @@ pair<bool, uint32_t> LocalMarkerGraphVertex::getOrdinal(
// Look for the ordinals for a given oriented read id.
// If found, returns true.
// If more than an ordinal pairs is found, the first one is returned.
-bool LocalMarkerGraphEdge::getOrdinals(
+bool LocalMarkerGraph0Edge::getOrdinals(
OrientedReadId orientedReadId,
array<uint32_t, 2>& ordinals) const
{
@@ -314,12 +304,12 @@ bool LocalMarkerGraphEdge::getOrdinals(
// Approximate topological sort, adding edges
// in order of decreasing coverage. The topological sort
// stored in LocalMarkerGrapg2Vertex::rank.
-void LocalMarkerGraph::approximateTopologicalSort()
+void LocalMarkerGraph0::approximateTopologicalSort()
{
- LocalMarkerGraph& graph = *this;
+ LocalMarkerGraph0& graph = *this;
vector<pair<uint32_t, edge_descriptor> > edgeTable;
- BGL_FORALL_EDGES(e, graph, LocalMarkerGraph) {
+ BGL_FORALL_EDGES(e, graph, LocalMarkerGraph0) {
edgeTable.push_back(make_pair(graph[e].coverage(), e));
}
sort(edgeTable.begin(), edgeTable.end(),
@@ -335,7 +325,7 @@ void LocalMarkerGraph::approximateTopologicalSort()
// Also store the vertices in topological sort order.
vector< pair<size_t, vertex_descriptor> > vertexTable;
- BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph) {
+ BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph0) {
vertexTable.push_back(make_pair(graph[v].rank, v));
}
sort(vertexTable.begin(), vertexTable.end());
diff --git a/src/LocalMarkerGraph.hpp b/src/LocalMarkerGraph0.hpp
index f3b055b..fd38654 100644
--- a/src/LocalMarkerGraph.hpp
+++ b/src/LocalMarkerGraph0.hpp
@@ -1,9 +1,9 @@
-#ifndef SHASTA_LOCAL_MARKER_GRAPH_HPP
-#define SHASTA_LOCAL_MARKER_GRAPH_HPP
+#ifndef SHASTA_LOCAL_MARKER_GRAPH0_HPP
+#define SHASTA_LOCAL_MARKER_GRAPH0_HPP
/*******************************************************************************
-The local marker graph created by class LocalMarkerGraph is a subgraph
+The local marker graph created by class LocalMarkerGraph0 is a subgraph
of the global marker graph, created by starting at a given vertex,
and extending out to a specified distance in both directions.
Distance is number of edges on the global marker graph.
@@ -17,7 +17,7 @@ a group of aligned markers.
#include "AssemblyGraph.hpp"
#include "Coverage.hpp"
#include "Kmer.hpp"
-#include "LocalMarkerGraphRequestParameters.hpp"
+#include "LocalMarkerGraph0RequestParameters.hpp"
#include "MarkerGraph.hpp"
#include "Reads.hpp"
@@ -26,26 +26,25 @@ a group of aligned markers.
namespace shasta {
- class LocalMarkerGraphVertex;
- class LocalMarkerGraphEdge;
- class LocalMarkerGraph;
- using LocalMarkerGraphBaseClass = boost::adjacency_list<
+ class LocalMarkerGraph0Vertex;
+ class LocalMarkerGraph0Edge;
+ class LocalMarkerGraph0;
+ using LocalMarkerGraph0BaseClass = boost::adjacency_list<
boost::listS, // Permit parallel edges created by createMarkerGraphEdgesStrict
boost::listS,
boost::bidirectionalS,
- LocalMarkerGraphVertex,
- LocalMarkerGraphEdge
+ LocalMarkerGraph0Vertex,
+ LocalMarkerGraph0Edge
>;
class CompressedMarker;
class ConsensusCaller;
- class LocalMarkerGraphRequestParameters;
class LongBaseSequences;
}
-class shasta::LocalMarkerGraphVertex {
+class shasta::LocalMarkerGraph0Vertex {
public:
// The global vertex id of the vertex of the global marker
@@ -64,7 +63,7 @@ public:
};
vector<MarkerInfo> markerInfos;
- LocalMarkerGraphVertex(
+ LocalMarkerGraph0Vertex(
MarkerGraph::VertexId vertexId,
uint64_t distance) :
vertexId(vertexId),
@@ -103,7 +102,7 @@ public:
-class shasta::LocalMarkerGraphEdge {
+class shasta::LocalMarkerGraph0Edge {
public:
// Class to describe the intervening sequence between
@@ -182,7 +181,7 @@ public:
// in the assembly graph. However, after detangling a marker
// graph edge can correspond to multiple locations in the
// assembly graph.
- vector< pair<AssemblyGraph::EdgeId, uint32_t> > assemblyGraphLocations;
+ vector< pair<mode0::AssemblyGraph::EdgeId, uint32_t> > assemblyGraphLocations;
// Flag that is set if the edge was removed during
// approximate transitive reduction by flagWeakMarkerGraphEdges.
@@ -219,16 +218,17 @@ public:
-class shasta::LocalMarkerGraph :
- public LocalMarkerGraphBaseClass {
+class shasta::LocalMarkerGraph0 :
+ public LocalMarkerGraph0BaseClass {
public:
- LocalMarkerGraph(
+ LocalMarkerGraph0(
uint64_t readRepresentation,
uint32_t k,
uint64_t assemblyMode,
const Reads& reads,
const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
+ const MarkerGraph&,
const MemoryMapped::Vector<MarkerGraph::CompressedVertexId>& globalMarkerGraphVertex,
const ConsensusCaller&
);
@@ -254,7 +254,7 @@ public:
KmerId getKmerId(vertex_descriptor) const;
// Get the repeat counts for a MarkerInfo of a vertex.
- vector<uint8_t> getRepeatCounts(const LocalMarkerGraphVertex::MarkerInfo&) const;
+ vector<uint8_t> getRepeatCounts(const LocalMarkerGraph0Vertex::MarkerInfo&) const;
// Fill in the ConsensusInfo's for each vertex.
void computeVertexConsensusInfo();
@@ -262,17 +262,17 @@ public:
// Store sequence information in the edge.
// Takes as input a vector of the
- // LocalMarkerGraphEdge::Info that caused the edge to be created.
+ // LocalMarkerGraph0Edge::Info that caused the edge to be created.
void storeEdgeInfo(edge_descriptor, const vector<MarkerInterval>&);
// Write in Graphviz format.
void write(
ostream&,
- const LocalMarkerGraphRequestParameters&) const;
+ const LocalMarkerGraph0RequestParameters&) const;
void write(
const string& fileName,
- const LocalMarkerGraphRequestParameters&) const;
+ const LocalMarkerGraph0RequestParameters&) const;
// Approximate topological sort, adding edges
@@ -301,6 +301,7 @@ private:
// (not just those in this local marker graph).
const Reads& reads;
const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers;
+ const MarkerGraph& markerGraph;
// A reference to the vector containing the global marker graph vertex id
// corresponding to each marker.
@@ -313,16 +314,16 @@ private:
// Class used for graphviz output.
- class Writer : public LocalMarkerGraphRequestParameters {
+ class Writer : public LocalMarkerGraph0RequestParameters {
public:
Writer(
- const LocalMarkerGraph&,
- const LocalMarkerGraphRequestParameters&);
+ const LocalMarkerGraph0&,
+ const LocalMarkerGraph0RequestParameters&);
void operator()(ostream&) const;
void operator()(ostream&, vertex_descriptor) const;
void operator()(ostream&, edge_descriptor) const;
- const LocalMarkerGraph& graph;
+ const LocalMarkerGraph0& graph;
// Vertex and edge colors.
static const string vertexColorZeroDistance;
@@ -341,9 +342,9 @@ private:
static const string edgeLabelColorRemovedAsLowCoverageCrossEdge;
static const string edgeLabelColorNotRemovedNotAssembled;
static const string edgeLabelColorNotRemovedAssembled;
- string vertexColor(const LocalMarkerGraphVertex&) const;
- string edgeArrowColor(const LocalMarkerGraphEdge&) const;
- string edgeLabelColor(const LocalMarkerGraphEdge&) const;
+ string vertexColor(const LocalMarkerGraph0Vertex&) const;
+ string edgeArrowColor(const LocalMarkerGraph0Edge&) const;
+ string edgeLabelColor(const LocalMarkerGraph0Edge&) const;
};
friend class Writer;
diff --git a/src/LocalMarkerGraphRequestParameters.hpp b/src/LocalMarkerGraph0RequestParameters.hpp
index 899f612..bd63457 100644
--- a/src/LocalMarkerGraphRequestParameters.hpp
+++ b/src/LocalMarkerGraph0RequestParameters.hpp
@@ -5,13 +5,13 @@
#include <map>
namespace shasta {
- class LocalMarkerGraphRequestParameters;
+ class LocalMarkerGraph0RequestParameters;
}
// Class describing the parameters in the form
// in the local marker graph page.
-class shasta::LocalMarkerGraphRequestParameters {
+class shasta::LocalMarkerGraph0RequestParameters {
public:
uint64_t vertexId;
diff --git a/src/LocalMarkerGraph1.cpp b/src/LocalMarkerGraph1.cpp
new file mode 100644
index 0000000..1c8b9cf
--- /dev/null
+++ b/src/LocalMarkerGraph1.cpp
@@ -0,0 +1,1067 @@
+// Shasta.
+#include "LocalMarkerGraph1.hpp"
+#include "Base.hpp"
+#include "computeLayout.hpp"
+#include "findLinearChains.hpp"
+#include "html.hpp"
+#include "invalid.hpp"
+#include "Marker.hpp"
+#include "MarkerGraph.hpp"
+#include "MurmurHash2.hpp"
+#include "orderPairs.hpp"
+#include "platformDependent.hpp"
+#include "runCommandWithTimeout.hpp"
+using namespace shasta;
+
+// Boost libraries.
+#include "boost/graph/filtered_graph.hpp"
+#include "boost/graph/iteration_macros.hpp"
+#include <boost/uuid/uuid.hpp>
+#include <boost/uuid/uuid_generators.hpp>
+#include <boost/uuid/uuid_io.hpp>
+
+// Standard library.
+#include "chrono.hpp"
+#include "fstream.hpp"
+#include <queue>
+#include <stack>
+
+
+
+LocalMarkerGraph1::LocalMarkerGraph1(
+ const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
+ const MarkerGraph& markerGraph,
+ MarkerGraphVertexId startVertexId,
+ uint64_t maxDistance,
+ uint64_t minVertexCoverage,
+ uint64_t minEdgeCoverage) :
+ markers(markers),
+ markerGraph(markerGraph),
+ maxDistance(maxDistance)
+{
+ LocalMarkerGraph1& graph = *this;
+
+ // Do a BFS to generate the vertices.
+ // Edges will be created later.
+ const vertex_descriptor vStart = addVertex(startVertexId, 0);
+ std::queue<vertex_descriptor> q;
+ q.push(vStart);
+ while(!q.empty()) {
+
+ // Dequeue a vertex.
+ const vertex_descriptor v0 = q.front();
+ q.pop();
+ const LocalMarkerGraph1Vertex& vertex0 = graph[v0];
+ const MarkerGraphVertexId vertexId0 = vertex0.vertexId;
+ const uint64_t distance0 = vertex0.distance;
+ const uint64_t distance1 = distance0 + 1;
+
+ // Loop over outgoing edges.
+ for(uint64_t edgeId: markerGraph.edgesBySource[vertexId0]) {
+ const auto& edge = markerGraph.edges[edgeId];
+
+ // If coverage is too low, skip it.
+ if(markerGraph.edgeCoverage(edgeId) < minEdgeCoverage) {
+ continue;
+ }
+
+ // Get the target vertex.
+ const MarkerGraph::VertexId vertexId1 = edge.target;
+ SHASTA_ASSERT(edge.source == vertexId0);
+ SHASTA_ASSERT(vertexId1 < markerGraph.vertexCount());
+
+ // If vertex coverage is too low, skip it.
+ if(markerGraph.vertexCoverage(vertexId1) < minVertexCoverage) {
+ continue;
+ }
+
+ // Add this vertex, if we don't already have it.
+ if(not vertexMap.contains(vertexId1)) {
+ const vertex_descriptor v1 = graph.addVertex(vertexId1, distance1);
+
+ // Also enqueue it, unless it is at maximum distance.
+ if(distance1 < maxDistance) {
+ q.push(v1);
+ }
+ }
+ }
+
+ // Loop over incoming edges.
+ for(uint64_t edgeId: markerGraph.edgesByTarget[vertexId0]) {
+ const auto& edge = markerGraph.edges[edgeId];
+
+ // If coverage is too low, skip it.
+ if(markerGraph.edgeCoverage(edgeId) < minEdgeCoverage) {
+ continue;
+ }
+
+ // Get the source vertex.
+ const MarkerGraph::VertexId vertexId1 = edge.source;
+ SHASTA_ASSERT(edge.target == vertexId0);
+ SHASTA_ASSERT(vertexId1 < markerGraph.vertexCount());
+
+ // If vertex coverage is too low, skip it.
+ if(markerGraph.vertexCoverage(vertexId1) < minVertexCoverage) {
+ continue;
+ }
+
+ // Add this vertex, if we don't already have it.
+ if(not vertexMap.contains(vertexId1)) {
+ const vertex_descriptor v1 = graph.addVertex(vertexId1, distance1);
+
+ // Also enqueue it, unless it is at maximum distance.
+ if(distance1 < maxDistance) {
+ q.push(v1);
+ }
+ }
+ }
+ }
+
+
+
+ // Create edges.
+ BGL_FORALL_VERTICES(v0, graph, LocalMarkerGraph1) {
+ const LocalMarkerGraph1Vertex& vertex0 = graph[v0];
+ const MarkerGraphVertexId vertexId0 = vertex0.vertexId;
+
+ for(uint64_t edgeId: markerGraph.edgesBySource[vertexId0]) {
+
+ // If coverage is too low, skip it.
+ if(markerGraph.edgeCoverage(edgeId) < minEdgeCoverage) {
+ continue;
+ }
+ const auto& edge = markerGraph.edges[edgeId];
+
+ const MarkerGraph::VertexId vertexId1 = edge.target;
+ SHASTA_ASSERT(edge.source == vertexId0);
+ SHASTA_ASSERT(vertexId1 < markerGraph.vertexCount());
+
+ // If vertexId1 is in the local marker graph, add this edge.
+ auto it = vertexMap.find(vertexId1);
+ if(it != vertexMap.end()) {
+ const vertex_descriptor v1 = it->second;
+ edge_descriptor e;
+ tie(e, ignore) = add_edge(v0, v1, LocalMarkerGraph1Edge(edgeId), graph);
+ edgeMap.insert({edgeId, e});
+ }
+ }
+ }
+}
+
+
+
+LocalMarkerGraph1::vertex_descriptor LocalMarkerGraph1::addVertex(
+ MarkerGraphVertexId vertexId,
+ uint64_t distance)
+{
+ LocalMarkerGraph1& graph = *this;
+
+ SHASTA_ASSERT(not vertexMap.contains(vertexId));
+ const vertex_descriptor v = add_vertex(LocalMarkerGraph1Vertex(vertexId, distance), graph);
+ vertexMap.insert(make_pair(vertexId, v));
+
+ return v;
+}
+
+
+
+void LocalMarkerGraph1::writeGfa(const string& fileName) const
+{
+ const LocalMarkerGraph1& graph = *this;
+ ofstream gfa(fileName);
+
+ // Write the header.
+ gfa << "H\tVN:Z:1.0\n";
+
+ // Write one segment for each edge.
+ BGL_FORALL_EDGES(e, graph, LocalMarkerGraph1) {
+ const MarkerGraphEdgeId edgeId = graph[e].edgeId;
+ gfa <<
+ "S\t" << edgeId << "\t";
+
+ auto sequence = markerGraph.edgeSequence[edgeId];
+ copy(sequence.begin(), sequence.end(), ostream_iterator<shasta::Base>(gfa));
+
+ // RC is multiplied by sequence length so reports the number of reads
+ // (edge coverage) as depth.
+ gfa <<
+ "\tLN:i:" << sequence.size() <<
+ "\tRC:i:" << sequence.size() * markerGraph.edgeCoverage(edgeId) <<
+ "\n";
+ }
+
+
+
+ // Write the links.
+    // For each vertex, we write links between all pairs of incoming/outgoing edges.
+ BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph1) {
+ BGL_FORALL_INEDGES(v, e0, graph, LocalMarkerGraph1) {
+ const MarkerGraphEdgeId edgeId0 = graph[e0].edgeId;
+ BGL_FORALL_OUTEDGES(v, e1, graph, LocalMarkerGraph1) {
+ const MarkerGraphEdgeId edgeId1 = graph[e1].edgeId;
+ gfa << "L\t" <<
+ edgeId0 << "\t+\t" <<
+ edgeId1 << "\t+\t0M\n";
+ }
+ }
+ }
+}
+
+
+
+void LocalMarkerGraph1::writeHtml0(
+ ostream& html,
+ uint64_t sizePixels,
+ uint64_t quality,
+ double timeout,
+ bool useSvg) const
+{
+ const LocalMarkerGraph1& graph = *this;
+
+ // Compute the layout.
+ std::map<edge_descriptor, double> edgeLengthMap;
+ BGL_FORALL_EDGES(e, graph, LocalMarkerGraph1) {
+ edgeLengthMap.insert(make_pair(e, 1.));
+ }
+ std::map<vertex_descriptor, array<double, 2> > positionMap;
+ // const auto t0 = steady_clock::now();
+ const ComputeLayoutReturnCode returnCode = computeLayoutCustom(
+ graph, edgeLengthMap, positionMap, quality, timeout);
+ // const auto t1 = steady_clock::now();
+ // html << "<br>Graph layout computation took " << seconds(t1 - t0) << "s.";
+ if(returnCode == ComputeLayoutReturnCode::Timeout) {
+ throw runtime_error("Graph layout took too long. "
+ "Increase the timeout or decrease the maximum distance.");
+ }
+ if(returnCode != ComputeLayoutReturnCode::Success) {
+ throw runtime_error("Graph layout failed.");
+ }
+
+ // Find minimum and maximum of x and y.
+ double xMin = std::numeric_limits<double>::max();
+ double xMax = std::numeric_limits<double>::min();
+ double yMin = xMin;
+ double yMax = xMax;
+ for(const auto& p: positionMap) {
+ const auto& xy = p.second;
+ const double x = xy[0];
+ const double y = xy[1];
+ xMin = min(xMin, x);
+ xMax = max(xMax, x);
+ yMin = min(yMin, y);
+ yMax = max(yMax, y);
+ }
+ const double range = max(xMax - xMin, yMax - yMin);
+ const double factor = double(sizePixels) / range;
+
+
+
+ // Gather positions, discretized to integers.
+ // Each of these will generate a pixel.
+ class PixelInfo {
+ public:
+ uint64_t maxCoverage;
+ MarkerGraphVertexId vertexId;
+ };
+ std::map< pair<int64_t, int64_t>, PixelInfo> pixels;
+ for(const auto& p: positionMap) {
+ const vertex_descriptor v = p.first;
+ const auto& xy = p.second;
+ const MarkerGraphVertexId vertexId = graph[v].vertexId;
+ const uint64_t coverage = markerGraph.vertexCoverage(vertexId);
+ const double x = xy[0];
+ const double y = xy[1];
+ const uint64_t ix = int64_t(x * factor);
+ const uint64_t iy = int64_t(y * factor);
+ auto it = pixels.find({ix, iy});
+ if(it == pixels.end()) {
+ pixels.insert(make_pair(make_pair(ix, iy), PixelInfo({coverage, vertexId})));
+ } else {
+ if(coverage > it->second.maxCoverage) {
+ it->second.maxCoverage = coverage;
+ it->second.vertexId = vertexId;
+ }
+ }
+ }
+
+
+
+ // Find minimum and maximum ix, iy.
+ int64_t ixMin = std::numeric_limits<int64_t>::max();
+ int64_t ixMax = std::numeric_limits<int64_t>::min();
+ int64_t iyMin = ixMin;
+ int64_t iyMax = ixMax;
+ for(const auto& pixel :pixels) {
+ const auto& ixy = pixel.first;
+ ixMin = min(ixMin, ixy.first);
+ ixMax = max(ixMax, ixy.first);
+ iyMin = min(iyMin, ixy.second);
+ iyMax = max(iyMax, ixy.second);
+ }
+
+ const int64_t width = ixMax - ixMin + 1;
+ const int64_t height = iyMax - iyMin + 1;
+
+
+
+ if(useSvg) {
+
+ // Display using svg.
+ html << "\n<br><svg width=" << width << " height=" << height << ">";
+ const string coverage1Color = "red";
+ const string coverage2Color = "yellow";
+ const string highCoverageColor = "black";
+
+ for(const auto& pixel :pixels) {
+ const auto& ixy = pixel.first;
+ const uint64_t coverage = pixel.second.maxCoverage;
+ const MarkerGraphVertexId vertexId = pixel.second.vertexId;
+ const int64_t ix = ixy.first - ixMin;
+ SHASTA_ASSERT(ix >= 0);
+ SHASTA_ASSERT(ix < width);
+ const int64_t iy = ixy.second - iyMin;
+ SHASTA_ASSERT(iy >= 0);
+ SHASTA_ASSERT(iy < height);
+
+ string color;
+ if(coverage == 1) {
+ color = coverage1Color;
+ } else if(coverage == 2) {
+ color = coverage2Color;
+ } else {
+ color = highCoverageColor;
+ }
+
+ html <<
+ "\n<a href='"
+ "exploreMarkerGraph1?vertexId=" << vertexId << "&outputType=createAndOpenGfa"
+ "'>"
+ "<line x1=" << ix << " y1=" << iy << " x2=" << ix << " y2=" << iy <<
+ " stroke=" << color << " stroke-width=1px stroke-linecap=square />"
+ "</a>";
+
+ }
+
+
+ html << "</svg>";
+
+
+
+ } else {
+
+ // Display using canvas
+ const array<uint8_t, 3> coverage1Color = {255, 0, 0};
+ const array<uint8_t, 3> coverage2Color = {255, 255, 0};
+ const array<uint8_t, 3> highCoverageColor = {0, 0, 0};
+ html <<
+ "\n<br><canvas id=canvas width=" << width << " height=" << height <<
+ ">"
+ "\n <script>"
+ "\n var canvas = document.getElementById('canvas');"
+ "\n var ctx = canvas.getContext('2d');"
+ "\n var i = ctx.createImageData(" << width << "," << height << ");\n";
+ for(const auto& pixel :pixels) {
+ const auto& ixy = pixel.first;
+ const uint64_t coverage = pixel.second.maxCoverage;
+ const int64_t ix = ixy.first - ixMin;
+ SHASTA_ASSERT(ix >= 0);
+ SHASTA_ASSERT(ix < width);
+ const int64_t iy = ixy.second - iyMin;
+ SHASTA_ASSERT(iy >= 0);
+ SHASTA_ASSERT(iy < height);
+ const uint64_t index = (4 * width) * iy + 4 * ix;
+ if(coverage == 1) {
+ for(uint64_t k=0; k<3; k++) {
+ html << "i.data[" << index+k << "]=" << int(coverage1Color[k]) << ";";
+ }
+ } else if(coverage == 2) {
+ for(uint64_t k=0; k<3; k++) {
+ html << "i.data[" << index+k << "]=" << int(coverage2Color[k]) << ";";
+ }
+ } else {
+ for(uint64_t k=0; k<3; k++) {
+ html << "i.data[" << index+k << "]=" << int(highCoverageColor[k]) << ";";
+ }
+ }
+ html << "i.data[" << index+3 << "]=255;";
+ }
+ html <<
+ "\n ctx.putImageData(i, 0, 0);"
+ "\n </script>";
+ }
+
+}
+
+
+
+void LocalMarkerGraph1::writeHtml1(
+ ostream& html,
+ uint64_t sizePixels,
+ double thicknessScaling,
+ uint64_t quality,
+ double edgeResolution,
+ const string& coloring,
+ uint64_t redCoverage,
+ uint64_t greenCoverage,
+ MarkerGraphEdgeId readFollowingStartEdgeId,
+ int64_t firstMarkerOffset,
+ int64_t lastMarkerOffset,
+ bool showLabels,
+ double timeout) const
+{
+ const LocalMarkerGraph1& graph = *this;
+
+
+
+ // To compute the layout, use an auxiliary graph with a vertex
+ // for each vertex of the LocalMarkerGraph1 plus zero or more vertices
+ // for each edge of the LocalMarkerGraph1.
+ // In this initial implementation we divide each LocalMarkerGraph1 edge into a number
+ // of AuxiliaryGraph edges equal to the number of bases in its sequence.
+ using AuxiliaryGraph = boost::adjacency_list<boost::vecS, boost::vecS, boost::undirectedS>;
+ AuxiliaryGraph auxiliaryGraph;
+
+ // The auxiliary graph vertex corresponding to each vertex of the LocalMarkerGraph1.
+ std::map<vertex_descriptor, AuxiliaryGraph::vertex_descriptor> auxiliaryVertexMap;
+
+ // The auxiliary graph vertices corresponding to each edge of the LocalMarkerGraph1.
+ std::map<edge_descriptor, vector<AuxiliaryGraph::vertex_descriptor> > auxiliaryEdgeMap;
+
+ // The desired length of each edge of the auxiliary graph.
+ std::map<AuxiliaryGraph::edge_descriptor, double> auxiliaryEdgeLengthMap;
+
+ // Create vertices and edges of the AuxiliaryGraph.
+ BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph1) {
+ auxiliaryVertexMap.insert({v, add_vertex(auxiliaryGraph)});
+ }
+ vector<AuxiliaryGraph::vertex_descriptor> auxiliaryVertices;
+ BGL_FORALL_EDGES(e, graph, LocalMarkerGraph1) {
+ const vertex_descriptor v0 = source(e, graph);
+ const vertex_descriptor v1 = target(e, graph);
+ const MarkerGraphEdgeId edgeId = graph[e].edgeId;
+ const uint64_t sequenceLength = markerGraph.edgeSequence[edgeId].size();
+ const uint64_t auxiliaryVertexCount = max(1UL, uint64_t(edgeResolution * double(sequenceLength)));
+ const double edgeLength = double(sequenceLength) / double(auxiliaryVertexCount + 1);
+ auxiliaryVertices.clear();
+ for(uint64_t i=0; i<auxiliaryVertexCount; i++) {
+ auxiliaryVertices.push_back(add_vertex(auxiliaryGraph));
+ }
+ auxiliaryEdgeMap.insert({e, auxiliaryVertices});
+
+ // Add the necessary auxiliary graph edges.
+ AuxiliaryGraph::edge_descriptor ae;
+ if(auxiliaryVertexCount == 0) {
+ tie(ae, ignore) = add_edge(auxiliaryVertexMap[v0], auxiliaryVertexMap[v1], auxiliaryGraph);
+ auxiliaryEdgeLengthMap.insert({ae, edgeLength});
+ } else {
+ tie(ae, ignore) = add_edge(auxiliaryVertexMap[v0], auxiliaryVertices.front(), auxiliaryGraph);
+ auxiliaryEdgeLengthMap.insert({ae, edgeLength});
+ for(uint64_t i=1; i<auxiliaryVertexCount; i++) {
+ tie(ae, ignore) = add_edge(auxiliaryVertices[i-1], auxiliaryVertices[i], auxiliaryGraph);
+ auxiliaryEdgeLengthMap.insert({ae, edgeLength});
+ }
+ tie(ae, ignore) = add_edge(auxiliaryVertices.back(), auxiliaryVertexMap[v1], auxiliaryGraph);
+ auxiliaryEdgeLengthMap.insert({ae, edgeLength});
+ }
+ }
+
+ // Compute the layout of the auxiliary graph.
+ std::map<AuxiliaryGraph::vertex_descriptor, array<double, 2> > positionMap;
+ computeLayoutCustom(auxiliaryGraph, auxiliaryEdgeLengthMap, positionMap, quality, timeout);
+
+
+
+ // If we are doing read following, we need to compute
+ // followed read coverage for each edge.
+ std::map<edge_descriptor, uint64_t> readFollowingCoverageMap;
+ uint64_t readFollowingStartEdgeCoverage = 0;
+ if(coloring == "readFollowing") {
+ readFollowingStartEdgeCoverage = markerGraph.edgeCoverage(readFollowingStartEdgeId);
+
+ // Loop over the MarkerIntervals of the start edge for read following.
+ for(const MarkerInterval& startMarkerInterval:
+ markerGraph.edgeMarkerIntervals[readFollowingStartEdgeId]) {
+ const OrientedReadId orientedReadId = startMarkerInterval.orientedReadId;
+ const int64_t startOrdinal0 = int64_t(startMarkerInterval.ordinals[0]);
+
+ // The number of markers in this oriented read.
+ const int64_t orientedReadMarkerCount = int64_t(markers.size(orientedReadId.getValue()));
+
+ // Get the MarkerId of the first marker of this oriented read.
+ // We can use this later to easily get the MarkerId corresponding to any
+ // marker in the same oriented read.
+ const MarkerId firstOrientedReadMarkerId =
+ markers.begin(orientedReadId.getValue()) - markers.begin();
+
+ // Loop over the requested range of offsets.
+ for(int64_t offset=firstMarkerOffset; offset<=lastMarkerOffset; offset++) {
+ const int64_t ordinal0 = startOrdinal0 + offset;
+ if(ordinal0 < 0) {
+ // This offset takes us before the beginning of this oriented read.
+ continue;
+ }
+ const int64_t ordinal1 = ordinal0 + 1;
+ if(ordinal1 > orientedReadMarkerCount-1) {
+ // This offset takes us past the end of this oriented read.
+ continue;
+ }
+
+ // Find the MarkerIds corresponding to these two ordinals.
+ const MarkerId markerId0 = firstOrientedReadMarkerId + ordinal0;
+ const MarkerId markerId1 = firstOrientedReadMarkerId + ordinal1;
+
+ // Find the corresponding marker graph vertices.
+ // We are using the complete marker graph, so the vertices must exist.
+ const MarkerGraph::CompressedVertexId compressedVertexId0 = markerGraph.vertexTable[markerId0];
+ const MarkerGraph::CompressedVertexId compressedVertexId1 = markerGraph.vertexTable[markerId1];
+ SHASTA_ASSERT(compressedVertexId0 != MarkerGraph::invalidCompressedVertexId);
+ SHASTA_ASSERT(compressedVertexId1 != MarkerGraph::invalidCompressedVertexId);
+ const MarkerGraphVertexId vertexId0 = compressedVertexId0;
+ // const MarkerGraphVertexId vertexId1 = compressedVertexId1;
+
+ // Find the edge vertexId0->vertexId1 that contains the MarkerInterval
+ // with these oriented read and ordinals.
+ MarkerInterval targetMarkerInterval(orientedReadId, uint32_t(ordinal0), uint32_t(ordinal1));
+ MarkerGraphEdgeId edgeId = invalid<MarkerGraphEdgeId>;
+ for(const MarkerGraphEdgeId candidateEdgeId: markerGraph.edgesBySource[vertexId0]) {
+ const auto edgeMarkerIntervals = markerGraph.edgeMarkerIntervals[candidateEdgeId];
+ if(find(edgeMarkerIntervals.begin(), edgeMarkerIntervals.end(), targetMarkerInterval)
+ != edgeMarkerIntervals.end()) {
+ edgeId = candidateEdgeId;
+ break;
+ }
+ }
+ SHASTA_ASSERT(edgeId != invalid<MarkerGraphEdgeId>);
+
+ // cout << orientedReadId << " at offset " << offset << endl;
+
+ // If this edge is in the LocalMarkerGraph1, increment its read following coverage.
+ auto it = edgeMap.find(edgeId);
+ if(it != edgeMap.end()) {
+ const edge_descriptor e = it->second;
+ auto jt = readFollowingCoverageMap.find(e);
+ if(jt == readFollowingCoverageMap.end()){
+ readFollowingCoverageMap.insert({e, 1});
+ // cout << "Added a new entry in the readFollowingCoverageMap." << endl;
+ } else {
+ ++jt->second;
+ // cout << "Incremented readFollowingCoverageMap to " << jt->second << endl;
+ }
+ } else {
+ // cout << "Not found in the edge map." << endl;
+ }
+ }
+ }
+
+ /*
+ for(const auto& p: readFollowingCoverageMap) {
+ const edge_descriptor e = p.first;
+ const uint64_t coverage = p.second;
+ cout << graph[e].edgeId << " " << coverage << endl;
+ }
+ */
+ }
+
+
+
+ // Compute the view box.
+ double xMin = std::numeric_limits<double>::max();
+ double xMax = std::numeric_limits<double>::min();
+ double yMin = xMin;
+ double yMax = xMax;
+ for(const auto& p: positionMap) {
+ const array<double, 2>& xy = p.second;
+ const double x = xy[0];
+ const double y = xy[1];
+ xMin = min(xMin, x);
+ xMax = max(xMax, x);
+ yMin = min(yMin, y);
+ yMax = max(yMax, y);
+ }
+ const double extend = thicknessScaling;
+ xMin -= extend;
+ xMax += extend;
+ yMin -= extend;
+ yMax += extend;
+ const double fontSize = 16. * max(xMax-xMin, yMax-yMin) / double(sizePixels);
+
+ // Make the "arrow" length equal to the desired length of 1 base.
+ const double arrowLength = 1.;
+
+ // Begin the svg.
+ const string svgId = "LocalMarkerGraph1";
+ html << "\n<div style='display: inline-block; vertical-align:top'>"
+ "<br><svg id='" << svgId <<
+ "' width='" << sizePixels <<
+ "' height='" << sizePixels <<
+ "' viewbox='" << xMin << " " << yMin << " " <<
+ xMax - xMin << " " <<
+ yMax - yMin << "'"
+ " font-size='" << fontSize << "' style='border-style:solid;border-color:Black;stroke-linecap:round'"
+ " font-family=monospace"
+ ">\n";
+
+
+
+ // Create a vector to contain edges in the order in which we write them out.
+ // Edges written last are less likely to be superimposed by other edges.
+ vector< pair<edge_descriptor, uint64_t> > allEdges;
+ if(coloring == "readFollowing") {
+ BGL_FORALL_EDGES(e, graph, LocalMarkerGraph1) {
+ uint64_t readFollowingCoverage = 0;
+ auto it = readFollowingCoverageMap.find(e);
+ if(it != readFollowingCoverageMap.end()) {
+ readFollowingCoverage = it->second;
+ }
+ allEdges.push_back({e, readFollowingCoverage});
+ }
+ } else {
+ BGL_FORALL_EDGES(e, graph, LocalMarkerGraph1) {
+ const MarkerGraphEdgeId edgeId = graph[e].edgeId;
+ const uint64_t coverage = markerGraph.edgeCoverage(edgeId);
+ allEdges.push_back({e, coverage});
+ }
+ }
+ sort(allEdges.begin(), allEdges.end(),
+ OrderPairsBySecondOnly<edge_descriptor, uint64_t>());
+
+
+
+ // Write the edges.
+ html << "\n<g id=edges stroke-width='" << thicknessScaling << "'>";
+ for(const auto& p: allEdges) {
+ const edge_descriptor e = p.first;
+ const MarkerGraphEdgeId edgeId = graph[e].edgeId;
+ const uint64_t coverage = markerGraph.edgeCoverage(edgeId);
+ const vertex_descriptor v0 = source(e, graph);
+ const vertex_descriptor v1 = target(e, graph);
+ const auto& p0 = positionMap[auxiliaryVertexMap[v0]];
+ const auto& p1 = positionMap[auxiliaryVertexMap[v1]];
+ const vector<AuxiliaryGraph::vertex_descriptor>& auxiliaryVertices = auxiliaryEdgeMap[e];
+
+ string color;
+ uint64_t readFollowingCoverage = 0;
+ if(coloring == "random") {
+ const uint32_t hue = MurmurHash2(&edgeId, sizeof(edgeId), 231) % 360;
+ color = "hsl(" + to_string(hue) + ",50%,50%)";
+ } else if(coloring == "byCoverage") {
+ if(coverage <= redCoverage) {
+ color = "Red";
+ } else if(coverage >= greenCoverage) {
+ color = "Green";
+ } else {
+ const uint32_t hue = uint32_t(120. *
+ (double(coverage) - double(redCoverage)) / (double(greenCoverage) - double(redCoverage)));
+ color = "hsl(" + to_string(hue) + ",50%,50%)";
+ }
+ } else if(coloring == "readFollowing") {
+ auto it = readFollowingCoverageMap.find(e);
+ if(it == readFollowingCoverageMap.end()) {
+ color = "LightGrey";
+ } else {
+ const uint64_t coverage = it->second;
+ readFollowingCoverage = coverage;
+ if(coverage <= redCoverage) {
+ color = "Red";
+ } else if(coverage >= greenCoverage) {
+ color = "Green";
+ } else {
+ const uint32_t hue = uint32_t(120. *
+ (double(coverage) - double(redCoverage)) / (double(greenCoverage) - double(redCoverage)));
+ color = "hsl(" + to_string(hue) + ",50%,50%)";
+ }
+ }
+ } else {
+ SHASTA_ASSERT(0);
+ }
+ const string properties = "stroke='" + color + "'";
+
+ SHASTA_ASSERT(not auxiliaryVertices.empty());
+
+ // Create a group for this edge.
+ const auto sequence = markerGraph.edgeSequence[edgeId];
+ html << "<g>";
+
+ // Add a title.
+ html <<
+ "<title>Edge " << edgeId << ", coverage " << coverage <<
+ ", " << sequence.size() << " bases: ";
+ copy(sequence.begin(), sequence.end(), ostream_iterator<shasta::Base>(html));
+ if(coloring == "readFollowing") {
+ html << ", read following coverage " << readFollowingCoverage << "/" <<
+ readFollowingStartEdgeCoverage;
+ }
+ html << "</title>";
+
+ // Add a hyperlink.
+ html << "<a href='exploreMarkerGraphEdge?edgeId=" << edgeId << "'>";
+ html << "<g id='Edge-"<< edgeId << "' " << properties << " >";
+
+ // Line from p0 to the first auxiliary vertex.
+ const auto& xyFirst = positionMap[auxiliaryVertices.front()];
+ html << "\n<line x1=" << p0[0] << " y1=" << p0[1] <<
+ " x2=" << xyFirst[0] << " y2=" << xyFirst[1] << " />";
+
+ // Lines between auxiliary vertices.
+ for(uint64_t i=1; i<auxiliaryVertices.size(); i++) {
+ const auto& xyA = positionMap[auxiliaryVertices[i-1]];
+ const auto& xyB = positionMap[auxiliaryVertices[i]];
+ html << "\n<line x1=" << xyA[0] << " y1=" << xyA[1] <<
+ " x2=" << xyB[0] << " y2=" << xyB[1] << " />";
+ }
+
+ // Line from the last auxiliary vertex to p1.
+ const auto& xyLast = positionMap[auxiliaryVertices.back()];
+ html << "\n<line x1=" << xyLast[0] << " y1=" << xyLast[1] <<
+ " x2=" << p1[0] << " y2=" << p1[1] << " />";
+ html << "</g></a>";
+
+ // Label.
+ if(showLabels) {
+ double x, y;
+ if((auxiliaryVertices.size() %2) == 0) {
+ const auto positionA = positionMap[auxiliaryVertices[auxiliaryVertices.size()/2 -1]];
+ const auto positionB = positionMap[auxiliaryVertices[auxiliaryVertices.size()/2]];
+ x = (positionA[0] + positionB[0]) / 2;
+ y = (positionA[1] + positionB[1]) / 2;
+ } else {
+ const auto position = positionMap[auxiliaryVertices[auxiliaryVertices.size()/2]];
+ x = position[0];
+ y = position[1];
+ }
+ html << "<text x='" << x << "' << y='" << y << "' dominant-baseline=middle text-anchor=middle>";
+ copy(sequence.begin(), sequence.end(), ostream_iterator<shasta::Base>(html));
+ html << "</text>";
+ }
+
+ // End the group for this edge.
+ html << "</g>";
+ }
+ html << "\n</g>";
+
+
+
+ // Write the "arrows".
+ html << "\n<g id=arrows stroke-width='" << thicknessScaling/3. << "'>";
+ BGL_FORALL_EDGES(e, graph, LocalMarkerGraph1) {
+ const vertex_descriptor v1 = target(e, graph);
+ const auto& p1 = positionMap[auxiliaryVertexMap[v1]];
+ const vector<AuxiliaryGraph::vertex_descriptor>& auxiliaryVertices = auxiliaryEdgeMap[e];
+ SHASTA_ASSERT(not auxiliaryVertices.empty());
+
+ // Position of the last auxiliary vertex.
+ const auto& xyLast = positionMap[auxiliaryVertices.back()];
+
+ // Draw the "arrow".
+ // We need to compute a unit vector in the direction (p1, xyLast).
+ const double vx = xyLast[0] - p1[0];
+ const double vy = xyLast[1] - p1[1];
+ const double v = sqrt(vx*vx + vy * vy);
+ if(v < 1.e-3) {
+ // Trouble. This can happen if two vertices are very close. Skip the arrow.
+ continue;
+ }
+ const double ux = vx / v;
+ const double uy = vy / v;
+ const double xArrow = p1[0] + ux * arrowLength;
+ const double yArrow = p1[1] + uy * arrowLength;
+ html << "\n<line x1=" << xArrow << " y1=" << yArrow <<
+ " x2=" << p1[0] << " y2=" << p1[1] << " stroke=Black />";
+ }
+ html << "\n</g>";
+
+
+ // Write the vertices.
+ // They can obscure coverage coloring.
+ if(true /*coloring == "random"*/) {
+ html << "\n<g id=vertices stroke-width='" << thicknessScaling << "'>";
+ BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph1) {
+ const auto& p = positionMap[auxiliaryVertexMap[v]];
+ const double x = p[0];
+ const double y = p[1];
+ const string color = (graph[v].distance == maxDistance ? "Grey" : "Black");
+
+ // Create a group for this edge.
+ const MarkerGraphVertexId vertexId = graph[v].vertexId;
+ const uint64_t coverage = markerGraph.vertexCoverage(vertexId);
+ html << "<g><title>Vertex " << vertexId << ", coverage " << coverage;
+ html << "</title>";
+ html << "<a href='exploreMarkerGraphVertex?vertexId=" << vertexId << "'>";
+
+ // Write the vertex.
+ html << "\n<line x1=" << x << " y1=" << y <<
+ " x2=" << x << " y2=" << y << " stroke=" << color << " />";
+
+ // End the group.
+ html << "</a></g>";
+ }
+ html << "\n</g>";
+ }
+
+ // Finish the svg.
+ html << "\n</svg></div>";
+
+ // Add drag and zoom.
+ addSvgDragAndZoom(html);
+
+ // Side panel.
+ html << "<div style='display: inline-block'>";
+
+ // Change thickness
+ html << R"stringDelimiter(
+ <p><table>
+ <tr><th class=left>Thickness<td>
+ <button type='button' onClick='changeThickness(0.1)' style='width:3em'>---</button>
+ <button type='button' onClick='changeThickness(0.5)' style='width:3em'>--</button>
+ <button type='button' onClick='changeThickness(0.8)' style='width:3em'>-</button>
+ <button type='button' onClick='changeThickness(1.25)' style='width:3em'>+</button>
+ <button type='button' onClick='changeThickness(2.)' style='width:3em'>++</button>
+ <button type='button' onClick='changeThickness(10.)' style='width:3em'>+++</button>
+ <script>
+ function changeThickness(factor)
+ {
+ edges = document.getElementById('edges');
+ edges.setAttribute('stroke-width', factor * edges.getAttribute('stroke-width'));
+ vertices = document.getElementById('vertices');
+ vertices.setAttribute('stroke-width', factor * vertices.getAttribute('stroke-width'));
+ arrows = document.getElementById('arrows');
+ arrows.setAttribute('stroke-width', factor * arrows.getAttribute('stroke-width'));
+ }
+ </script>
+ )stringDelimiter";
+
+
+
+ // Zoom buttons.
+ html << R"stringDelimiter(
+ <tr title='Or use the mouse wheel.'><th class=left>Zoom<td>
+ <button type='button' onClick='zoomSvg(0.1)' style='width:3em'>---</button>
+ <button type='button' onClick='zoomSvg(0.5)' style='width:3em'>--</button>
+ <button type='button' onClick='zoomSvg(0.8)' style='width:3em'>-</button>
+ <button type='button' onClick='zoomSvg(1.25)' style='width:3em'>+</button>
+ <button type='button' onClick='zoomSvg(2.)' style='width:3em'>++</button>
+ <button type='button' onClick='zoomSvg(10.)' style='width:3em'>+++</button>
+ )stringDelimiter";
+
+
+
+ // Buttons to highlight an edge and zoom to an edge.
+ html << R"stringDelimiter(
+ <tr><td colspan=2>
+ <button onClick='highlightEdge()'>Highlight</button>
+ <button onClick='zoomToEdge()'>Zoom to</button>edge
+ <input id=selectedEdgeId type=text size=10 style='text-align:center'>
+ <script>
+ function zoomToEdge()
+ {
+ // Get the edge id from the input field.
+ var edgeId = document.getElementById("selectedEdgeId").value;
+ zoomToGivenEdge(edgeId);
+ }
+ function zoomToGivenEdge(edgeId)
+ {
+ var element = document.getElementById("Edge-" + edgeId);
+
+ // Find the bounding box and its center.
+ var box = element.getBBox();
+ var xCenter = box.x + 0.5 * box.width;
+ var yCenter = box.y + 0.5 * box.height;
+
+ // Change the viewbox of the svg to be a bit larger than a square
+ // containing the bounding box.
+ var enlargeFactor = 2.;
+ var size = enlargeFactor * Math.max(box.width, box.height);
+ var factor = size / width;
+ width = size;
+ height = size;
+ x = xCenter - 0.5 * size;
+ y = yCenter - 0.5 * size;
+ var svg = document.querySelector('svg');
+ svg.setAttribute('viewBox', `${x} ${y} ${size} ${size}`);
+ ratio = size / svg.getBoundingClientRect().width;
+ svg.setAttribute('font-size', svg.getAttribute('font-size') * factor);
+
+ }
+ function highlightEdge()
+ {
+ // Get the edge id from the input field.
+ var edgeId = document.getElementById("selectedEdgeId").value;
+ var element = document.getElementById("Edge-" + edgeId);
+
+ element.style.stroke = "Magenta";
+ }
+ </script>
+ )stringDelimiter";
+
+
+
+ // End the side panel.
+ html << "</table></div>";
+}
+
+
+
+void LocalMarkerGraph1::pruneLowCoverageLeaves(uint64_t maxPruneEdgeCoverage)
+{
+ if(maxPruneEdgeCoverage == 0) {
+ return;
+ }
+
+ pruneLowCoverageForwardLeaves(maxPruneEdgeCoverage);
+ pruneLowCoverageBackwardLeaves(maxPruneEdgeCoverage);
+
+}
+
+
+
+void LocalMarkerGraph1::pruneLowCoverageForwardLeaves(uint64_t maxPruneCoverage)
+{
+ LocalMarkerGraph1& graph = *this;
+
+    // Start with all vertices with out-degree 0 and low coverage.
+ std::stack<vertex_descriptor> leaves;
+ BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph1) {
+ const MarkerGraphVertexId vertexId = graph[v].vertexId;
+ const uint64_t coverage = markerGraph.vertexCoverage(vertexId);
+ if(coverage > maxPruneCoverage) {
+ continue;
+ }
+ if(out_degree(v, graph) == 0) {
+ leaves.push(v);
+ }
+ }
+
+ // Main loop. At each iteration we remove a leaf, and add others as required.
+ while(not leaves.empty()) {
+ const vertex_descriptor leaf = leaves.top();
+ leaves.pop();
+
+ // If any parent has out-degree 1 and low coverage,
+ // it becomes a leaf to be removed when we remove this one.
+ BGL_FORALL_INEDGES(leaf, e, graph, LocalMarkerGraph1) {
+ const vertex_descriptor parent = source(e, graph);
+ if(parent == leaf) {
+ continue;
+ }
+ const MarkerGraphVertexId vertexId = graph[parent].vertexId;
+ const uint64_t coverage = markerGraph.vertexCoverage(vertexId);
+ if(coverage > maxPruneCoverage) {
+ continue;
+ }
+ if(out_degree(parent, graph) == 1) {
+ leaves.push(parent);
+ }
+ }
+
+ clear_vertex(leaf, graph);
+ remove_vertex(leaf, graph);
+ }
+}
+
+
+
+void LocalMarkerGraph1::pruneLowCoverageBackwardLeaves(uint64_t maxPruneCoverage)
+{
+ LocalMarkerGraph1& graph = *this;
+
+    // Start with all vertices with in-degree 0 and low coverage.
+ std::stack<vertex_descriptor> leaves;
+ BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph1) {
+ const MarkerGraphVertexId vertexId = graph[v].vertexId;
+ const uint64_t coverage = markerGraph.vertexCoverage(vertexId);
+ if(coverage > maxPruneCoverage) {
+ continue;
+ }
+ if(in_degree(v, graph)==0) {
+ leaves.push(v);
+ }
+ }
+
+ // Main loop. At each iteration we remove a leaf, and add others as required.
+ while(not leaves.empty()) {
+ const vertex_descriptor leaf = leaves.top();
+ leaves.pop();
+
+ // If any child has in-degree 1 and low coverage,
+ // it becomes a leaf to be removed when we remove this one.
+ BGL_FORALL_OUTEDGES(leaf, e, graph, LocalMarkerGraph1) {
+ const vertex_descriptor child = target(e, graph);
+ if(child == leaf) {
+ continue;
+ }
+ const MarkerGraphVertexId vertexId = graph[child].vertexId;
+ const uint64_t coverage = markerGraph.vertexCoverage(vertexId);
+ if(coverage > maxPruneCoverage) {
+ continue;
+ }
+ if(in_degree(child, graph)==1) {
+ leaves.push(child);
+ }
+ }
+
+ clear_vertex(leaf, graph);
+ remove_vertex(leaf, graph);
+ }
+
+}
+
+
+
+void LocalMarkerGraph1::findLowCoverageChains(
+ uint64_t maxChainCoverage,
+ vector< vector<vertex_descriptor> >& chains
+ ) const
+{
+ const LocalMarkerGraph1& graph = *this;
+
+ // Create a filtered graph containing only the vertices
+ // with coverage up to maxChainCoverage.
+ class VertexPredicate {
+ public:
+ VertexPredicate() : graph(0), maxChainCoverage(invalid<uint64_t>) {}
+ VertexPredicate(
+ const LocalMarkerGraph1& graph,
+ uint64_t maxChainCoverage) :
+ graph(&graph),
+ maxChainCoverage(maxChainCoverage)
+ {}
+ const LocalMarkerGraph1* graph;
+ uint64_t maxChainCoverage;
+ bool operator()(const vertex_descriptor v) const
+ {
+ const MarkerGraphVertexId vertexId = (*graph)[v].vertexId;
+ const uint64_t coverage = graph->markerGraph.vertexCoverage(vertexId);
+ return coverage <= maxChainCoverage;
+ }
+ };
+ boost::filtered_graph<LocalMarkerGraph1, boost::keep_all, VertexPredicate>
+ filteredGraph(graph, boost::keep_all(), VertexPredicate(graph, maxChainCoverage));
+
+ // Find linear chains in this filtered graph.
+ findLinearVertexChains(filteredGraph, chains);
+}
+
+
+
+void LocalMarkerGraph1::removeLongLowCoverageChains(
+ uint64_t maxChainCoverage,
+ uint64_t minLength)
+{
+ LocalMarkerGraph1& graph = *this;
+
+ // Find low coverage chains.
+ vector< vector<LocalMarkerGraph1::vertex_descriptor> > lowCoverageChains;
+ findLowCoverageChains(1, lowCoverageChains);
+
+ // Remove the long ones.
+ for(const auto& chain: lowCoverageChains) {
+ if(chain.size() >= minLength) {
+ for(const vertex_descriptor v: chain) {
+ clear_vertex(v, graph);
+ remove_vertex(v, graph);
+ }
+ }
+ }
+
+}
+
diff --git a/src/LocalMarkerGraph1.hpp b/src/LocalMarkerGraph1.hpp
new file mode 100644
index 0000000..0991a3c
--- /dev/null
+++ b/src/LocalMarkerGraph1.hpp
@@ -0,0 +1,134 @@
+#ifndef SHASTA_LOCAL_MARKER_GRAPH1_HPP
+#define SHASTA_LOCAL_MARKER_GRAPH1_HPP
+
+// Shasta.
+#include "shastaTypes.hpp"
+
+// Boost libraries.
+#include <boost/graph/adjacency_list.hpp>
+
+// Standard library.
+#include "iosfwd.hpp"
+#include <map>
+#include "string.hpp"
+#include "vector.hpp"
+
+namespace shasta {
+
+ class LocalMarkerGraph1Vertex;
+ class LocalMarkerGraph1Edge;
+ class LocalMarkerGraph1;
+ using LocalMarkerGraph1BaseClass = boost::adjacency_list<
+ boost::listS,
+ boost::listS,
+ boost::bidirectionalS,
+ LocalMarkerGraph1Vertex,
+ LocalMarkerGraph1Edge
+ >;
+
+ class CompressedMarker;
+ class MarkerGraph;
+ namespace MemoryMapped {
+ template<class T, class Int> class VectorOfVectors;
+ }
+}
+
+
+class shasta::LocalMarkerGraph1Vertex {
+public:
+
+ // The id of the corresponding marker graph vertex.
+ MarkerGraphVertexId vertexId;
+
+ // The distance from the start vertex.
+ uint64_t distance;
+
+ LocalMarkerGraph1Vertex(
+ MarkerGraphVertexId vertexId,
+ uint64_t distance) :
+ vertexId(vertexId),
+ distance(distance)
+ {
+ }
+
+};
+
+
+
+class shasta::LocalMarkerGraph1Edge {
+public:
+
+ // The id of the corresponding marker graph edge.
+ MarkerGraphEdgeId edgeId;
+
+ LocalMarkerGraph1Edge(MarkerGraphEdgeId edgeId) :
+ edgeId(edgeId)
+ {
+ }
+
+};
+
+
+
+class shasta::LocalMarkerGraph1 :
+ public LocalMarkerGraph1BaseClass {
+public:
+
+ LocalMarkerGraph1(
+ const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
+ const MarkerGraph&,
+ MarkerGraphVertexId,
+ uint64_t maxDistance,
+ uint64_t minVertexCoverage,
+ uint64_t minEdgeCoverage
+ );
+
+ const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers;
+ const MarkerGraph& markerGraph;
+ uint64_t maxDistance;
+
+ std::map<MarkerGraphVertexId, vertex_descriptor> vertexMap;
+ std::map<MarkerGraphEdgeId, edge_descriptor> edgeMap;
+ vertex_descriptor addVertex(MarkerGraphVertexId, uint64_t distance);
+
+ void writeGfa(const string& fileName) const;
+ void writeHtml0(
+ ostream&,
+ uint64_t sizePixels,
+ uint64_t quality,
+ double timeout,
+ bool useSvg) const;
+ void writeHtml1(
+ ostream&,
+ uint64_t sizePixels,
+ double thicknessScaling,
+ uint64_t quality,
+ double edgeResolution,
+ const string& coloring,
+ uint64_t redCoverage,
+ uint64_t greenCoverage,
+ MarkerGraphEdgeId readFollowingStartEdgeId,
+ int64_t firstMarkerOffset,
+ int64_t lastMarkerOffset,
+ bool showLabels,
+ double timeout) const;
+
+ void pruneLowCoverageLeaves(uint64_t maxPruneCoverage);
+private:
+ void pruneLowCoverageForwardLeaves(uint64_t maxPruneCoverage);
+ void pruneLowCoverageBackwardLeaves(uint64_t maxPruneCoverage);
+
+public:
+
+ void removeLongLowCoverageChains(
+ uint64_t maxChainCoverage,
+ uint64_t minLength);
+private:
+ void findLowCoverageChains(
+ uint64_t maxChainCoverage,
+ vector< vector<vertex_descriptor> >&
+ ) const;
+
+};
+
+#endif
diff --git a/src/LocalReadGraph.cpp b/src/LocalReadGraph.cpp
index 196edd3..e54195d 100644
--- a/src/LocalReadGraph.cpp
+++ b/src/LocalReadGraph.cpp
@@ -1,6 +1,7 @@
// Shasta.
#include "LocalReadGraph.hpp"
#include "Alignment.hpp"
+#include "Assembler.hpp"
#include "writeGraph.hpp"
using namespace shasta;
@@ -177,9 +178,9 @@ void LocalReadGraph::Writer::operator()(std::ostream& s, edge_descriptor e) cons
// Edge thickness is determined by the number of aligned markers.
s << " penwidth=\"" << edgeThicknessScalingFactor * (1.e-4 * edge.markerCount) << "\"";
- // An edge that crosses strands is drawn dashed.
+ // An edge that crosses strands is drawn purple.
if(edge.crossesStrands) {
- s << " style=dashed";
+ s << " color=purple";
}
s << "]";
@@ -224,6 +225,7 @@ void LocalReadGraph::writeSvg(
double vertexScalingFactor,
double edgeThicknessScalingFactor,
uint64_t maxDistance,
+ const Assembler& assembler,
ostream& svg) const
{
using Graph = LocalReadGraph;
@@ -275,15 +277,42 @@ void LocalReadGraph::writeSvg(
EdgeAttributes attributes;
attributes.thickness = edgeThicknessScalingFactor * 1.e-6 * double(edge.markerCount);
- if(edge.color.empty()) {
- attributes.color = "midnightblue";
+
+ // Extract the uniqueness metric. It is only valid for alignment method 5.
+        // In all other cases it is a signaling NaN.
+ // If the uniqueness metric is available, use it to color the edge.
+ const uint64_t globalEdgeId = edge.globalEdgeId;
+ const ReadGraphEdge& globalEdge = assembler.readGraph.edges[globalEdgeId];
+ const uint64_t alignmentId = globalEdge.alignmentId;
+ const AlignmentData& alignmentData = assembler.alignmentData[alignmentId];
+ const AlignmentInfo& alignmentInfo = alignmentData.info;
+
+ // Set the edge color.
+ if(false /* not std::isnan(alignmentInfo.uniquenessMetric) */) {
+ const float red = 1.;
+ const float green = 5.;
+ if(alignmentInfo.uniquenessMetric <= red) {
+ attributes.color = "red";
+ } else if(alignmentInfo.uniquenessMetric >= green) {
+ attributes.color = "green";
+ } else {
+ const uint64_t h = uint64_t(std::round(alignmentInfo.uniquenessMetric - red) * 120. / (green - red));
+ attributes.color = "hsl(" + to_string(h) + ",100%,50%)";
+ }
} else {
- attributes.color = edge.color;
+ if(edge.color.empty()) {
+ attributes.color = "midnightblue";
+ } else {
+ attributes.color = edge.color;
+ }
}
attributes.tooltip = vertex0.orientedReadId.getString() + " " +
vertex1.orientedReadId.getString() +
", " + to_string(edge.markerCount) + " aligned markers";
+ if(not std::isnan(alignmentInfo.uniquenessMetric)) {
+ attributes.tooltip += ", uniqueness metric " + to_string(alignmentInfo.uniquenessMetric);
+ }
edgeAttributes.insert(make_pair(e, attributes));
}
diff --git a/src/LocalReadGraph.hpp b/src/LocalReadGraph.hpp
index 50f9cdc..5898152 100644
--- a/src/LocalReadGraph.hpp
+++ b/src/LocalReadGraph.hpp
@@ -44,6 +44,8 @@ namespace shasta {
{};
enum class AlignmentType;
+
+ class Assembler;
}
@@ -158,6 +160,7 @@ public:
double vertexScalingFactor,
double edgeThicknessScalingFactor,
uint64_t maxDistance,
+ const Assembler&,
ostream& svg) const;
// Write in Graphviz format.
diff --git a/src/LowHash0.cpp b/src/LowHash0.cpp
index a09ed36..b5e6a5c 100644
--- a/src/LowHash0.cpp
+++ b/src/LowHash0.cpp
@@ -33,9 +33,8 @@ LowHash0::LowHash0(
size_t maxBucketSize, // The maximum size for a bucket to be used.
size_t minFrequency, // Minimum number of minHash hits for a pair to be considered a candidate.
size_t threadCountArgument,
- const MemoryMapped::Vector<KmerInfo>& kmerTable,
const Reads& reads,
- const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
+ const MemoryMapped::VectorOfVectors<KmerId, uint64_t>& kmerIds,
MemoryMapped::Vector<OrientedReadPair>& candidateAlignments,
MemoryMapped::Vector< array<uint64_t, 3> >& readLowHashStatistics,
const string& largeDataFileNamePrefix,
@@ -48,9 +47,8 @@ LowHash0::LowHash0(
maxBucketSize(maxBucketSize),
minFrequency(minFrequency),
threadCount(threadCountArgument),
- kmerTable(kmerTable),
reads(reads),
- markers(markers),
+ kmerIds(kmerIds),
readLowHashStatistics(readLowHashStatistics),
largeDataFileNamePrefix(largeDataFileNamePrefix),
largeDataPageSize(largeDataPageSize),
@@ -71,7 +69,7 @@ LowHash0::LowHash0(
// and each feature generates a low hash with probability hashFraction.
// So an estimate of the total number of hashes is:
const uint64_t totalLowHashCountEstimate =
- uint64_t(hashFraction * double(markers.totalSize()));
+ uint64_t(hashFraction * double(kmerIds.totalSize()));
const uint32_t leadingZeroBitCount = uint32_t(__builtin_clzl(totalLowHashCountEstimate));
const uint32_t log2TotalLowHashCountEstimate = 64 - leadingZeroBitCount;
@@ -98,18 +96,11 @@ LowHash0::LowHash0(
cout << " = " << bucketCount << " buckets. "<< endl;
-
-
- // Create vectors containing only the k-mer ids of all markers.
- // This is used to speed up the computation of hash functions.
- performanceLog << timestamp << "Creating kmer ids for oriented reads." << endl;
- createKmerIds();
-
// Compute the threshold for a hash value to be considered low.
hashThreshold = uint64_t(double(hashFraction) * double(std::numeric_limits<uint64_t>::max()));
// The number of oriented reads, each with its own vector of markers.
- const OrientedReadId::Int orientedReadCount = OrientedReadId::Int(markers.size());
+ const OrientedReadId::Int orientedReadCount = OrientedReadId::Int(kmerIds.size());
const ReadId readCount = orientedReadCount / 2;
@@ -127,6 +118,9 @@ LowHash0::LowHash0(
// Write the header of the histogram file.
histogramCsv << "Iteration,BucketSize,BucketCount,FeatureCount\n";
+ // If minBucketSize and maxBucketSize are both zero,
+ // they are chosen automatically for each iteration.
+ const bool dynamicMinMaxBucketSizes = ((minBucketSize == 0) and (maxBucketSize == 0));
// LowHash0 iteration loop.
@@ -174,7 +168,16 @@ LowHash0::LowHash0(
setupLoadBalancing(readCount, batchSize);
runThreads(&LowHash0::pass2ThreadFunction, threadCount);
buckets.endPass2(false, false);
- computeBucketHistogram();
+
+ // Compute a histogram of bucket size.
+ vector<uint64_t> bucketHistogram;
+ computeBucketHistogram(bucketHistogram);
+
+ // If dynamic adjustment of min/max bucket size was requested,
+ // do it now for this iteration, based on the current bucket size histogram.
+ if(dynamicMinMaxBucketSizes) {
+ adjustMinMaxBucketSizes(bucketHistogram);
+ }
// Pass 3: inspect the buckets to find candidates.
batchSize = 10000;
@@ -223,7 +226,7 @@ LowHash0::LowHash0(
for(ReadId readId=0; readId<readCount; readId++) {
const array<uint64_t, 3>& counters = readLowHashStatistics[readId];
const uint64_t total = std::accumulate(counters.begin(), counters.end(), 0);
- const uint64_t featureCount = markers.size(OrientedReadId(readId, 0).getValue()) - (m-1);
+ const uint64_t featureCount = kmerIds.size(OrientedReadId(readId, 0).getValue()) - (m-1);
const double featureSampling = double(total) / double(featureCount);
csv << readId << ",";
csv << (reads.getFlags(readId).isPalindromic ? "Yes," : "No,");
@@ -242,13 +245,8 @@ LowHash0::LowHash0(
}
}
-
-
- // Clean up work areas.
+    // Clean up.
buckets.remove();
- kmerIds.remove();
-
-
// Done.
const auto tEnd = steady_clock::now();
@@ -258,57 +256,6 @@ LowHash0::LowHash0(
-void LowHash0::createKmerIds()
-{
- kmerIds.createNew(
- largeDataFileNamePrefix.empty() ? "" : (largeDataFileNamePrefix + "tmp-LowHash0-Markers"),
- largeDataPageSize);
- const ReadId orientedReadCount = ReadId(markers.size());
- const ReadId readCount = orientedReadCount / 2;
- kmerIds.beginPass1(orientedReadCount);
- for(ReadId readId=0; readId!=readCount; readId++) {
- for(Strand strand=0; strand<2; strand++) {
- const OrientedReadId orientedReadId(readId, strand);
- const auto markerCount = markers.size(orientedReadId.getValue());
- kmerIds.incrementCount(orientedReadId.getValue(), markerCount);
- }
- }
- kmerIds.beginPass2();
- kmerIds.endPass2(false);
- const size_t batchSize = 10000;
- setupLoadBalancing(readCount, batchSize);
- runThreads(&LowHash0::createKmerIds, threadCount);
-}
-
-
-
-// Thread function for createKmerIds.
-void LowHash0::createKmerIds(size_t threadId)
-{
-
- // Loop over batches assigned to this thread.
- uint64_t begin, end;
- while(getNextBatch(begin, end)) {
-
- // Loop over reads assigned to this batch.
- for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) {
- for(Strand strand=0; strand<2; strand++) {
- const OrientedReadId orientedReadId(readId, strand);
- const auto orientedReadMarkers = markers[orientedReadId.getValue()];
-
- SHASTA_ASSERT(kmerIds.size(orientedReadId.getValue()) == orientedReadMarkers.size());
-
- auto pointer = kmerIds.begin(orientedReadId.getValue());
- for(const CompressedMarker& marker: orientedReadMarkers) {
- *pointer++ = marker.kmerId;
- }
- }
- }
- }
-}
-
-
-
// Pass1: compute the low hashes for each oriented read
// and prepare the buckets for filling.
void LowHash0::pass1ThreadFunction(size_t threadId)
@@ -322,7 +269,11 @@ void LowHash0::pass1ThreadFunction(size_t threadId)
// Loop over oriented reads assigned to this batch.
for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) {
- if(reads.getFlags(readId).isPalindromic) {
+ const ReadFlags& flags = reads.getFlags(readId);
+ if(flags.discardDueToDuplicates) {
+ continue;
+ }
+ if(flags.isPalindromic) {
continue;
}
for(Strand strand=0; strand<2; strand++) {
@@ -340,7 +291,7 @@ void LowHash0::pass1ThreadFunction(size_t threadId)
// Get the markers for this oriented read.
- KmerId* kmerIdsPointer = kmerIds.begin(orientedReadId.getValue());
+ const KmerId* kmerIdsPointer = kmerIds.begin(orientedReadId.getValue());
const size_t featureCount = markerCount - m + 1;
// Loop over features of this oriented read.
@@ -371,7 +322,11 @@ void LowHash0::pass2ThreadFunction(size_t threadId)
// Loop over oriented reads assigned to this batch.
for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) {
- if(reads.getFlags(readId).isPalindromic) {
+ const ReadFlags& flags = reads.getFlags(readId);
+ if(flags.discardDueToDuplicates) {
+ continue;
+ }
+ if(flags.isPalindromic) {
continue;
}
for(Strand strand=0; strand<2; strand++) {
@@ -563,7 +518,7 @@ void LowHash0::merge(
-void LowHash0::computeBucketHistogram()
+void LowHash0::computeBucketHistogram(vector<uint64_t>& bucketHistogram)
{
threadBucketHistogram.clear();
threadBucketHistogram.resize(threadCount);
@@ -576,7 +531,8 @@ void LowHash0::computeBucketHistogram()
for(const vector<uint64_t>& histogram: threadBucketHistogram) {
largestBucketSize = max(largestBucketSize, uint64_t(histogram.size()));
}
- vector<uint64_t> bucketHistogram(largestBucketSize, 0);
+ bucketHistogram.clear();
+ bucketHistogram.resize(largestBucketSize, 0);
for(const vector<uint64_t>& histogram: threadBucketHistogram) {
for(uint64_t bucketSize=0; bucketSize<histogram.size(); bucketSize++) {
bucketHistogram[bucketSize] += histogram[bucketSize];
@@ -611,3 +567,37 @@ void LowHash0::computeBucketHistogramThreadFunction(size_t threadId)
}
}
}
+
+
+
+// Adjust minBucketSize and maxBucketSize based on the current
+// bucket size histogram.
+void LowHash0::adjustMinMaxBucketSizes(const vector<uint64_t>& histogram)
+{
+ // Set minBucketSize to the lowest bucket size B0
+ // such that histogram[B0] > histogram[B0-1].
+ bool done = false;
+ for(uint64_t B0=1; B0<histogram.size(); B0++) {
+ if(histogram[B0] > histogram[B0 - 1]) {
+ minBucketSize = B0;
+ done = true;
+ break;
+ }
+ }
+ SHASTA_ASSERT(done);
+
+ // Set maxBucketSize to the largest bucket size B1 such that histogram[B1] >= histogram[B0] = histogram[minBucketSize]
+ done = false;
+ for(uint64_t B1=histogram.size()-1; B1>=minBucketSize; B1--) {
+ if(histogram[B1] >= histogram[minBucketSize]) {
+ maxBucketSize = B1;
+ done = true;
+ break;
+ }
+ }
+ SHASTA_ASSERT(done);
+
+ cout << "Automatic settings for this iteration: minBucketSize " << minBucketSize <<
+ ", maxBucketSize " << maxBucketSize << endl;
+}
+
diff --git a/src/LowHash0.hpp b/src/LowHash0.hpp
index 6bd28ce..3527e93 100644
--- a/src/LowHash0.hpp
+++ b/src/LowHash0.hpp
@@ -42,9 +42,8 @@ public:
size_t maxBucketSize, // The maximum size for a bucket to be used.
size_t minFrequency, // Minimum number of minHash hits for a pair to be considered a candidate.
size_t threadCount,
- const MemoryMapped::Vector<KmerInfo>& kmerTable,
const Reads& reads,
- const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>&,
+ const MemoryMapped::VectorOfVectors<KmerId, uint64_t>& kmerIds,
MemoryMapped::Vector<OrientedReadPair>&,
MemoryMapped::Vector< array<uint64_t, 3> >& readLowHashStatistics,
const string& largeDataFileNamePrefix,
@@ -60,21 +59,12 @@ private:
size_t maxBucketSize; // The maximum size for a bucket to be used.
size_t minFrequency; // Minimum number of minHash hits for a pair to be considered a candidate.
size_t threadCount;
- const MemoryMapped::Vector<KmerInfo>& kmerTable;
const Reads& reads;
- const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers;
+ const MemoryMapped::VectorOfVectors<KmerId, uint64_t>& kmerIds;
MemoryMapped::Vector< array<uint64_t, 3> > &readLowHashStatistics;
const string& largeDataFileNamePrefix;
size_t largeDataPageSize;
- // Vectors containing only the k-mer ids of all markers
- // for all oriented reads.
- // Indexed by OrientedReadId.getValue().
- // This is used to speed up the computation of hash functions.
- MemoryMapped::VectorOfVectors<KmerId, uint64_t> kmerIds;
- void createKmerIds();
- void createKmerIds(size_t threadId);
-
// The current MinHash iteration.
// This is used to compute a different MurmurHash function
// at each iteration.
@@ -181,12 +171,14 @@ private:
// Compute a histogram of the number of entries in each histogram.
- void computeBucketHistogram();
+ void computeBucketHistogram(vector<uint64_t>& bucketHistogram);
void computeBucketHistogramThreadFunction(size_t threadId);
vector< vector<uint64_t> > threadBucketHistogram;
ofstream histogramCsv;
-
+ // Adjust minBucketSize and maxBucketSize based on the current
+ // bucket size histogram.
+ void adjustMinMaxBucketSizes(const vector<uint64_t>& bucketHistogram);
// Thread functions.
diff --git a/src/LowHash1.cpp b/src/LowHash1.cpp
deleted file mode 100644
index 15ecd27..0000000
--- a/src/LowHash1.cpp
+++ /dev/null
@@ -1,685 +0,0 @@
-// Shasta.
-#include "LowHash1.hpp"
-#include "AlignmentCandidates.hpp"
-#include "Marker.hpp"
-#include "MurmurHash2.hpp"
-using namespace shasta;
-
-// Standad library.
-#include "algorithm.hpp"
-#include "chrono.hpp"
-
-#include "MultithreadedObject.tpp"
-template class MultithreadedObject<LowHash1>;
-
-
-LowHash1::LowHash1(
- size_t m, // Number of consecutive markers that define a feature.
- double hashFraction,
- size_t minHashIterationCount, // Number of minHash iterations.
- size_t log2MinHashBucketCount, // Base 2 log of number of buckets for minHash.
- size_t minBucketSize, // The minimum size for a bucket to be used.
- size_t maxBucketSize, // The maximum size for a bucket to be used.
- size_t minFrequency, // Minimum number of minHash hits for a pair to be considered a candidate.
- size_t threadCountArgument,
- const MemoryMapped::Vector<KmerInfo>& kmerTable,
- const Reads& reads,
- const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
- AlignmentCandidates& candidates,
- const string& largeDataFileNamePrefix,
- size_t largeDataPageSize
- ) :
- MultithreadedObject(*this),
- m(m),
- hashFraction(hashFraction),
- minBucketSize(minBucketSize),
- maxBucketSize(maxBucketSize),
- minFrequency(minFrequency),
- threadCount(threadCountArgument),
- kmerTable(kmerTable),
- reads(reads),
- markers(markers),
- candidates(candidates),
- largeDataFileNamePrefix(largeDataFileNamePrefix),
- largeDataPageSize(largeDataPageSize),
- histogramCsv("LowHashBucketHistogram.csv")
-
-{
- cout << timestamp << "LowHash1 begins." << endl;
- const auto tBegin = steady_clock::now();
-
- // Adjust the numbers of threads, if necessary.
- if(threadCount == 0) {
- threadCount = std::thread::hardware_concurrency();
- }
-
- // Estimate the total number of low hashes and its base 2 log.
- // Except for very short reads, each marker generates a feature,
- // and each feature generates a low hash with probability hashFraction.
- // So an estimate of the total number of hashes is:
- const uint64_t totalLowHashCountEstimate =
- uint64_t(hashFraction * double(markers.totalSize()));
- const uint64_t leadingZeroBitCount = __builtin_clzl(totalLowHashCountEstimate);
- const uint64_t log2TotalLowHashCountEstimate = 64 - leadingZeroBitCount;
-
- // If log2MinHashBucketCount is 0, choose a reasonable value
- // for the current number of reads.
- // Otherwise, check that log2MinHashBucketCount is not unreasonably small.
- if(log2MinHashBucketCount == 0) {
- log2MinHashBucketCount = 5 + log2TotalLowHashCountEstimate;
- } else {
- if(log2MinHashBucketCount < log2TotalLowHashCountEstimate) {
- throw runtime_error("LowHash1: log2MinHashBucketCount is unreasonably small.");
- }
- }
-
- // Set the number of buckets and the corresponding mask.
- const uint64_t bucketCount = 1ULL << log2MinHashBucketCount;
- mask = bucketCount - 1;
- cout << "LowHash1 algorithm will use 2^" << log2MinHashBucketCount;
- cout << " = " << bucketCount << " buckets. "<< endl;
- cout << "Estimated number of low hashes per iteration " << totalLowHashCountEstimate << endl;
- cout << "Estimated load factor " << double(totalLowHashCountEstimate)/double(bucketCount) << endl;
-
- // Create vectors containing only the k-mer ids of all markers.
- // This is used to speed up the computation of hash functions.
- cout << timestamp << "Creating kmer ids for oriented reads." << endl;
- createKmerIds();
-
- // Compute the threshold for a hash value to be considered low.
- hashThreshold = uint64_t(hashFraction * double(std::numeric_limits<uint64_t>::max()));
-
- // The number of oriented reads, each with its own vector of markers.
- const OrientedReadId::Int orientedReadCount = OrientedReadId::Int(markers.size());
- const ReadId readCount = orientedReadCount / 2;
- SHASTA_ASSERT(orientedReadCount == 2*readCount);
-
- // Set up work areas.
- buckets.createNew(
- largeDataFileNamePrefix.empty() ? "" : (largeDataFileNamePrefix + "tmp-LowHash-Buckets"),
- largeDataPageSize);
- lowHashes.resize(orientedReadCount);
- threadCommonFeatures.resize(threadCount);
- for(size_t threadId=0; threadId!=threadCount; threadId++) {
- threadCommonFeatures[threadId] = make_shared<MemoryMapped::Vector<CommonFeature> >();
- threadCommonFeatures[threadId]->createNew(
- largeDataFileNamePrefix.empty() ? "" :
- (largeDataFileNamePrefix + "tmp-LowHash-ThreadCommonFeatures-" + to_string(threadId)),
- largeDataPageSize);
- }
-
- // Write the header of the histogram file.
- histogramCsv << "Iteration,BucketSize,BucketCount,FeatureCount\n";
-
- // LowHash iteration loop.
- for(iteration=0; iteration<minHashIterationCount; iteration++) {
- cout << timestamp << "LowHash iteration " << iteration << " begins." << endl;
-
- // Compute the low hashes for each oriented read
- // and count the number of low hash features in each bucket.
- buckets.clear();
- buckets.beginPass1(bucketCount);
- size_t batchSize = 10000;
- setupLoadBalancing(readCount, batchSize);
- runThreads(&LowHash1::computeHashesThreadFunction, threadCount);
-
- // Fill the buckets.
- buckets.beginPass2();
- setupLoadBalancing(readCount, batchSize);
- runThreads(&LowHash1::fillBucketsThreadFunction, threadCount);
- buckets.endPass2(false, false);
- cout << "Load factor at this iteration " <<
- double(buckets.totalSize()) / double(buckets.size()) << endl;
- computeBucketHistogram();
-
- // Scan the buckets to find common features.
- // Each thread stores the common features it finds in its own vector.
- const uint64_t oldCommonFeatureCount = countTotalThreadCommonFeatures();
- batchSize = 10000;
- setupLoadBalancing(bucketCount, batchSize);
- runThreads(&LowHash1::scanBucketsThreadFunction, threadCount);
- const uint64_t newCommonFeatureCount = countTotalThreadCommonFeatures();
- cout << "Stored " << newCommonFeatureCount-oldCommonFeatureCount <<
- " common features at this iteration." << endl;
- }
-
- // Gather together all the common features found by all threads.
- cout << timestamp << "Gathering common features found by all threads." << endl;
- gatherCommonFeatures();
- cout << timestamp << "Total number of common features including duplicates is " <<
- commonFeatures.totalSize() << endl;
-
- // We no longer need the common features by thread.
- for(size_t threadId=0; threadId!=threadCount; threadId++) {
- threadCommonFeatures[threadId]->remove();
- threadCommonFeatures[threadId] = 0;
- }
- threadCommonFeatures.clear();
-
- // Process the common features.
- // For each orientedReadId0, we look at all the CommonFeatureInfo we have
- // and sort them by orientedReadId1, then by ordinals, and remove duplicates.
- // We then find groups of at least minFrequency common features involving the
- // same pair(orientedReadId0, orientedReadId1)
- cout << timestamp << "Processing the common features we found." << endl;
- processCommonFeatures();
-
- // Clean up.
- buckets.remove();
- kmerIds.remove();
- lowHashes.clear();
- commonFeatures.remove();
-
- // Done.
- const auto tEnd = steady_clock::now();
- const double tTotal = seconds(tEnd - tBegin);
- cout << timestamp << "LowHash1 completed in " << tTotal << " s." << endl;
-}
-
-
-
-void LowHash1::createKmerIds()
-{
- kmerIds.createNew(
- largeDataFileNamePrefix.empty() ? "" : (largeDataFileNamePrefix + "tmp-LowHash-Markers"),
- largeDataPageSize);
- const ReadId orientedReadCount = ReadId(markers.size());
- const ReadId readCount = orientedReadCount / 2;
- kmerIds.beginPass1(orientedReadCount);
- for(ReadId readId=0; readId!=readCount; readId++) {
- for(Strand strand=0; strand<2; strand++) {
- const OrientedReadId orientedReadId(readId, strand);
- const auto markerCount = markers.size(orientedReadId.getValue());
- kmerIds.incrementCount(orientedReadId.getValue(), markerCount);
- }
- }
- kmerIds.beginPass2();
- kmerIds.endPass2(false);
- const size_t batchSize = 10000;
- setupLoadBalancing(readCount, batchSize);
- runThreads(&LowHash1::createKmerIds, threadCount);
-}
-
-
-
-// Thread function for createKmerIds.
-void LowHash1::createKmerIds(size_t threadId)
-{
-
- // Loop over batches assigned to this thread.
- uint64_t begin, end;
- while(getNextBatch(begin, end)) {
-
- // Loop over reads assigned to this batch.
- for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) {
- for(Strand strand=0; strand<2; strand++) {
- const OrientedReadId orientedReadId(readId, strand);
- const auto orientedReadMarkers = markers[orientedReadId.getValue()];
-
- SHASTA_ASSERT(kmerIds.size(orientedReadId.getValue()) == orientedReadMarkers.size());
-
- auto pointer = kmerIds.begin(orientedReadId.getValue());
- for(const CompressedMarker& marker: orientedReadMarkers) {
- *pointer++ = marker.kmerId;
- }
- }
- }
- }
-}
-
-
-
-// Thread function to compute the low hashes for each oriented read
-// and count the number of entries in each bucket.
-void LowHash1::computeHashesThreadFunction(size_t threadId)
-{
- const int featureByteCount = int(m * sizeof(KmerId));
- const uint64_t seed = iteration * 37;
-
- // Loop over batches assigned to this thread.
- uint64_t begin, end;
- while(getNextBatch(begin, end)) {
-
- // Loop over oriented reads assigned to this batch.
- for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) {
- if(reads.getFlags(readId).isPalindromic) {
- continue;
- }
- for(Strand strand=0; strand<2; strand++) {
- const OrientedReadId orientedReadId(readId, strand);
-
- vector< pair<uint64_t, uint32_t> >& orientedReadLowHashes = lowHashes[orientedReadId.getValue()];
- orientedReadLowHashes.clear();
- const size_t markerCount = kmerIds.size(orientedReadId.getValue());
-
- // Handle the pathological case where there are fewer than m markers.
- // This oriented read ends up in no bucket.
- if(markerCount < m) {
- continue;
- }
-
- // Get the markers for this oriented read.
- KmerId* kmerIdsPointer = kmerIds.begin(orientedReadId.getValue());
- const size_t featureCount = markerCount - m + 1;
-
- // Loop over features of this oriented read.
- // Features are sequences of m consecutive markers.
- for(size_t j=0; j<featureCount; j++, kmerIdsPointer++) {
- const uint64_t hash = MurmurHash64A(kmerIdsPointer, featureByteCount, seed);
- if(hash < hashThreshold) {
- orientedReadLowHashes.push_back(make_pair(hash, j));
- const uint64_t bucketId = hash & mask;
- buckets.incrementCountMultithreaded(bucketId);
- }
- }
- }
- }
- }
-
-}
-
-
-
-// Thread function to fill the buckets.
-void LowHash1::fillBucketsThreadFunction(size_t threadId)
-{
-
- // Loop over batches assigned to this thread.
- uint64_t begin, end;
- while(getNextBatch(begin, end)) {
-
- // Loop over oriented reads assigned to this batch.
- for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) {
- if(reads.getFlags(readId).isPalindromic) {
- continue;
- }
- for(Strand strand=0; strand<2; strand++) {
- const OrientedReadId orientedReadId(readId, strand);
- const vector< pair<uint64_t, uint32_t> > & orientedReadLowHashes = lowHashes[orientedReadId.getValue()];
-
- for(const auto& p: orientedReadLowHashes) {
- const uint64_t hash = p.first;
- const uint64_t bucketId = hash & mask;
- const uint32_t ordinal = p.second;
- buckets.storeMultithreaded(bucketId, BucketEntry(orientedReadId, ordinal));
- }
- }
- }
- }
-}
-
-
-
-void LowHash1::computeBucketHistogram()
-{
- threadBucketHistogram.clear();
- threadBucketHistogram.resize(threadCount);
- const uint64_t batchSize = 10000;
- setupLoadBalancing(buckets.size(), batchSize);
- runThreads(&LowHash1::computeBucketHistogramThreadFunction, threadCount);
-
- // Combine the histograms found by each thread.
- uint64_t largestBucketSize = 0;
- for(const vector<uint64_t>& histogram: threadBucketHistogram) {
- largestBucketSize = max(largestBucketSize, uint64_t(histogram.size()));
- }
- vector<uint64_t> bucketHistogram(largestBucketSize, 0);
- for(const vector<uint64_t>& histogram: threadBucketHistogram) {
- for(uint64_t bucketSize=0; bucketSize<histogram.size(); bucketSize++) {
- bucketHistogram[bucketSize] += histogram[bucketSize];
- }
- }
-
- for(uint64_t bucketSize=0; bucketSize<bucketHistogram.size(); bucketSize++) {
- const uint64_t frequency = bucketHistogram[bucketSize];
- if(frequency) {
- histogramCsv <<
- iteration << "," <<
- bucketSize << "," <<
- frequency << "," <<
- bucketSize*frequency << "\n";
- }
- }
-
-
-}
-void LowHash1::computeBucketHistogramThreadFunction(size_t threadId)
-{
- vector<uint64_t>& histogram = threadBucketHistogram[threadId];
- histogram.clear();
- uint64_t begin, end;
- while(getNextBatch(begin, end)) {
- for(uint64_t bucketId=begin; bucketId!=end; bucketId++) {
- const uint64_t bucketSize = buckets.size(bucketId);
- if(bucketSize >= histogram.size()) {
- histogram.resize(bucketSize + 1, 0);
- }
- ++histogram[bucketSize];
- }
- }
-}
-
-
-
-// Thread function to scan the buckets to find common features.
-void LowHash1::scanBucketsThreadFunction(size_t threadId)
-{
- // Access the vector where this thread will store
- // the common features it finds.
- MemoryMapped::Vector<CommonFeature>& commonFeatures = *threadCommonFeatures[threadId];
-
- const uint64_t mLocal = uint64_t(m);
-
- // Loop over batches assigned to this thread.
- uint64_t begin, end;
- while(getNextBatch(begin, end)) {
-
- // Loop over buckets in this batch.
- for(uint64_t bucketId=begin; bucketId!=end; bucketId++) {
-
- // Access this bucket.
- const span<BucketEntry> bucket = buckets[bucketId];
- if(bucket.size() < max(size_t(2), minBucketSize)) {
- continue;
- }
- if(bucket.size() > maxBucketSize) {
- continue;
- }
-
- // Loop over pairs of bucket entries.
- for(const BucketEntry& feature0: bucket) {
- const OrientedReadId orientedReadId0 = feature0.orientedReadId;
- const ReadId readId0 = orientedReadId0.getReadId();
- const Strand strand0 = orientedReadId0.getStrand();
- const uint32_t ordinal0 = feature0.ordinal;
- const auto allKmerIds0 = kmerIds[orientedReadId0.getValue()];
- const auto featureKmerIds0 = allKmerIds0.begin() + ordinal0;
- const uint32_t markerCount0 = uint32_t(allKmerIds0.size());
-
- for(const BucketEntry& feature1: bucket) {
- const OrientedReadId orientedReadId1 = feature1.orientedReadId;
- const ReadId readId1 = orientedReadId1.getReadId();
-
- // Only consider the ones where readId0 < readId1.
- if(readId0 >= readId1) {
- continue;
- }
-
- const Strand strand1 = orientedReadId1.getStrand();
- const uint32_t ordinal1 = feature1.ordinal;
- const auto allKmerIds1 = kmerIds[orientedReadId1.getValue()];
- const auto featureKmerIds1 = allKmerIds1.begin() + ordinal1;
- const uint32_t markerCount1 = uint32_t(allKmerIds1.size());
-
- // If the k-mers are not the same, this is a collision. Discard.
- if(not std::equal(featureKmerIds0, featureKmerIds0+mLocal, featureKmerIds1)) {
- continue;
- }
-
- // We found a common feature. Store it.
- // If read0 is on strand 1, we have to reverse the ordinals.
- if(strand0 == 0) {
- commonFeatures.push_back(CommonFeature(
- readId0,
- readId1,
- strand0==strand1,
- ordinal0,
- ordinal1));
- } else {
- commonFeatures.push_back(CommonFeature(
- readId0,
- readId1,
- strand0==strand1,
- markerCount0-1-ordinal0,
- markerCount1-1-ordinal1));
- }
- }
- }
- }
- }
-}
-
-
-// Add up the number of common feature found by all threads.
-uint64_t LowHash1::countTotalThreadCommonFeatures() const
-{
- uint64_t n = 0;
- for(const auto& v: threadCommonFeatures) {
- n += v->size();
- }
- return n;
-}
-
-
-
-void LowHash1::gatherCommonFeatures()
-{
- commonFeatures.createNew(
- largeDataFileNamePrefix.empty() ? "" : (largeDataFileNamePrefix + "tmp-CommonFeatures"),
- largeDataPageSize);
- commonFeatures.beginPass1(kmerIds.size()/2);
- runThreads(&LowHash1::gatherCommonFeaturesPass1, threadCount);
- commonFeatures.beginPass2();
- runThreads(&LowHash1::gatherCommonFeaturesPass2, threadCount);
- commonFeatures.endPass2(false);
-}
-void LowHash1::gatherCommonFeaturesPass1(size_t threadId)
-{
- const MemoryMapped::Vector<CommonFeature>& v = *threadCommonFeatures[threadId];
- for(const CommonFeature& commonFeature: v) {
- commonFeatures.incrementCountMultithreaded(commonFeature.orientedReadPair.readIds[0]);
- }
-}
-void LowHash1::gatherCommonFeaturesPass2(size_t threadId)
-{
- const MemoryMapped::Vector<CommonFeature>& v = *threadCommonFeatures[threadId];
- for(const CommonFeature& commonFeature: v) {
- commonFeatures.storeMultithreaded(
- commonFeature.orientedReadPair.readIds[0],
- CommonFeatureInfo(commonFeature));
- }
-}
-
-
-
-// Process the common features.
-// For each readId0, we look at all the CommonFeatureInfo we have
-// and sort them by readId1, then by ordinals, and remove duplicates.
-// We then find groups of at least minFrequency common features involving the
-// same pair(orientedReadId0, orientedReadId1)
-// Each group generates an alignment candidate and the
-// corresponding common features.
-// Each thread stores the alignment candidates it finds in its own vector.
-void LowHash1::processCommonFeatures()
-{
- const uint64_t readCount = kmerIds.size() / 2;
- const uint64_t batchSize = 1000;
-
- // Prepare areas where each thread will store what it finds.
- threadCandidateTable.resize(readCount);
- threadAlignmentCandidates.resize(threadCount);
- threadCandidateHistogram.resize(threadCount);
-
- // Extract the candidates and features.
- setupLoadBalancing(readCount, batchSize);
- runThreads(&LowHash1::processCommonFeaturesThreadFunction, threadCount);
-
-
-
- // Gather the candidates and the features.
- for(ReadId readId0=0; readId0<readCount; readId0++) {
-
- // Figure out where the candidates are stored.
- const auto& info = threadCandidateTable[readId0];
- const uint64_t threadId = info[0];
- const uint64_t begin = info[1];
- const uint64_t end = info[2];
-
- // Loop over all these candidates.
- for(uint64_t i=begin; i!=end; ++i) {
- const OrientedReadPair& orientedReadPair =
- threadAlignmentCandidates[threadId]->candidates[i];
- SHASTA_ASSERT(orientedReadPair.readIds[0] == readId0);
- candidates.candidates.push_back(orientedReadPair);
- const auto features = threadAlignmentCandidates[threadId]->featureOrdinals[i];
- candidates.featureOrdinals.appendVector(features.begin(), features.end());
- }
- }
- SHASTA_ASSERT(candidates.candidates.size() == candidates.featureOrdinals.size());
- cout << timestamp << "Found " << candidates.candidates.size() <<
- " alignment candidates with a total " <<
- candidates.featureOrdinals.totalSize() <<
- " features." << endl;
-
-
-
- // Combine the histograms found by each thread.
- for(size_t threadId=0; threadId!=threadCount; threadId++) {
- const vector<uint64_t>& v = threadCandidateHistogram[threadId];
- for(uint64_t i=0; i<v.size(); i++){
- const uint64_t n = v[i];
- if(n > 0) {
- if(candidateHistogram.size() <= n){
- candidateHistogram.resize(n+1, 0);
- }
- candidateHistogram[i] += n;
- }
- }
- }
- ofstream csv("LowHashCandidateHistogram.csv");
- csv << "CommonFeatureCount,Frequency\n";
- for(uint64_t i=0; i<candidateHistogram.size(); i++) {
- const uint64_t n = candidateHistogram[i];
- if(n > 0) {
- csv << i << "," << n << "\n";
- }
- }
-
-
-
- // Clean up.
- threadCandidateTable.clear();
- for(size_t threadId=0; threadId<threadCount; threadId++) {
- threadAlignmentCandidates[threadId]->candidates.remove();
- threadAlignmentCandidates[threadId]->featureOrdinals.remove();
- }
- threadAlignmentCandidates.clear();
-}
-
-
-
-void LowHash1::processCommonFeaturesThreadFunction(size_t threadId)
-{
- // Access the vector where this thread will store
- // the alignment candidates it finds.
- threadAlignmentCandidates[threadId] = make_shared<AlignmentCandidates>();
- AlignmentCandidates& alignmentCandidates = *threadAlignmentCandidates[threadId];
- alignmentCandidates.candidates.createNew(
- largeDataFileNamePrefix.empty() ? "" :
- (largeDataFileNamePrefix + "tmp-ThreadAlignmentCandidates-" + to_string(threadId)),
- largeDataPageSize);
- alignmentCandidates.featureOrdinals.createNew(
- largeDataFileNamePrefix.empty() ? "" :
- (largeDataFileNamePrefix + "tmp-ThreadAlignmentCandidatesOrdinals-" + to_string(threadId)),
- largeDataPageSize);
- vector<uint64_t>& histogram = threadCandidateHistogram[threadId];
-
- // Loop over all batches assigned to this thread.
- uint64_t begin, end;
- while(getNextBatch(begin, end)) {
-
- // Loop over ReadId's in this batch.
- for(ReadId readId0=ReadId(begin); readId0!=ReadId(end); readId0++) {
- // std::lock_guard<std::mutex> lock(mutex); // ************************** TAKE OUT!
- // cout << "Working on readId0 " << readId0 << endl;
- const span<CommonFeatureInfo> features = commonFeatures[readId0];
- threadCandidateTable[readId0][0] = uint64_t(threadId);
- threadCandidateTable[readId0][1] = alignmentCandidates.candidates.size();;
-
- /*
- cout << features.size() << " features before deduplication:" << endl;
- for(auto it=features.begin(); it!=features.end(); ++it) {
- const CommonFeatureInfo& feature = *it;
- cout <<
- feature.readId1 << " " <<
- (feature.isSameStrand ? "same strand " : " opposite strands ") <<
- feature.ordinals[0] << " " <<
- feature.ordinals[1] << " " <<
- int32_t(feature.ordinals[1]) - int32_t(feature.ordinals[0]) << "\n";
- }
- */
-
- // Deduplicate.
- const auto uniqueBegin = features.begin();
- auto uniqueEnd = features.end();
- sort(uniqueBegin, uniqueEnd);
- uniqueEnd = unique(uniqueBegin, uniqueEnd);
-
- /*
- cout << uniqueEnd-uniqueBegin << " features after deduplication:" << endl;
- for(auto it=uniqueBegin; it!=uniqueEnd; ++it) {
- const CommonFeatureInfo& feature = *it;
- cout <<
- feature.readId1 << " " <<
- (feature.isSameStrand ? "same strand " : " opposite strands ") <<
- feature.ordinals[0] << " " <<
- feature.ordinals[1] << " " <<
- int32_t(feature.ordinals[1]) - int32_t(feature.ordinals[0]) << "\n";
- }
- */
-
- // Loop over streaks of features with the same readId1 and isSameStrand.
- for(auto it=uniqueBegin; it!=uniqueEnd;) {
- auto streakBegin = it;
- auto streakEnd = streakBegin;
- const ReadId readId1 = streakBegin->readId1;
- const bool isSameStrand = streakBegin->isSameStrand;
- while(streakEnd!=uniqueEnd and streakEnd->readId1==readId1 and streakEnd->isSameStrand==isSameStrand) {
- ++streakEnd;
- }
-
- // Increment the histogram.
- const int64_t streakLength = streakEnd - streakBegin;
- if(histogram.size() <= uint64_t(streakLength)) {
- histogram.resize(streakLength + 1, 0);
- }
- ++histogram[streakLength];
-
- // If too few, skip.
- if(streakLength < int64_t(minFrequency)) {
- it = streakEnd;
- continue;
- }
-
- /*
- cout << "Common features of reads " <<
- readId0 << " " <<
- readId1 << (isSameStrand ? " same strand" : " opposite strands") << ":\n";
- for(auto it=streakBegin; it!=streakEnd; ++it) {
- const CommonFeatureInfo& feature = *it;
- cout <<
- feature.ordinals[0] << " " <<
- feature.ordinals[1] << " " <<
- int32_t(feature.ordinals[1]) - int32_t(feature.ordinals[0]) << "\n";
- }
- cout << "Marker count " <<
- kmerIds[OrientedReadId(readId0, 0).getValue()].size() << " " <<
- kmerIds[OrientedReadId(readId1, 0).getValue()].size() << ":\n";
- */
-
- // This streak generates an alignment candidate
- // and the corresponding common features.
- alignmentCandidates.candidates.push_back(OrientedReadPair(readId0, readId1, isSameStrand));
- alignmentCandidates.featureOrdinals.appendVector();
- for(auto it=streakBegin; it!=streakEnd; ++it) {
- const CommonFeatureInfo& feature = *it;
- alignmentCandidates.featureOrdinals.append(feature.ordinals);
- }
-
- // Prepare for the next streak.
- it = streakEnd;
- }
- threadCandidateTable[readId0][2] = alignmentCandidates.candidates.size();;
- }
- }
-}
diff --git a/src/LowHash1.hpp b/src/LowHash1.hpp
deleted file mode 100644
index 88548d6..0000000
--- a/src/LowHash1.hpp
+++ /dev/null
@@ -1,224 +0,0 @@
-#ifndef SHASTA_LOW_HASH1_HPP
-#define SHASTA_LOW_HASH1_HPP
-
-// Shasta
-#include "Kmer.hpp"
-#include "MemoryMappedVectorOfVectors.hpp"
-#include "MultithreadedObject.hpp"
-#include "OrientedReadPair.hpp"
-#include "Reads.hpp"
-
-// Standard library.
-#include "fstream.hpp"
-#include "memory.hpp"
-
-namespace shasta {
- class AlignmentCandidates;
- class LowHash1;
- class CompressedMarker;
- class OrientedReadPair;
-
- extern template class MultithreadedObject<LowHash1>;
-}
-
-
-// This class uses the LowHash algorithm to find candidate pairs of aligned reads.
-// It uses as features sequences of m consecutive markers.
-// This is the new version that also stores alignmentCandidates.featureOrdinals
-class shasta::LowHash1 :
- public MultithreadedObject<LowHash1> {
-public:
-
- // The constructor does all the work.
- LowHash1(
- size_t m, // Number of consecutive markers that define a feature.
- double hashFraction,
- size_t minHashIterationCount, // Number of minHash iterations.
- size_t log2MinHashBucketCount, // Base 2 log of number of buckets for minHash.
- size_t minBucketSize, // The minimum size for a bucket to be used.
- size_t maxBucketSize, // The maximum size for a bucket to be used.
- size_t minFrequency, // Minimum number of minHash hits for a pair to be considered a candidate.
- size_t threadCount,
- const MemoryMapped::Vector<KmerInfo>& kmerTable,
- const Reads& reads,
- const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>&,
- AlignmentCandidates& candidates,
- const string& largeDataFileNamePrefix,
- size_t largeDataPageSize
- );
-
-private:
-
- // Store some of the arguments passed to the constructor.
- size_t m; // Number of consecutive markers that define a feature.
- double hashFraction;
- size_t minBucketSize; // The minimum size for a bucket to be used.
- size_t maxBucketSize; // The maximum size for a bucket to be used.
- size_t minFrequency; // Minimum number of minHash hits for a pair to be considered a candidate.
- size_t threadCount;
- const MemoryMapped::Vector<KmerInfo>& kmerTable;
- const Reads& reads;
- const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers;
- AlignmentCandidates& candidates;
- const string& largeDataFileNamePrefix;
- size_t largeDataPageSize;
-
- // Vectors containing only the k-mer ids of all markers
- // for all oriented reads.
- // Indexed by OrientedReadId.getValue().
- // This is used to speed up the computation of hash functions.
- MemoryMapped::VectorOfVectors<KmerId, uint64_t> kmerIds;
- void createKmerIds();
- void createKmerIds(size_t threadId);
-
- // The mask used to compute to compute the bucket
- // corresponding to a hash value.
- uint64_t mask;
-
- // The threshold for a hash value to be considered low.
- uint64_t hashThreshold;
-
- // The current MinHash iteration.
- // This is used to compute a different MurmurHash function
- // at each iteration.
- size_t iteration;
-
- // The low hashes of each oriented read and the ordinals at
- // which the corresponding feature occurs.
- // This is recomputed at each iteration.
- // Indexed by OrientedReadId::getValue().
- vector< vector< pair<uint64_t, uint32_t> > > lowHashes;
- void computeLowHashes(size_t threadId);
-
- // Each bucket entry describes a low hash feature.
- // It consists of an oriented read id and
- // the ordinal where the low hash feature appears.
- class BucketEntry {
- public:
- OrientedReadId orientedReadId;
- uint32_t ordinal;
- BucketEntry(
- OrientedReadId orientedReadId,
- uint32_t ordinal) :
- orientedReadId(orientedReadId),
- ordinal(ordinal) {}
- BucketEntry() {}
- };
- MemoryMapped::VectorOfVectors<BucketEntry, uint64_t> buckets;
-
-
- // Compute a histogram of the number of entries in each histogram.
- void computeBucketHistogram();
- void computeBucketHistogramThreadFunction(size_t threadId);
- vector< vector<uint64_t> > threadBucketHistogram;
- ofstream histogramCsv;
-
-
-
- // When two oriented reads appear in the same bucket, we
- // check if that happens by chance or because we found a
- // common feature between the two oriented reads.
- // In the latter case, we store a new CommonFeature
- // containing the OrientedReadIdPair and
- // the ordinals where the feature appears.
- // Note that the OrientedReadIdPair is interpreted
- // with readId0 on strand 0 and readId1 on the strand
- // implied by isSameStrand.
- // This means that if we encounter the common feature
- // with readId0 on strand 1 we have to reverse the
- // strands and adjust the ordinals.
- // Each thread stores into its own vector of common features.
- // We only store common features with readId0<readId1.
- class CommonFeature {
- public:
- OrientedReadPair orientedReadPair;
- array<uint32_t, 2> ordinals;
- CommonFeature() {}
- CommonFeature(
- ReadId readId0,
- ReadId readId1,
- bool isSameStrand,
- uint32_t ordinal0,
- uint32_t ordinal1
- ) :
- orientedReadPair(readId0, readId1, isSameStrand),
- ordinals({ordinal0, ordinal1})
- {}
- };
- vector< shared_ptr<MemoryMapped::Vector<CommonFeature> > > threadCommonFeatures;
- uint64_t countTotalThreadCommonFeatures() const;
-
-
-
- // The common features found by each thread are stored together,
- // segregated by the first ReadId, readId0.
- // This vector of vectors is indexed by readId0.
- // That is, commonFeatures[readId0]
- // is a vector contaiOrientedReadId is readId0.
- class CommonFeatureInfo {
- public:
- ReadId readId1;
- array<uint32_t, 2> ordinals;
- bool isSameStrand;
- CommonFeatureInfo() {}
- CommonFeatureInfo(const CommonFeature& commonFeature) :
- readId1(commonFeature.orientedReadPair.readIds[1]),
- ordinals(commonFeature.ordinals),
- isSameStrand(commonFeature.orientedReadPair.isSameStrand)
- {}
- bool operator<(const CommonFeatureInfo& that) const {
- return tie(readId1, isSameStrand, ordinals) < tie(that.readId1, that.isSameStrand, that.ordinals);
- }
- bool operator==(const CommonFeatureInfo& that) const {
- return tie(readId1, isSameStrand, ordinals) == tie(that.readId1, that.isSameStrand, that.ordinals);
- }
- };
- MemoryMapped::VectorOfVectors<CommonFeatureInfo, uint64_t> commonFeatures;
- void gatherCommonFeatures();
- void gatherCommonFeaturesPass1(size_t threadId);
- void gatherCommonFeaturesPass2(size_t threadId);
-
-
-
- // Process the common features.
- // For each readId0, we look at all the CommonFeatureInfo we have
- // and sort them by readId1, then by ordinals, and remove duplicates.
- // We then find groups of at least minFrequency common features involving the
- // same pair(orientedReadId0, orientedReadId1).
- // Each group generates an alignment candidate and the
- // corresponding common features.
- // Each thread stores the alignment candidates it finds in its own vector.
- void processCommonFeatures();
- void processCommonFeaturesThreadFunction(size_t threadId);
-
- // Alignment candidates found by each thread.
- vector< shared_ptr<AlignmentCandidates> > threadAlignmentCandidates;
-
- // A table used to gather threadAlignmentCandidates in order
- // of increasing readId0. Indexed by readId0, gives
- // (thread, begin, end) for the candidates for which the first read is readId0.
- vector< array<uint64_t, 3> > threadCandidateTable;
-
-
- // During processCommonFeatures, we also create a histogram that tells us
- // how many (readId0, readId1) pairs with exactly n common features were found.
- // Only the pairs with n>=minFrquency generate an alignment candidate.
- vector<uint64_t> candidateHistogram;
- vector< vector<uint64_t> > threadCandidateHistogram;
-
-
-
- // Thread functions.
-
- // Thread function to compute the low hashes for each oriented read
- // and count the number of entries in each bucket.
- void computeHashesThreadFunction(size_t threadId);
-
- // Thread function to fill the buckets.
- void fillBucketsThreadFunction(size_t threadId);
-
- // Thread function to scan the buckets to find common features.
- void scanBucketsThreadFunction(size_t threadId);
-};
-
-#endif
diff --git a/src/MappedMemoryOwner.hpp b/src/MappedMemoryOwner.hpp
new file mode 100644
index 0000000..7caee88
--- /dev/null
+++ b/src/MappedMemoryOwner.hpp
@@ -0,0 +1,45 @@
+#ifndef SHASTA_MAPPED_MEMORY_OWNER_HPP
+#define SHASTA_MAPPED_MEMORY_OWNER_HPP
+
+#include "cstdint.hpp"
+#include "string.hpp"
+
+namespace shasta {
+ class MappedMemoryOwner;
+}
+
+
+
+class shasta::MappedMemoryOwner {
+public:
+
+ string largeDataFileNamePrefix;
+ uint64_t largeDataPageSize;
+
+ // Function to construct names for binary objects.
+ // The output can be passed to createNew or accessExisting
+ // member functions of MemoryMapped obkects.
+ string largeDataName(const string& name) const
+ {
+ if(largeDataFileNamePrefix.empty()) {
+ return ""; // Anonymous;
+ } else {
+ return largeDataFileNamePrefix + name;
+ }
+ }
+
+ MappedMemoryOwner() {}
+ MappedMemoryOwner(const MappedMemoryOwner&) = default;
+
+ template<class T> void createNew(T& t, const string& name)
+ {
+ t.createNew(largeDataName(name), largeDataPageSize);
+ }
+ template<class T> void accessExistingReadOnly(T& t, const string& name)
+ {
+ t.accessExistingReadOnly(largeDataName(name));
+ }
+};
+
+
+#endif
diff --git a/src/Marker.hpp b/src/Marker.hpp
index 3a18dc6..2b9fd5c 100644
--- a/src/Marker.hpp
+++ b/src/Marker.hpp
@@ -10,12 +10,6 @@ and never changed, and selected in such a way that,
if (and only if) a k-mer is a marker, its reverse complement
is also a marker.
-The k-mer table is a vector of 4^k KmerInfo object,
-indexed by k-mer id as computed using Kmer::id(k).
-Because of the way markers are selected, the following is
-true for all permitted values of i, 0 <= i < 4^k:
-kmerTable[i].isMarker == kmerTable[kmerTable[i].reverseComplementKmerId].isMarker
-
*******************************************************************************/
#include "Kmer.hpp"
@@ -38,35 +32,14 @@ namespace shasta {
-// Markers in shared memory are stored using class CompressedMarker
-// which requires only 5 bytes per marker.
-
-// For a run with 120 Gb of coverage and 10% of k-mers
-// used as markers, storing all the 24 G markers requires
-// 120 GB (we store markers for each read on both strands).
-// This compares with 30 GB to store the reads
-// (we store reads on one strand only).
-
-// This layout results in unaligned memory accesses.
-// This is not a problem as modern processors (beginning with Nehalem)
-// have a much lower performance penalty for unaligned memory access
-// than older processors did:
-// http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.455.4198&rep=rep1&type=pdf
-
+// Markers in shared memory are stored using class CompressedMarker.
class shasta::CompressedMarker {
public:
- // The id of the k-mer for this marker.
- KmerId kmerId __attribute__ ((packed));
-
// The position of this marker in the oriented read.
// This limits the length of a read to 2^24=16Mib bases.
Uint24 position;
-
};
-static_assert(sizeof(shasta::CompressedMarker) ==
- sizeof(shasta::KmerId) + sizeof(shasta::Uint24),
- "Unexpected size of class CompressedMarker.");
@@ -81,11 +54,8 @@ public:
// The position of this marker in the oriented read.
uint32_t position;
- // Constructor from a CompressedMarker.
- Marker(const CompressedMarker& compressedMarker) :
- kmerId(compressedMarker.kmerId),
- position(compressedMarker.position)
- {}
+ Marker(KmerId kmerId, uint32_t position) :
+ kmerId(kmerId), position(position) {}
// Default constructor.
Marker() {}
@@ -100,11 +70,8 @@ class shasta::MarkerWithOrdinal : public Marker {
public:
uint32_t ordinal;
- // Constructor from a marker and an ordinal.
- MarkerWithOrdinal(const Marker& marker, uint32_t ordinal) :
- Marker(marker),
- ordinal(ordinal)
- {}
+ MarkerWithOrdinal(KmerId kmerId, uint32_t position, uint32_t ordinal) :
+ Marker(kmerId, position), ordinal(ordinal) {}
// Default constructor.
MarkerWithOrdinal() {}
diff --git a/src/MarkerFinder.cpp b/src/MarkerFinder.cpp
index f919e07..d46bad0 100644
--- a/src/MarkerFinder.cpp
+++ b/src/MarkerFinder.cpp
@@ -1,6 +1,7 @@
// shasta.
#include "MarkerFinder.hpp"
#include "LongBaseSequence.hpp"
+#include "KmerChecker.hpp"
#include "performanceLog.hpp"
#include "ReadId.hpp"
#include "timestamp.hpp"
@@ -15,13 +16,13 @@ template class MultithreadedObject<MarkerFinder>;
MarkerFinder::MarkerFinder(
size_t k,
- const MemoryMapped::Vector<KmerInfo>& kmerTable,
+ const KmerChecker& kmerChecker,
const Reads& reads,
MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
size_t threadCountArgument) :
MultithreadedObject(*this),
k(k),
- kmerTable(kmerTable),
+ kmerChecker(kmerChecker),
reads(reads),
markers(markers),
threadCount(threadCountArgument)
@@ -83,19 +84,17 @@ void MarkerFinder::threadFunction(size_t threadId)
}
for(uint32_t position=0; /*The check is done later */; position++) {
const KmerId kmerId = KmerId(kmer.id(k));
- if(kmerTable[kmerId].isMarker) {
+ if(kmerChecker.isMarker(kmerId)) {
// This k-mer is a marker.
if(pass == 1) {
++markerCount;
} else {
// Strand 0.
- markerPointerStrand0->kmerId = kmerId;
markerPointerStrand0->position = position;
++markerPointerStrand0;
// Strand 1.
- markerPointerStrand1->kmerId = kmerTable[kmerId].reverseComplementedKmerId;
markerPointerStrand1->position = uint32_t(read.baseCount - k - position);
--markerPointerStrand1;
diff --git a/src/MarkerFinder.hpp b/src/MarkerFinder.hpp
index fedda4c..dcdbff3 100644
--- a/src/MarkerFinder.hpp
+++ b/src/MarkerFinder.hpp
@@ -8,6 +8,7 @@
namespace shasta {
class MarkerFinder;
class LongBaseSequences;
+ class KmerChecker;
namespace MemoryMapped {
template<class T> class Vector;
@@ -26,7 +27,7 @@ public:
// The constructor does all the work.
MarkerFinder(
size_t k,
- const MemoryMapped::Vector<KmerInfo>& kmerTable,
+ const KmerChecker&,
const Reads& reads,
MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
size_t threadCount);
@@ -35,7 +36,7 @@ private:
// The arguments passed to the constructor.
size_t k;
- const MemoryMapped::Vector<KmerInfo>& kmerTable;
+ const KmerChecker& kmerChecker;
const Reads& reads;
MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers;
size_t threadCount;
diff --git a/src/MarkerGraph.cpp b/src/MarkerGraph.cpp
index 8d4577b..28c1b12 100644
--- a/src/MarkerGraph.cpp
+++ b/src/MarkerGraph.cpp
@@ -1,6 +1,10 @@
// Shasta.
#include "MarkerGraph.hpp"
#include "Coverage.hpp"
+#include "deduplicate.hpp"
+#include "findMarkerId.hpp"
+#include "invalid.hpp"
+#include "markerAccessFunctions.hpp"
using namespace shasta;
// Standard library.
@@ -113,6 +117,20 @@ uint64_t MarkerGraph::outDegree(VertexId vertexId) const
+void MarkerGraph::Edge::writeFlags(ostream& s) const
+{
+ s << "wasRemovedByTransitiveReduction " << int(wasRemovedByTransitiveReduction) << "\n";
+ s << "wasPruned " << int(wasPruned) << "\n";
+ s << "isSuperBubbleEdge " << int(isSuperBubbleEdge) << "\n";
+ s << "isLowCoverageCrossEdge " << int(isLowCoverageCrossEdge) << "\n";
+ s << "wasAssembled " << int(wasAssembled) << "\n";
+ s << "isSecondary " << int(isSecondary) << "\n";
+ s << "wasRemovedWhileSplittingSecondaryEdges " << int(wasRemovedWhileSplittingSecondaryEdges) << "\n";
+ s << flush;
+}
+
+
+
MarkerGraph::EdgeId MarkerGraph::getFirstNonRemovedOutEdge(
MarkerGraph::VertexId vertexId) const
{
@@ -596,3 +614,499 @@ void MarkerGraph::createVerticesFromVertexTableThreadFunction4(size_t threadId)
}
+
+// Find the common KmerId for all the markers of a marker graph vertex.
+KmerId MarkerGraph::getVertexKmerId(
+ MarkerGraphVertexId vertexId,
+ uint64_t k,
+ const Reads& reads,
+ const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers
+ ) const
+{
+ // Get it from the first marker on this vertex.
+ const MarkerId markerId = getVertexMarkerIds(vertexId)[0];
+
+ // Find the OrientedReadId.
+ // This is slow as it requires a binary search in the markers toc.
+ OrientedReadId orientedReadId;
+ uint32_t ordinal;
+ tie(orientedReadId, ordinal) = findMarkerId(markerId, markers);
+
+ return getOrientedReadMarkerKmerId(
+ orientedReadId,
+ ordinal,
+ k,
+ reads,
+ markers
+ );
+}
+
+
+
+// Find the edge that contains a given MarkerInterval.
+MarkerGraphEdgeId MarkerGraph::locateMarkerInterval(
+ const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
+ const MarkerInterval& markerInterval) const
+{
+ const OrientedReadId orientedReadId = markerInterval.orientedReadId;
+ const uint64_t firstOrientedReadMarkerId =
+ markers.begin(orientedReadId.getValue()) - markers.begin();
+
+ // Now locate this marker interval.
+ const uint64_t markerId0 = firstOrientedReadMarkerId + markerInterval.ordinals[0];
+ const uint64_t markerId1 = firstOrientedReadMarkerId + markerInterval.ordinals[1];
+ const MarkerGraphVertexId vertexId0 = vertexTable[markerId0];
+ const MarkerGraphVertexId vertexId1 = vertexTable[markerId1];
+
+ for(const auto edgeId: edgesBySource[vertexId0]) {
+ if(edges[edgeId].target != vertexId1) {
+ continue;
+ }
+ const auto markerIntervals = edgeMarkerIntervals[edgeId];
+ if(find(markerIntervals.begin(), markerIntervals.end(), markerInterval) !=
+ markerIntervals.end()) {
+ return edgeId;
+ }
+ }
+
+ return invalid<MarkerGraphEdgeId>;
+}
+
+
+// Apply an ordinal offset in the specified direction to a given MarkerInterval
+// and find the edge that contains the offset MarkerInterval.
+// This assumes that we have the complete marker graph.
+MarkerGraphEdgeId MarkerGraph::locateMarkerIntervalWithOffset(
+ const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
+ MarkerInterval markerInterval,
+ uint32_t ordinalOffset,
+ uint64_t direction // 0=forward, 1=backward.
+ ) const
+{
+ const OrientedReadId orientedReadId = markerInterval.orientedReadId;
+ const uint64_t firstOrientedReadMarkerId =
+ markers.begin(orientedReadId.getValue()) - markers.begin();
+
+ // Construct the offset MarkerInterval.
+ // If we end up outside the oriented read, return invalid<MarkerGraphEdgeId>.
+ if(direction == 0) {
+ markerInterval.ordinals[0] += ordinalOffset;
+ markerInterval.ordinals[1] += ordinalOffset;
+ if(markerInterval.ordinals[1] >= markers.size(orientedReadId.getValue())) {
+ return invalid<MarkerGraphEdgeId>;
+ }
+ } else {
+ if(ordinalOffset > markerInterval.ordinals[0]) {
+ return invalid<MarkerGraphEdgeId>;
+ }
+ markerInterval.ordinals[0] -= ordinalOffset;
+ markerInterval.ordinals[1] -= ordinalOffset;
+ }
+ SHASTA_ASSERT(markerInterval.ordinals[1] == markerInterval.ordinals[0] + 1);
+
+
+ // Now locate this marker interval.
+ const uint64_t markerId0 = firstOrientedReadMarkerId + markerInterval.ordinals[0];
+ const uint64_t markerId1 = firstOrientedReadMarkerId + markerInterval.ordinals[1];
+ const MarkerGraphVertexId vertexId0 = vertexTable[markerId0];
+ const MarkerGraphVertexId vertexId1 = vertexTable[markerId1];
+
+ for(const auto edgeId: edgesBySource[vertexId0]) {
+ if(edges[edgeId].target != vertexId1) {
+ continue;
+ }
+ const auto markerIntervals = edgeMarkerIntervals[edgeId];
+ if(find(markerIntervals.begin(), markerIntervals.end(), markerInterval) !=
+ markerIntervals.end()) {
+ return edgeId;
+ }
+ }
+
+ // If this happens, we don't have a complete marker graph.
+ SHASTA_ASSERT(0);
+}
+
+
+
+// Find out if an edge has duplicate oriented reads
+// in its MarkerIntervals.
+bool MarkerGraph::edgeHasDuplicateOrientedReadIds(EdgeId edgeId) const
+{
+ const auto markerIntervals = edgeMarkerIntervals[edgeId];
+ if(markerIntervals.size() < 2) {
+ return false;
+ }
+ for(uint64_t i=1; i<markerIntervals.size(); i++) {
+ if(markerIntervals[i-1].orientedReadId == markerIntervals[i].orientedReadId) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+
+
+// Find out if a vertex has more than one marker on the same oriented read.
+bool MarkerGraph::vertexHasDuplicateOrientedReadIds(
+ VertexId vertexId,
+ const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers) const
+{
+ const span<const MarkerId> vertexMarkerIds = vertices()[vertexId];
+ if(vertexMarkerIds.size() < 2) {
+ return false;
+ }
+
+ // The markers are sorted, so we only have to check each marker
+ // against the previous one.
+ // This could be done faster but is not performance critical.
+ for(uint64_t i=1; i<vertexMarkerIds.size(); i++) {
+ const MarkerId markerId0 = vertexMarkerIds[i-1];
+ const MarkerId markerId1 = vertexMarkerIds[i];
+ OrientedReadId orientedReadId0;
+ OrientedReadId orientedReadId1;
+ tie(orientedReadId0, ignore) = findMarkerId(markerId0, markers);
+ tie(orientedReadId1, ignore) = findMarkerId(markerId1, markers);
+ if(orientedReadId0 == orientedReadId1) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+
+
+// Flag primary edges (only used for Mode 3 assembly).
+void MarkerGraph::flagPrimaryEdges(
+ uint64_t minPrimaryCoverage,
+ uint64_t maxPrimaryCoverage,
+ const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
+ uint64_t threadCount)
+{
+ SHASTA_ASSERT(disjointSetsHistogram.isOpen);
+
+ // If minPrimaryCoverage and maxPrimaryCoverage are both 0,
+ // use the disjoint sets histogram and simple heuristics to choose
+ // appropriate values.
+ if((minPrimaryCoverage == 0) and (maxPrimaryCoverage == 0)) {
+
+ // Set minPrimaryCoverage to the first value where the
+ // disjointSetsHistogram starts increasing.
+ bool done = false;
+ uint64_t frequencyAtMinPrimaryCoverage = 0;
+ for(uint64_t i=1; i<disjointSetsHistogram.size(); i++) {
+ const uint64_t coverage = disjointSetsHistogram[i].first;
+ const uint64_t frequency = disjointSetsHistogram[i].second;
+ const uint64_t previousCoverage = disjointSetsHistogram[i-1].first;
+ const uint64_t previousFrequency = disjointSetsHistogram[i-1].second;
+ if(
+ (coverage != previousCoverage+1) // Frequency at coverage-1 is zero, so the histogram went up.
+ or
+ frequency > previousFrequency // The histogram went up.
+ ) {
+ minPrimaryCoverage = coverage;
+ frequencyAtMinPrimaryCoverage = frequency;
+ done = true;
+ break;
+ }
+ }
+ SHASTA_ASSERT(done);
+
+ // Set maxPrimaryCoverage to the last coverage with frequency
+ // at least equal to frequencyAtMinPrimaryCoverage.
+ done = false;
+ for(uint64_t i=disjointSetsHistogram.size()-1; i>0; i--) {
+ const uint64_t coverage = disjointSetsHistogram[i].first;
+ const uint64_t frequency = disjointSetsHistogram[i].second;
+ if(frequency >= frequencyAtMinPrimaryCoverage) {
+ maxPrimaryCoverage = coverage;
+ done= true;
+ break;
+ }
+ }
+ SHASTA_ASSERT(done);
+
+ cout << "Automatically set: minPrimaryCoverage = " << minPrimaryCoverage <<
+ ", maxPrimaryCoverage = " << maxPrimaryCoverage << endl;
+ }
+
+
+
+ // Store the arguments so the threads can see them.
+ flagPrimaryEdgesData.minPrimaryCoverage = minPrimaryCoverage;
+ flagPrimaryEdgesData.maxPrimaryCoverage = maxPrimaryCoverage;
+ flagPrimaryEdgesData.markersPointer = &markers;
+
+ // Adjust the numbers of threads, if necessary.
+ if(threadCount == 0) {
+ threadCount = std::thread::hardware_concurrency();
+ }
+
+ // Clear the flags on all edges.
+ for(Edge& edge: edges) {
+ edge.isPrimary = 0;
+ }
+
+ // Multithreaded code to flag primary edges.
+ const uint64_t batchCount = 10000;
+ setupLoadBalancing(edges.size(), batchCount);
+ runThreads(&MarkerGraph::flagPrimaryEdgesThreadFunction, threadCount);
+
+ uint64_t primaryEdgeCount = 0;
+ for(Edge& edge: edges) {
+ if(edge.isPrimary == 1) {
+ ++primaryEdgeCount;
+ }
+ }
+ cout << "Found " << primaryEdgeCount <<
+ " primary marker graph edges out of " << edges.size() << " total." << endl;
+}
+
+
+
+void MarkerGraph::flagPrimaryEdgesThreadFunction(uint64_t threadId)
+{
+ const uint64_t minPrimaryCoverage = flagPrimaryEdgesData.minPrimaryCoverage;
+ const uint64_t maxPrimaryCoverage = flagPrimaryEdgesData.maxPrimaryCoverage;
+ const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers =
+ *flagPrimaryEdgesData.markersPointer;
+
+ uint64_t begin, end;
+ while(getNextBatch(begin, end)) {
+ for(EdgeId edgeId=begin; edgeId!=end; ++edgeId) {
+ if(isPrimaryEdge(edgeId, minPrimaryCoverage, maxPrimaryCoverage, markers)) {
+ edges[edgeId].isPrimary = 1;
+ }
+ }
+ }
+}
+
+
+
+// Find out if a marker graph edge is a primary edge.
+// Only used for Mode 3 assembly.
+bool MarkerGraph::isPrimaryEdge(
+ MarkerGraphEdgeId edgeId,
+ uint64_t minPrimaryCoverage,
+ uint64_t maxPrimaryCoverage,
+ const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers) const
+{
+ // Check coverage.
+ const uint64_t coverage = edgeCoverage(edgeId);
+ if(coverage < minPrimaryCoverage) {
+ return false;
+ }
+ if(coverage > maxPrimaryCoverage) {
+ return false;
+ }
+
+ // Check for duplicate oriented reads on the edge.
+ if(edgeHasDuplicateOrientedReadIds(edgeId)) {
+ return false;
+ }
+
+ // Check for duplicate oriented reads on its vertices.
+ const MarkerGraph::Edge& edge = edges[edgeId];
+ if(
+ vertexHasDuplicateOrientedReadIds(edge.source, markers) or
+ vertexHasDuplicateOrientedReadIds(edge.target, markers)) {
+ return false;
+ }
+
+ // If all above checks passed, this is a primary edge.
+ return true;
+}
+
+
+
+#if 0
+void MarkerGraph::createPrimaryJourneys(
+ uint64_t orientedReadCount,
+ uint64_t threadCount)
+{
+ // Adjust the numbers of threads, if necessary.
+ if(threadCount == 0) {
+ threadCount = std::thread::hardware_concurrency();
+ }
+
+ primaryJourneys.clear();
+
+ const uint64_t batchCount = 100;
+
+ primaryJourneys.beginPass1(orientedReadCount);
+ setupLoadBalancing(edges.size(), batchCount);
+ runThreads(&MarkerGraph::createPrimaryJourneysThreadFunction1, threadCount);
+ primaryJourneys.beginPass2();
+ setupLoadBalancing(edges.size(), batchCount);
+ runThreads(&MarkerGraph::createPrimaryJourneysThreadFunction2, threadCount);
+ primaryJourneys.endPass2(false, true);
+ setupLoadBalancing(orientedReadCount, 1);
+ runThreads(&MarkerGraph::createPrimaryJourneysThreadFunction3, threadCount);
+
+ cout << "Found " << primaryJourneys.totalSize() <<
+ " marker graph primary journey entries for " << orientedReadCount <<
+ " oriented reads." << endl;
+ cout << "Average number of marker graph primary journey entries per oriented read is " <<
+ double(primaryJourneys.totalSize()) / double(orientedReadCount) << endl;
+
+ writePrimaryJourneys();
+}
+
+
+
+void MarkerGraph::writePrimaryJourneys()
+{
+ const uint64_t orientedReadCount = primaryJourneys.size();
+
+ ofstream csv("MarkerGraphPrimaryJourneys.csv");
+
+ for(ReadId readId=0; readId<orientedReadCount/2; readId++) {
+ for(Strand strand=0; strand<2; strand++) {
+ const OrientedReadId orientedReadId(readId, strand);
+ csv << orientedReadId << ",";
+ for(const auto& primaryJourneyEntry: primaryJourneys[orientedReadId.getValue()]) {
+ csv << primaryJourneyEntry.edgeId << ",";
+ }
+ csv << "\n";
+ }
+ }
+}
+
+
+
+void MarkerGraph::createPrimaryJourneysThreadFunction1(uint64_t threadId)
+{
+ createPrimaryJourneysThreadFunction12(1);
+}
+
+
+
+void MarkerGraph::createPrimaryJourneysThreadFunction2(uint64_t threadId)
+{
+ createPrimaryJourneysThreadFunction12(2);
+}
+
+
+
+void MarkerGraph::createPrimaryJourneysThreadFunction12(uint64_t pass)
+{
+ // Loop over batches assigned to this thread.
+ uint64_t begin, end;
+ while(getNextBatch(begin, end)) {
+
+ // Loop over marker graph edges assigned to this batch.
+ for(EdgeId edgeId=begin; edgeId!=end; ++edgeId) {
+ const Edge& edge = edges[edgeId];
+
+ // If this is not a primary edge, skip it.
+ if(edge.isPrimary == 0) {
+ continue;
+ }
+
+ PrimaryJourneyEntry primaryJourneyEntry;
+ primaryJourneyEntry.edgeId = edgeId;
+
+ // Loop over the MarkerIntervals of this edge.
+ span<MarkerInterval> markerIntervals = edgeMarkerIntervals[edgeId];
+ for(const MarkerInterval& markerInterval: markerIntervals) {
+ const uint64_t orientedReadIdValue = markerInterval.orientedReadId.getValue();
+
+ if(pass == 1) {
+ primaryJourneys.incrementCountMultithreaded(orientedReadIdValue);
+ } else {
+ primaryJourneyEntry.ordinals = markerInterval.ordinals;
+ primaryJourneys.storeMultithreaded(orientedReadIdValue, primaryJourneyEntry);
+ }
+
+ }
+
+ }
+ }
+}
+
+
+
+void MarkerGraph::createPrimaryJourneysThreadFunction3(uint64_t threadId)
+{
+ // Loop over batches assigned to this thread.
+ uint64_t begin, end;
+ while(getNextBatch(begin, end)) {
+
+ // Loop over oriented reads assigned to this batch.
+ for(uint64_t orientedReadIdValue=begin; orientedReadIdValue!=end; orientedReadIdValue++) {
+ auto journey = primaryJourneys[orientedReadIdValue];
+ sort(journey.begin(), journey.end());
+ }
+ }
+}
+
+
+
+// Starting from a primary marker graph edge, follow the primary journeys
+// of all oriented reads on the edge, moving forward.
+// Find the set of MarkerGraphEdgeIds that were encountered in this way,
+// and for each the number of times it was encountered.
+void MarkerGraph::followPrimaryJourneysForward(
+ MarkerGraphEdgeId edgeId0,
+ vector<MarkerGraphEdgeId>& edgeIds,
+ vector<uint64_t>& count) const
+{
+ edgeIds.clear();
+ count.clear();
+
+ // Loop over the oriented reads in edgeId0.
+ for(const MarkerInterval& markerInterval: edgeMarkerIntervals[edgeId0]) {
+ const OrientedReadId orientedReadId = markerInterval.orientedReadId;
+ const auto primaryJourney = primaryJourneys[orientedReadId.getValue()];
+
+ // Loop over the primary journey backward, stopping when we encounter edgeId0.
+ for(uint64_t j=primaryJourney.size(); /* Check later */; --j) {
+ const auto& primaryJourneyEntry = primaryJourney[j];
+ const MarkerGraphEdgeId edgeId1 = primaryJourneyEntry.edgeId;
+ if(edgeId1 == edgeId0) {
+ break;
+ }
+ edgeIds.push_back(edgeId1);
+ if(j == 0) {
+ break;
+ }
+ }
+ }
+
+ deduplicateAndCount(edgeIds, count);
+ SHASTA_ASSERT(edgeIds.size() == count.size());
+
+}
+
+
+
+// Same, but moving backward.
+void MarkerGraph::followPrimaryJourneysBackward(
+ MarkerGraphEdgeId edgeId0,
+ vector<MarkerGraphEdgeId>& edgeIds,
+ vector<uint64_t>& count) const
+{
+ edgeIds.clear();
+ count.clear();
+
+ // Loop over the oriented reads in edgeId0.
+ for(const MarkerInterval& markerInterval: edgeMarkerIntervals[edgeId0]) {
+ const OrientedReadId orientedReadId = markerInterval.orientedReadId;
+ const auto primaryJourney = primaryJourneys[orientedReadId.getValue()];
+
+ // Loop over the primary journey, stopping when we encounter edgeId0.
+ for(const auto& primaryJourneyEntry: primaryJourney) {
+ const MarkerGraphEdgeId edgeId1 = primaryJourneyEntry.edgeId;
+ if(edgeId1 == edgeId0) {
+ break;
+ }
+ edgeIds.push_back(edgeId1);
+ }
+ }
+
+ deduplicateAndCount(edgeIds, count);
+ SHASTA_ASSERT(edgeIds.size() == count.size());
+
+}
+#endif
diff --git a/src/MarkerGraph.hpp b/src/MarkerGraph.hpp
index d7e7856..c43dec1 100644
--- a/src/MarkerGraph.hpp
+++ b/src/MarkerGraph.hpp
@@ -13,8 +13,10 @@
namespace shasta {
class Base;
- class MarkerGraph;
class CompressedCoverageData;
+ class CompressedMarker;
+ class MarkerGraph;
+ class Reads;
extern template class MultithreadedObject<MarkerGraph>;
}
@@ -80,6 +82,11 @@ public:
return vertices()[vertexId];
}
+ // Find out if a vertex has more than one marker on the same oriented read.
+ bool vertexHasDuplicateOrientedReadIds(
+ VertexId,
+ const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers) const;
+
void remove();
// The global marker graph vertex corresponding to each marker.
@@ -147,7 +154,11 @@ private:
CreateVerticesFromVertexTableData createVerticesFromVertexTableData;
public:
-
+ // The disjoint sets histogram in a MemoryMapped::Vector.
+ // This is used when flagging primary marker graph edges for Mode 3 assembly.
+    // This stores pairs (coverage, frequency).
+ // Only pairs where the frequency is not zero are stored.
+ MemoryMapped::Vector< pair<uint64_t, uint64_t> > disjointSetsHistogram;
// Remove marker graph vertices and update vertices and vertexTable.
// After this is called, the only
@@ -170,6 +181,15 @@ private:
public:
+ // Find the common KmerId for all the markers of a marker graph vertex.
+ KmerId getVertexKmerId(
+ MarkerGraphVertexId vertexId,
+ uint64_t k,
+ const Reads&,
+ const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers
+ ) const;
+
+
// The reverse complement of each vertex.
// Indexed by VertexId.
@@ -180,7 +200,6 @@ public:
public:
Uint40 source; // The source vertex (index into globalMarkerGraphVertices).
Uint40 target; // The target vertex (index into globalMarkerGraphVertices).
- uint8_t coverage; // (255 indicates 255 or more).
// Flags used to mark the edge as removed from the marker graph.
bool wasRemoved() const
@@ -220,8 +239,8 @@ public:
// Assembly mode 2 only.
uint8_t wasRemovedWhileSplittingSecondaryEdges : 1;
- // Unused.
- uint8_t flag6 : 1;
+ // This is set for primary edges (Mode 3 assembly only).
+ uint8_t isPrimary : 1;
void clearFlags()
{
@@ -232,15 +251,16 @@ public:
wasAssembled = 0;
isSecondary = 0;
wasRemovedWhileSplittingSecondaryEdges = 0;
- flag6 = 0;
+ isPrimary = 0;
}
Edge() :
source(MarkerGraph::invalidCompressedVertexId),
- target(MarkerGraph::invalidCompressedVertexId),
- coverage(0)
+ target(MarkerGraph::invalidCompressedVertexId)
{
clearFlags();
}
+
+ void writeFlags(ostream&) const;
};
MemoryMapped::Vector<Edge> edges;
const Edge* findEdge(Uint40 source, Uint40 target) const;
@@ -264,6 +284,25 @@ public:
EdgeId getFirstNonRemovedOutEdge(VertexId) const;
EdgeId getFirstNonRemovedInEdge(VertexId) const;
+ // Find the edge that contains a given MarkerInterval.
+ EdgeId locateMarkerInterval(
+ const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
+ const MarkerInterval&) const;
+
+ // Apply an ordinal offset in the specified direction to a given MarkerInterval
+ // and find the edge that contains the offset MarkerInterval.
+ // This assumes that we have the complete marker graph.
+ EdgeId locateMarkerIntervalWithOffset(
+ const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
+ MarkerInterval,
+ uint32_t ordinalOffset,
+ uint64_t direction // 0=forward, 1=backward.
+ ) const;
+
+ // Find out if an edge has duplicate oriented reads
+ // in its MarkerIntervals.
+ bool edgeHasDuplicateOrientedReadIds(EdgeId) const;
+
// The reverse complement of each edge.
// Indexed by EdgeId.
MemoryMapped::Vector<EdgeId> reverseComplementEdge;
@@ -327,6 +366,90 @@ public:
// ordered by position.
MemoryMapped::VectorOfVectors<pair<uint32_t, CompressedCoverageData>, uint64_t>
edgeCoverageData;
+
+
+
+ // Edge sequence for each edge, for Mode 3 assembly.
+    // There are several differences compared to the consensus sequences stored above,
+    // which are not used in Mode 3 assembly:
+    // - Mode 3 assembly assumes we are not using RLE, so we don't need to store repeat counts.
+    // - Mode 3 assembly uses createMarkerGraphedgesStrict, which guarantees that
+    //   all marker intervals on a marker graph edge have exactly the same sequence.
+    //   This dramatically simplifies edge sequence assembly because we can just
+    //   obtain the sequence from the first marker interval, and multiple sequence
+    //   alignment is not needed.
+    // - For Mode 3 assembly we assume that marker length k is even, and
+ // the stored edge sequence includes the last k/2 bases from the marker
+ // of the source vertex and the first k/2 bases from the marker of
+ // the target vertex. As a result, every edge has at least one base of sequence,
+ // even when adjacent markers overlap. And the sequence of a path can
+ // be obtained by just concatenating the edge sequences.
+ MemoryMapped::VectorOfVectors<Base, uint64_t> edgeSequence;
+
+ // Flag primary edges (only used for Mode 3 assembly).
+ void flagPrimaryEdges(
+ uint64_t minPrimaryCoverage,
+ uint64_t maxPrimaryCoverage,
+ const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
+ uint64_t threadCount);
+private:
+ void flagPrimaryEdgesThreadFunction(uint64_t threadId);
+ bool isPrimaryEdge(
+ EdgeId,
+ uint64_t minPrimaryCoverage,
+ uint64_t maxPrimaryCoverage,
+ const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers) const;
+ class FlagPrimaryEdgesData {
+ public:
+ uint64_t minPrimaryCoverage;
+ uint64_t maxPrimaryCoverage;
+ const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>* markersPointer;
+ };
+ FlagPrimaryEdgesData flagPrimaryEdgesData;
+
+
+#if 0
+ // PRIMARY JOURNEYS ARE NOW COMPUTED LOCALLY BY CLASS Mode3Assembler.
+ // The primary journey of an oriented read is the sequence of primary
+ // marker graph edges encountered by the oriented read.
+ // Indexed by OrientedReadId::getValue().
+ // Only used for mode 3 assembly.
+public:
+ class PrimaryJourneyEntry {
+ public:
+ array<uint32_t, 2> ordinals;
+ EdgeId edgeId;
+ bool operator<(const PrimaryJourneyEntry& that) const {
+ return ordinals[0] < that.ordinals[0];
+ }
+ };
+ MemoryMapped::VectorOfVectors<PrimaryJourneyEntry, uint64_t> primaryJourneys;
+ void createPrimaryJourneys(uint64_t orientedReadCount, uint64_t threadCount);
+ void writePrimaryJourneys();
+private:
+ void createPrimaryJourneysThreadFunction1(uint64_t threadId);
+ void createPrimaryJourneysThreadFunction2(uint64_t threadId);
+ void createPrimaryJourneysThreadFunction12(uint64_t pass);
+ void createPrimaryJourneysThreadFunction3(uint64_t threadId);
+
+public:
+
+ // Starting from a primary marker graph edge, follow the primary journeys
+ // of all oriented reads on the edge, moving forward.
+ // Find the set of MarkerGraphEdgeIds that were encountered in this way,
+ // and for each the number of times it was encountered.
+ void followPrimaryJourneysForward(
+ MarkerGraphEdgeId,
+ vector<MarkerGraphEdgeId>&,
+ vector<uint64_t>& count
+ ) const;
+ // Same, but moving backward.
+ void followPrimaryJourneysBackward(
+ MarkerGraphEdgeId,
+ vector<MarkerGraphEdgeId>&,
+ vector<uint64_t>& count
+ ) const;
+#endif
};
#endif
diff --git a/src/MarkerGraphEdgePairInfo.hpp b/src/MarkerGraphEdgePairInfo.hpp
new file mode 100644
index 0000000..c067c14
--- /dev/null
+++ b/src/MarkerGraphEdgePairInfo.hpp
@@ -0,0 +1,91 @@
+#ifndef SHASTA_MARKER_GRAPH_EDGE_PAIR_INFO_HPP
+#define SHASTA_MARKER_GRAPH_EDGE_PAIR_INFO_HPP
+
+// Shasta.
+#include "invalid.hpp"
+
+// Standard library.
+#include "algorithm.hpp"
+#include "cstdint.hpp"
+
+namespace shasta {
+ class MarkerGraphEdgePairInfo;
+}
+
+
+
+// Information about the read similarity composition of two marker graph edges.
+class shasta::MarkerGraphEdgePairInfo {
+public:
+
+ // The total number of OrientedReadIds in each of the edges A and B.
+ uint64_t totalA = 0;
+ uint64_t totalB = 0;
+
+ // The number of common oriented reads.
+ uint64_t common = 0;
+
+ // The number of oriented reads present in A but not in B.
+ uint64_t onlyA = 0;
+
+ // The number of oriented reads present in B but not in A.
+ uint64_t onlyB = 0;
+
+ // The rest of the statistics are only valid if the number
+ // of common oriented reads is not 0.
+
+ // The estimated offset between the two edges.
+ // The estimate is done using the common oriented reads.
+ int64_t offsetInMarkers = invalid<int64_t>;
+ int64_t offsetInBases = invalid<int64_t>;
+
+ // The number of onlyA reads which are too short to be on edge B,
+ // based on the above estimated offset.
+ uint64_t onlyAShort = invalid<uint64_t>;
+
+ // The number of onlyB reads which are too short to be on edge A,
+ // based on the above estimated offset.
+ uint64_t onlyBShort = invalid<uint64_t>;
+
+ uint64_t intersectionCount() const
+ {
+ return common;
+ }
+ uint64_t unionCount() const {
+ return totalA + totalB - common;
+ }
+ uint64_t correctedUnionCount() const
+ {
+ return unionCount() - onlyAShort - onlyBShort;
+ }
+ double jaccard() const
+ {
+ return double(intersectionCount()) / double(unionCount());
+ }
+ double correctedJaccard() const
+ {
+ return double(intersectionCount()) / double(correctedUnionCount());
+ }
+
+ // Order them by number of common oriented reads.
+ bool operator<(const MarkerGraphEdgePairInfo& that) const
+ {
+ return correctedJaccard() < that.correctedJaccard();
+ }
+ bool operator>(const MarkerGraphEdgePairInfo& that) const
+ {
+ return correctedJaccard() > that.correctedJaccard();
+ }
+
+ void reverse()
+ {
+ swap(totalA, totalB);
+ swap(onlyA, onlyB);
+ swap(onlyAShort, onlyBShort);
+ offsetInMarkers = - offsetInMarkers;
+ offsetInBases = - offsetInBases;
+ }
+
+};
+
+#endif
diff --git a/src/Mode3Assembler.cpp b/src/Mode3Assembler.cpp
new file mode 100644
index 0000000..812d5a6
--- /dev/null
+++ b/src/Mode3Assembler.cpp
@@ -0,0 +1,477 @@
+// Shasta.
+#include "Mode3Assembler.hpp"
+#include "Assembler.hpp"
+#include "AssemblerOptions.hpp"
+#include "deduplicate.hpp"
+#include "dset64-gccAtomic.hpp"
+#include "mode3-AssemblyGraph.hpp"
+#include "mode3-PrimaryGraph.hpp"
+#include "orderPairs.hpp"
+#include "performanceLog.hpp"
+#include "timestamp.hpp"
+using namespace shasta;
+using namespace mode3;
+
+// Standard library.
+#include "iostream.hpp"
+
+// Explicit instantiation.
+#include "MultithreadedObject.tpp"
+template class MultithreadedObject<Mode3Assembler>;
+
+
+
+Mode3Assembler::Mode3Assembler(
+ const Assembler& assembler,
+ uint64_t threadCount,
+ const Mode3AssemblyOptions& options,
+ bool debug) :
+ MultithreadedObject<Mode3Assembler>(*this),
+ MappedMemoryOwner(assembler),
+ assembler(assembler),
+ debug(debug)
+{
+ performanceLog << timestamp << "Mode 3 assembly begins." << endl;
+
+ gatherPrimaryMarkerGraphEdgeIds();
+ computeConnectedComponents();
+ assembleConnectedComponents(threadCount, options, debug);
+
+ performanceLog << timestamp << "Mode 3 assembly ends." << endl;
+}
+
+
+
+void Mode3Assembler::gatherPrimaryMarkerGraphEdgeIds()
+{
+ const auto& markerGraphEdges = assembler.markerGraph.edges;
+
+ primaryMarkerGraphEdgeIds.clear();
+ for(MarkerGraphEdgeId edgeId=0; edgeId<markerGraphEdges.size(); edgeId++) {
+ if(markerGraphEdges[edgeId].isPrimary) {
+ primaryMarkerGraphEdgeIds.push_back(edgeId);
+ }
+ }
+ cout << "Of " << markerGraphEdges.size() << " marker graph edges, " <<
+ primaryMarkerGraphEdgeIds.size() << " are primary." << endl;
+}
+
+
+
+// The oriented reads present in each primary marker graph edge
+// define a bipartite graph. We want to compute connected components
+// of this bipartite graph and process them one at a time.
+void Mode3Assembler::computeConnectedComponents()
+{
+ performanceLog << timestamp << "Mode3Assembler::computeConnectedComponents begins." << endl;
+
+ // Compute connected components of the oriented reads portion
+ // of the bipartite graph.
+ // Here oriented reads are indexed by OrientedReadId::getValue().
+ const uint64_t orientedReadCount = assembler.markers.size();
+ vector<DisjointSets::Aint> disjointSetsData(orientedReadCount);
+ DisjointSets disjointSets(&disjointSetsData[0], orientedReadCount);
+
+ // Loop over all primary marker graph edges.
+ // This could be multithreaded but runs at decent speed as is.
+ for(const MarkerGraphEdgeId edgeId: primaryMarkerGraphEdgeIds) {
+ const auto markerIntervals = assembler.markerGraph.edgeMarkerIntervals[edgeId];
+ SHASTA_ASSERT(not markerIntervals.empty());
+ const OrientedReadId orientedReadId0 = markerIntervals.front().orientedReadId;
+ for(const MarkerInterval& markerInterval: markerIntervals) {
+ const OrientedReadId orientedReadId1 = markerInterval.orientedReadId;
+ disjointSets.unite(orientedReadId0.getValue(), orientedReadId1.getValue());
+ }
+ }
+
+ // Gather the oriented reads in each connected component.
+ vector< vector<OrientedReadId> > componentsOrientedReads(orientedReadCount);
+ for(uint64_t i=0; i<orientedReadCount; i++) {
+ const uint64_t componentId = disjointSets.find(i);
+ componentsOrientedReads[componentId].push_back(OrientedReadId::fromValue(ReadId(i)));
+ }
+
+ // Gather the primary marker graph edges in each connected component.
+ // This stores PrimaryIds, not MarkerGraphEdgeIds.
+ vector< vector<uint64_t> > componentsPrimaryIds(orientedReadCount);
+ for(uint64_t primaryId=0; primaryId<primaryMarkerGraphEdgeIds.size(); primaryId++) {
+ const MarkerGraphEdgeId edgeId = primaryMarkerGraphEdgeIds[primaryId];
+ const auto markerIntervals = assembler.markerGraph.edgeMarkerIntervals[edgeId];
+ SHASTA_ASSERT(not markerIntervals.empty());
+ const OrientedReadId orientedReadId0 = markerIntervals.front().orientedReadId;
+ const uint64_t componentId = disjointSets.find(orientedReadId0.getValue());
+
+ // Check that all MarkerIntervals are in the same component.
+ // THIS CHECK CAN BE REMOVED FOR PERFORMANCE.
+ for(const MarkerInterval& markerInterval: markerIntervals) {
+ const OrientedReadId orientedReadId1 = markerInterval.orientedReadId;
+ SHASTA_ASSERT(disjointSets.find(orientedReadId1.getValue()) == componentId);
+ }
+ componentsPrimaryIds[componentId].push_back(primaryId);
+ }
+
+
+
+ disjointSetsData.clear();
+
+
+
+ // Gather the components with more than one read and their sizes.
+ // The connected components cannot be self-complementary because
+ // we are using read strand separation method 2.
+ // This means that the ReadIds must be all distinct (and increasing).
+ // For each complementary pair, only keep the one
+ // that has the first oriented read on strand 0.
+ vector< pair<uint64_t, uint64_t> > componentTable;
+ for(uint64_t componentId=0; componentId<orientedReadCount; componentId++) {
+ const vector<OrientedReadId>& component = componentsOrientedReads[componentId];
+ const uint64_t componentSize = component.size();
+ if(componentSize < 2) {
+ continue;
+ }
+ if(component.front().getStrand() != 0) {
+ continue;
+ }
+
+ // Verify that the ReadIds are all distinct.
+ // THIS CHECK CAN BE REMOVED FOR PERFORMANCE.
+ for(uint64_t i1=1; i1<component.size(); i1++) {
+ const uint64_t i0 = i1 - 1;
+ SHASTA_ASSERT(component[i0].getReadId() < component[i1].getReadId());
+ }
+
+ // Store this component in the componentTable.
+ componentTable.push_back({componentId, componentSize});
+ }
+
+ // Sort the component table by decreasing size.
+ sort(componentTable.begin(), componentTable.end(),
+ OrderPairsBySecondOnlyGreater<uint64_t, uint64_t>());
+
+ // Store the connected components we kept.
+ connectedComponents.resize(componentTable.size());
+ for(uint64_t i=0; i<connectedComponents.size(); i++) {
+ const uint64_t componentId = componentTable[i].first;
+ connectedComponents[i].orientedReadIds.swap(componentsOrientedReads[componentId]);
+ connectedComponents[i].primaryIds.swap(componentsPrimaryIds[componentId]);
+ }
+
+ // Fill in the orientedReadIdTable.
+ orientedReadIdTable.clear();
+ orientedReadIdTable.resize(orientedReadCount, {invalid<uint64_t>, invalid<uint64_t>});
+ for(uint64_t componentId=0; componentId<connectedComponents.size(); componentId++) {
+ const vector<OrientedReadId>& orientedReadIds = connectedComponents[componentId].orientedReadIds;
+ for(uint64_t position=0; position<orientedReadIds.size(); position++) {
+ const OrientedReadId orientedReadId = orientedReadIds[position];
+ orientedReadIdTable[orientedReadId.getValue()] = {componentId, position};
+ }
+ }
+
+ performanceLog << timestamp << "Mode3Assembler::computeConnectedComponents ends." << endl;
+}
+
+
+
+void Mode3Assembler::assembleConnectedComponents(
+ uint64_t threadCount,
+ const Mode3AssemblyOptions& options,
+ bool debug)
+{
+ performanceLog << timestamp << "Mode3Assembler::assembleConnectedComponents begins." << endl;
+
+ vector< vector<uint64_t> > assemblyChainLengthsByPValue;
+ vector<uint64_t> assemblyBubbleChainLengths;
+
+ ofstream summaryCsv("Components.csv");
+ summaryCsv << "Component,Reads,Segments,Sequence,N50,Total Bubble chain length,Bubble chain N50\n";
+
+ vector< shared_ptr<mode3::AssemblyGraph> > assemblyGraphs;
+ for(uint64_t componentId=0; componentId<connectedComponents.size(); componentId++) {
+ const shared_ptr<AssemblyGraph> assemblyGraph =
+ assembleConnectedComponent(componentId, threadCount, options, true, debug);
+ assemblyGraphs.push_back(assemblyGraph);
+
+ // Chain length statistics.
+ vector< vector<uint64_t> > chainLengths;
+ assemblyGraph->getChainLengthsByPValue(chainLengths);
+
+ // Assembly statistics by P-value.
+ cout << "Assembly statistics by P-Value for component " << componentId << ":" << endl;
+ for(uint64_t pValue=0; pValue<chainLengths.size(); pValue++) {
+ uint64_t totalLength, n50;
+ tie(totalLength, n50) = AssemblyGraph::n50(chainLengths[pValue]);
+ cout << "P-value " << pValue << ": total assembled length " << totalLength <<
+ ", N50 " << n50 << endl;
+ }
+
+ // Combined chain length statistics for this component.
+ vector<uint64_t> allChainLengths;
+ for(const auto& v: chainLengths) {
+ copy(v.begin(), v.end(), back_inserter(allChainLengths));
+ }
+ sort(allChainLengths.begin(), allChainLengths.end(), std::greater<uint64_t>());
+ uint64_t totalLength, n50;
+ tie(totalLength, n50) = AssemblyGraph::n50(allChainLengths);
+ cout << "Combined for this component: total assembled length " << totalLength <<
+ ", N50 " << n50 << endl;
+
+ // Bubble chain length statistics (non-trivial bubble chains only).
+ vector<uint64_t> bubbleChainLengths;
+ assemblyGraph->getBubbleChainLengths(bubbleChainLengths);
+ uint64_t totalBubbleChainLength, bubbleChainN50;
+ tie(totalBubbleChainLength, bubbleChainN50) = AssemblyGraph::n50(bubbleChainLengths);
+ copy(bubbleChainLengths.begin(), bubbleChainLengths.end(),
+ back_inserter(assemblyBubbleChainLengths));
+ cout << "Total non-trivial bubble chain length for this component " << totalBubbleChainLength <<
+ ", N50 " << bubbleChainN50 << endl;
+
+ // Write a line to the summaryCsv.
+ summaryCsv << componentId << ",";
+ summaryCsv << connectedComponents[componentId].orientedReadIds.size() << ",";
+ summaryCsv << allChainLengths.size() << ",";
+ summaryCsv << totalLength << ",";
+ summaryCsv << n50 << ",";
+ summaryCsv << totalBubbleChainLength << ",";
+ summaryCsv << bubbleChainN50 << "\n";
+
+ // Store the chain lengths.
+ if(assemblyChainLengthsByPValue.size() < chainLengths.size()) {
+ assemblyChainLengthsByPValue.resize(chainLengths.size());
+ }
+ for(uint64_t pValue=0; pValue<chainLengths.size(); pValue++) {
+ copy(chainLengths[pValue].begin(), chainLengths[pValue].end(),
+ back_inserter(assemblyChainLengthsByPValue[pValue]));
+ }
+ }
+
+ cout << "Global assembly statistics by P-Value:" << endl;
+ for(uint64_t pValue=0; pValue<assemblyChainLengthsByPValue.size(); pValue++) {
+ sort(assemblyChainLengthsByPValue[pValue].begin(), assemblyChainLengthsByPValue[pValue].end(),
+ std::greater<uint64_t>());
+ uint64_t totalLength, n50;
+ tie(totalLength, n50) = AssemblyGraph::n50(assemblyChainLengthsByPValue[pValue]);
+ cout << "P-value " << pValue << ": total assembled length " << totalLength <<
+ ", N50 " << n50 << endl;
+ }
+ vector<uint64_t> allChainLengths;
+ for(const auto& v: assemblyChainLengthsByPValue) {
+ copy(v.begin(), v.end(), back_inserter(allChainLengths));
+ }
+ sort(allChainLengths.begin(), allChainLengths.end(), std::greater<uint64_t>());
+ uint64_t totalLength, n50;
+ tie(totalLength, n50) = AssemblyGraph::n50(allChainLengths);
+ cout << "Global assembly statistics, combined for all P-values: total assembled length " << totalLength <<
+ ", N50 " << n50 << endl;
+
+ sort(assemblyBubbleChainLengths.begin(), assemblyBubbleChainLengths.end(), std::greater<uint64_t>());
+ uint64_t totalBubbleChainLength, bubbleChainN50;
+ tie(totalBubbleChainLength, bubbleChainN50) = AssemblyGraph::n50(assemblyBubbleChainLengths);
+ cout << "Total non-trivial bubble chain length " << totalBubbleChainLength <<
+ ", N50 " << bubbleChainN50 << endl;
+
+
+ // Create a csv file with one line for each assembled segment.
+ // This can also be loaded in Bandage.
+ {
+ ofstream csv("Assembly.csv");
+ csv << "Chain,Connectivity,Component,Bubble chain,Position in bubble chain,Index in bubble,"
+ "Sequence length,Primary coverage,P value,Color,"
+ "Preceded by,Followed by,"
+ "\n";
+        for(const shared_ptr<mode3::AssemblyGraph>& assemblyGraph: assemblyGraphs) {
+ assemblyGraph->writeCsvSummary(csv);
+ }
+ }
+
+ // Create a global FASTA file with output from all the connected components.
+ {
+ ofstream fasta("Assembly.fasta");
+ for(const shared_ptr<mode3::AssemblyGraph>& assemblyGraph: assemblyGraphs) {
+ assemblyGraph->writeFastaExpanded(fasta);
+ }
+ }
+
+ // Create a global GFA file with output from all the connected components.
+ {
+ ofstream gfa("Assembly.gfa");
+ AssemblyGraph::writeGfaHeader(gfa);
+ for(const shared_ptr<mode3::AssemblyGraph>& assemblyGraph: assemblyGraphs) {
+ assemblyGraph->writeGfaSegmentsExpanded(gfa, true, true);
+ }
+ for(const shared_ptr<mode3::AssemblyGraph>& assemblyGraph: assemblyGraphs) {
+ assemblyGraph->writeGfaLinksExpanded(gfa);
+ }
+ }
+
+ // Also create a global GFA file without sequence.
+ ofstream gfa("Assembly-NoSequence.gfa");
+ {
+ AssemblyGraph::writeGfaHeader(gfa);
+ for(const shared_ptr<mode3::AssemblyGraph>& assemblyGraph: assemblyGraphs) {
+ assemblyGraph->writeGfaSegmentsExpanded(gfa, false, true);
+ }
+ for(const shared_ptr<mode3::AssemblyGraph>& assemblyGraph: assemblyGraphs) {
+ assemblyGraph->writeGfaLinksExpanded(gfa);
+ }
+ }
+
+ performanceLog << timestamp << "Mode3Assembler::assembleConnectedComponents ends." << endl;
+}
+
+
+
+shared_ptr<AssemblyGraph> Mode3Assembler::assembleConnectedComponent(
+ uint64_t componentId,
+ uint64_t threadCount,
+ const Mode3AssemblyOptions& options,
+ bool assembleSequence,
+ bool debug)
+{
+ performanceLog << timestamp << "Assembling connected component " <<
+ componentId << " of " << connectedComponents.size() << endl;
+ cout << timestamp << "Assembling connected component " <<
+ componentId << " of " << connectedComponents.size() << endl;
+
+ const ConnectedComponent& connectedComponent = connectedComponents[componentId];
+ const vector<OrientedReadId>& orientedReadIds = connectedComponent.orientedReadIds;
+ const vector<uint64_t>& primaryIds = connectedComponent.primaryIds;
+
+ cout << "This connected component has " << orientedReadIds.size() <<
+ " reads and " << primaryIds.size() << " primary marker graph edges." << endl;
+
+
+
+ // We need to compute the primary journey of each oriented read,
+ // that is, the sequence of primary edges encountered by each read.
+ // We store each journey as a vector of pairs of
+ // (ordinal0, localPrimaryId), where localPrimaryId is an index into primaryIds
+ // for this connected component.
+ vector< vector< pair<uint32_t, uint64_t> > > journeys(orientedReadIds.size());
+
+ performanceLog << timestamp << "Journey computation begins." << endl;
+ for(uint64_t localPrimaryId=0; localPrimaryId<primaryIds.size(); localPrimaryId++) {
+ const uint64_t primaryId = primaryIds[localPrimaryId];
+ const MarkerGraphEdgeId edgeId = primaryMarkerGraphEdgeIds[primaryId];
+ const auto markerIntervals = assembler.markerGraph.edgeMarkerIntervals[edgeId];
+ for(const MarkerInterval& markerInterval: markerIntervals) {
+ const OrientedReadId orientedReadId = markerInterval.orientedReadId;
+ const uint32_t ordinal0 = markerInterval.ordinals[0];
+ const auto& p = orientedReadIdTable[orientedReadId.getValue()];
+ SHASTA_ASSERT(p.first == componentId);
+ journeys[p.second].push_back({ordinal0, localPrimaryId});
+ }
+ }
+ for(vector< pair<uint32_t, uint64_t> >& journey: journeys) {
+ sort(journey.begin(), journey.end(), OrderPairsByFirstOnly<uint32_t, uint64_t>());
+ }
+ performanceLog << timestamp << "Journey computation ends." << endl;
+
+#if 0
+ // Check that the journeys computed in this way are identical to the ones stored in the MarkerGraph.
+ // The ones stored in the MarkerGraph will eventually go away.
+ for(uint64_t i=0; i<orientedReadIds.size(); i++) {
+ const OrientedReadId orientedReadId = orientedReadIds[i];
+ const auto journey = journeys[i];
+ const auto storedJourney = assembler.markerGraph.primaryJourneys[orientedReadId.getValue()];
+ SHASTA_ASSERT(journey.size() == storedJourney.size());
+
+ for(uint64_t j=0; j<journey.size(); j++) {
+ const auto& p = journey[j];
+ const uint64_t localPrimaryId = p.second;
+ const uint64_t primaryId = primaryIds[localPrimaryId];
+ const MarkerGraphEdgeId edgeId = primaryMarkerGraphEdgeIds[primaryId];
+ // cout << orientedReadId << " " << storedJourney[j].edgeId << " " << edgeId << endl;
+ SHASTA_ASSERT(edgeId == storedJourney[j].edgeId);
+ }
+ }
+#endif
+
+
+ // Now we can create the PrimaryGraph for this connected component.
+ PrimaryGraph primaryGraph;
+
+ // Create the vertices first.
+ vector<PrimaryGraph::vertex_descriptor> vertexDescriptors;
+ for(uint64_t localPrimaryId=0; localPrimaryId<primaryIds.size(); localPrimaryId++) {
+ const uint64_t primaryId = primaryIds[localPrimaryId];
+ const MarkerGraphEdgeId edgeId = primaryMarkerGraphEdgeIds[primaryId];
+ vertexDescriptors.push_back(primaryGraph.addVertex(edgeId));
+ }
+
+
+
+ // To generate edges of the PrimaryGraph, we need to gather pairs of consecutive
+ // journey entries. Each pair (localPrimaryId0, localPrimaryId1) is stored
+ // as a localPrimaryId1 in journeyPairs[localPrimaryId0].
+ // For now use a simple vector of vector and sequential code, but later
+ // switch to MemoryMapped::VectorOfVectors<uint64_t, uint64_t> and multithreaded code.
+ vector< vector<uint64_t> > journeyPairs(primaryIds.size());
+ performanceLog << timestamp << "PrimaryGraph edge creation begins." << endl;
+ for(const auto& journey: journeys) {
+ for(uint64_t i1=1; i1<journey.size(); i1++) {
+ const uint64_t i0 = i1 - 1;
+ const uint64_t localPrimaryId0 = journey[i0].second;
+ const uint64_t localPrimaryId1 = journey[i1].second;
+ journeyPairs[localPrimaryId0].push_back(localPrimaryId1);
+ }
+ }
+ vector<uint64_t> count;
+ for(uint64_t localPrimaryId0=0; localPrimaryId0<primaryIds.size(); localPrimaryId0++) {
+ const PrimaryGraph::vertex_descriptor v0 = vertexDescriptors[localPrimaryId0];
+ const MarkerGraphEdgeId edgeId0 = primaryGraph[v0].edgeId;
+ auto journeyPairs0 = journeyPairs[localPrimaryId0];
+ deduplicateAndCount(journeyPairs0, count);
+ SHASTA_ASSERT(journeyPairs0.size() == count.size());
+ for(uint64_t j=0; j<journeyPairs0.size(); j++) {
+ const uint64_t localPrimaryId1 = journeyPairs0[j];
+ const uint64_t coverage = count[j];
+ const PrimaryGraph::vertex_descriptor v1 = vertexDescriptors[localPrimaryId1];
+ const MarkerGraphEdgeId edgeId1 = primaryGraph[v1].edgeId;
+ MarkerGraphEdgePairInfo info;
+ SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(edgeId0, edgeId1, info));
+ primaryGraph.addEdgeFromVertexDescriptors(v0, v1, info, coverage);
+ }
+ }
+ performanceLog << timestamp << "PrimaryGraph edge creation ends." << endl;
+
+ cout << "The PrimaryGraph for this connected component has " <<
+ num_vertices(primaryGraph) << " vertices and " << num_edges(primaryGraph) << " edges." << endl;
+
+
+ // Graphviz output.
+ if(debug) {
+ PrimaryGraphDisplayOptions options;
+ options.showNonTransitiveReductionEdges = true;
+ primaryGraph.writeGraphviz(
+ "PrimaryGraphInitial" + to_string(componentId), options, assembler.markerGraph);
+ options.makeCompact();
+ primaryGraph.writeGraphviz(
+ "PrimaryGraphCompactInitial" + to_string(componentId), options, assembler.markerGraph);
+ primaryGraph.writeEdgeCoverageHistogram("PrimaryGraphInitial" + to_string(componentId) + "-EdgeCoverageHistogram.csv");
+ }
+
+    // Remove weak edges.
+ primaryGraph.removeWeakEdges(options.primaryGraphOptions.maxLoss);
+
+ // Remove cross-edges.
+ primaryGraph.removeCrossEdges(
+ options.primaryGraphOptions.crossEdgesLowCoverageThreshold,
+ options.primaryGraphOptions.crossEdgesHighCoverageThreshold,
+ 0);
+
+ // Graphviz output.
+ if(debug) {
+ PrimaryGraphDisplayOptions options;
+ options.showNonTransitiveReductionEdges = false;
+ primaryGraph.writeGraphviz(
+ "PrimaryGraph" + to_string(componentId), options, assembler.markerGraph);
+ options.makeCompact();
+ primaryGraph.writeGraphviz(
+ "PrimaryGraphCompact" + to_string(componentId), options, assembler.markerGraph);
+ }
+
+ // Create the assembly graph for this connected component.
+ return make_shared<AssemblyGraph>(
+ primaryGraph, componentId, assembler, threadCount,
+ options, assembleSequence, debug);
+}
diff --git a/src/Mode3Assembler.hpp b/src/Mode3Assembler.hpp
new file mode 100644
index 0000000..a7a8773
--- /dev/null
+++ b/src/Mode3Assembler.hpp
@@ -0,0 +1,80 @@
+#pragma once
+
+// Shasta.
+#include "MappedMemoryOwner.hpp"
+#include "MultithreadedObject.hpp"
+#include "ReadId.hpp"
+#include "shastaTypes.hpp"
+
+// Standard library.
+#include "memory.hpp"
+#include "utility.hpp"
+#include "vector.hpp"
+
+namespace shasta {
+ class Mode3Assembler;
+ class Assembler;
+ class Mode3AssemblyOptions;
+ namespace mode3 {
+ class AssemblyGraph;
+ }
+}
+
+
+class shasta::Mode3Assembler :
+ public MultithreadedObject<Mode3Assembler>,
+ public MappedMemoryOwner {
+public:
+ Mode3Assembler(
+ const Assembler&,
+ uint64_t threadCount,
+ const Mode3AssemblyOptions&,
+ bool debug);
+private:
+ const Assembler& assembler;
+ bool debug;
+
+ // The MarkerGraphEdgeIds of the primary marker graph edges.
+ // These are sorted.
+ // An index in this vector is called PrimaryId.
+ vector<MarkerGraphEdgeId> primaryMarkerGraphEdgeIds;
+ void gatherPrimaryMarkerGraphEdgeIds();
+
+ // The oriented reads present in each primary marker graph edge
+ // define a bipartite graph. We want to compute connected components
+ // of this bipartite graph and process them one at a time.
+ // These are also connected components of the global primary graph
+ // (with one vertex for each primary marker graph edge,
+ // and edges created by following the reads).
+ class ConnectedComponent {
+ public:
+ // The oriented reads in this connected component.
+ vector<OrientedReadId> orientedReadIds;
+
+ // The PrimaryIds of the marker graph edges in this connected component.
+ // These are indices into primaryMarkerGraphEdgeIds.
+ vector<uint64_t> primaryIds;
+ };
+ vector<ConnectedComponent> connectedComponents;
+ void computeConnectedComponents();
+
+ // For each oriented read, store which ConnectedComponent it belongs to,
+ // and at what position.
+ // Indexed by OrientedReadId::getValue().
+ // For each OrientedReadId we store a pair (componentId, position),
+ // where componentId is the index in the connectedComponents vector
+ // and position is the index in the orientedReadIds vector
+ // for that connected component.
+ vector< pair<uint64_t, uint64_t> > orientedReadIdTable;
+
+ void assembleConnectedComponents(
+ uint64_t threadCount,
+ const Mode3AssemblyOptions&,
+ bool debug);
+ shared_ptr<mode3::AssemblyGraph> assembleConnectedComponent(
+ uint64_t componentId,
+ uint64_t threadCount,
+ const Mode3AssemblyOptions&,
+ bool assembleSequence,
+ bool debug);
+};
diff --git a/src/PythonModule.cpp b/src/PythonModule.cpp
index 5812058..2a3aed0 100644
--- a/src/PythonModule.cpp
+++ b/src/PythonModule.cpp
@@ -12,8 +12,11 @@
#include "deduplicate.hpp"
#include "dset64Test.hpp"
#include "diploidBayesianPhase.hpp"
+#include "enumeratePaths.hpp"
#include "shastaLapack.hpp"
+#include "globalMsa.hpp"
#include "LongBaseSequence.hpp"
+#include "longestPath.hpp"
#include "mappedCopy.hpp"
#include "MedianConsensusCaller.hpp"
#include "MemoryMappedAllocator.hpp"
@@ -154,37 +157,8 @@ PYBIND11_MODULE(shasta, shastaModule)
// K-mers.
.def("accessKmers",
&Assembler::accessKmers)
- .def("writeKmers",
- &Assembler::writeKmers,
- arg("fileName") = "Kmers.csv")
- .def("randomlySelectKmers",
- &Assembler::randomlySelectKmers,
- arg("k"),
- arg("probability"),
- arg("seed") = 231)
- .def("selectKmersBasedOnFrequency",
- &Assembler::selectKmersBasedOnFrequency,
- arg("k"),
- arg("markerDensity"),
- arg("seed") = 231,
- arg("enrichmentThreshold"),
- arg("threadCount") = 0)
- .def("selectKmers2",
- &Assembler::selectKmers2,
- arg("k"),
- arg("markerDensity"),
- arg("seed") = 231,
- arg("enrichmentThreshold"),
- arg("threadCount") = 0)
- .def("selectKmers4",
- &Assembler::selectKmers4,
- arg("k"),
- arg("markerDensity"),
- arg("seed") = 231,
- arg("distanceThreshold"),
- arg("threadCount") = 0)
-
-
+ .def("accessKmerChecker",
+ &Assembler::accessKmerChecker)
// Markers.
.def("accessMarkers",
@@ -203,10 +177,6 @@ PYBIND11_MODULE(shasta, shastaModule)
arg("readId"),
arg("strand"),
arg("fileName"))
- .def("getMarkers",
- &Assembler::getMarkers)
- .def("writeMarkerFrequency",
- &Assembler::writeMarkerFrequency)
.def("computeSortedMarkers",
&Assembler::computeSortedMarkers,
arg("threadCount") = 0)
@@ -226,16 +196,6 @@ PYBIND11_MODULE(shasta, shastaModule)
arg("maxBucketSize"),
arg("minFrequency"),
arg("threadCount") = 0)
- .def("findAlignmentCandidatesLowHash1",
- &Assembler::findAlignmentCandidatesLowHash1,
- arg("m"),
- arg("hashFraction"),
- arg("minHashIterationCount"),
- arg("log2MinHashBucketCount") = 0,
- arg("minBucketSize"),
- arg("maxBucketSize"),
- arg("minFrequency"),
- arg("threadCount") = 0)
.def("accessAlignmentCandidates",
&Assembler::accessAlignmentCandidates)
.def("accessAlignmentCandidateTable",
@@ -263,8 +223,6 @@ PYBIND11_MODULE(shasta, shastaModule)
&Assembler::writeAlignmentCandidates,
arg("useReadName") = false,
arg("verbose") = false)
- .def("writeAlignmentDetails",
- &Assembler::writeAlignmentDetails)
.def("writeLocalAlignmentCandidateReads",
&Assembler::writeLocalAlignmentCandidateReads,
arg("readId"),
@@ -305,7 +263,7 @@ PYBIND11_MODULE(shasta, shastaModule)
(ReadId, Strand, ReadId, Strand,
uint64_t, uint64_t, uint64_t, uint64_t,
uint64_t, double, uint64_t, uint64_t, uint64_t, uint64_t,
- int64_t, int64_t, int64_t) const
+ int64_t, int64_t, int64_t)
)
&Assembler::alignOrientedReads4,
arg("readId0"),
@@ -355,6 +313,10 @@ PYBIND11_MODULE(shasta, shastaModule)
arg("strand0"),
arg("readId1"),
arg("strand1"))
+ .def("computeMarkerKmerIds",
+ &Assembler::computeMarkerKmerIds)
+ .def("cleanupMarkerKmerIds",
+ &Assembler::cleanupMarkerKmerIds)
@@ -437,6 +399,8 @@ PYBIND11_MODULE(shasta, shastaModule)
.def("accessMarkerGraphVertices",
&Assembler::accessMarkerGraphVertices,
arg("readWriteAccess") = false)
+ .def("accessDisjointSetsHistogram",
+ &Assembler::accessDisjointSetsHistogram)
.def("getGlobalMarkerGraphVertex",
(
MarkerGraph::VertexId (Assembler::*)
@@ -481,8 +445,6 @@ PYBIND11_MODULE(shasta, shastaModule)
arg("pattern2CreateNewVertices"))
.def("getMarkerGraphMinCoverageUsed",
&Assembler::getMarkerGraphMinCoverageUsed)
- .def("vertexCoverageStatisticsByKmerId",
- &Assembler::vertexCoverageStatisticsByKmerId)
// Edges of the global marker graph.
.def("createMarkerGraphEdges",
@@ -520,11 +482,6 @@ PYBIND11_MODULE(shasta, shastaModule)
arg("highCoverageThreshold"),
arg("maxDistance"),
arg("edgeMarkerSkipThreshold"))
- .def("reverseTransitiveReduction",
- &Assembler::reverseTransitiveReduction,
- arg("lowCoverageThreshold"),
- arg("highCoverageThreshold"),
- arg("maxDistance"))
.def("pruneMarkerGraphStrongSubgraph",
&Assembler::pruneMarkerGraphStrongSubgraph,
arg("iterationCount"))
@@ -621,7 +578,7 @@ PYBIND11_MODULE(shasta, shastaModule)
.def("assembleAssemblyGraphEdge",
(
AssembledSegment (Assembler::*)
- (AssemblyGraph::EdgeId, bool)
+ (mode0::AssemblyGraph::EdgeId, bool)
)
&Assembler::assembleAssemblyGraphEdge,
arg("edgeId"),
@@ -638,8 +595,6 @@ PYBIND11_MODULE(shasta, shastaModule)
arg("diagonalReadCountMin"),
arg("offDiagonalReadCountMax"),
arg("offDiagonalRatio"))
- .def("alignPseudoPaths",
- &Assembler::alignPseudoPaths)
.def("removeAssemblyGraph",
&Assembler::removeAssemblyGraph)
@@ -655,18 +610,11 @@ PYBIND11_MODULE(shasta, shastaModule)
// Assembly mode 3.
.def("mode3Assembly",
- &Assembler::mode3Assembly,
- arg("threadCount") = 0)
- .def("accessMode3AssemblyGraph",
- &Assembler::accessMode3AssemblyGraph)
- .def("analyzeMode3Subgraph",
- &Assembler::analyzeMode3Subgraph)
- .def("createMode3PathGraph",
- &Assembler::createMode3PathGraph)
- .def("createMode3Detangler",
- &Assembler::createMode3Detangler)
-
-
+ &Assembler::mode3Assembly)
+ .def("mode3AssembleComponent",
+ &Assembler::mode3AssembleComponent)
+ .def("flagPrimaryMarkerGraphEdges",
+ &Assembler::flagPrimaryMarkerGraphEdges)
// Consensus caller.
.def("setupConsensusCaller",
@@ -708,6 +656,18 @@ PYBIND11_MODULE(shasta, shastaModule)
+ // Expose portions of class AssemblerOptions to Python.
+ class_<AssemblerOptions>(shastaModule, "AssemblerOptions")
+ .def(pybind11::init<const string&>())
+ .def_readonly("assemblyOptions", &AssemblerOptions::assemblyOptions)
+ ;
+ class_<AssemblyOptions>(shastaModule, "AssemblyOptions")
+ .def_readonly("mode3Options", &AssemblyOptions::mode3Options)
+ ;
+ class_<Mode3AssemblyOptions>(shastaModule, "Mode3AssemblyOptions");
+
+
+
// Constants.
shastaModule.attr("invalidGlobalMarkerGraphVertexId") = MarkerGraph::invalidVertexId;
shastaModule.attr("invalidCompressedGlobalMarkerGraphVertexId") =
@@ -785,6 +745,15 @@ PYBIND11_MODULE(shasta, shastaModule)
shastaModule.def("testSubsetGraph",
testSubsetGraph
);
+ shastaModule.def("testLongestPath",
+ testLongestPath
+ );
+ shastaModule.def("testEnumeratePaths",
+ testEnumeratePaths
+ );
+ shastaModule.def("globalMsaPython",
+ globalMsaPython
+ );
}
#endif
diff --git a/src/ReadFlags.hpp b/src/ReadFlags.hpp
index bf83ff9..094cca6 100644
--- a/src/ReadFlags.hpp
+++ b/src/ReadFlags.hpp
@@ -10,6 +10,14 @@ namespace shasta {
class shasta::ReadFlags {
public:
+ // Set if we have other reads with the same name.
+ uint8_t isDuplicate : 1;
+
+ // Set if this read is not to be used in the assembly
+ // due to the presence of duplicates.
+ // The way this is set is determined by the value of --Reads.handleDuplicates:
+ uint8_t discardDueToDuplicates : 1;
+
// This is set for reads that are approximate palindromic,
// that is, are well aligned with their own reverse complement.
uint8_t isPalindromic : 1;
diff --git a/src/Reads.cpp b/src/Reads.cpp
index 4a2ce50..3651d46 100644
--- a/src/Reads.cpp
+++ b/src/Reads.cpp
@@ -242,23 +242,32 @@ uint64_t Reads::getReadRawSequenceLength(ReadId readId) const
// representation of an oriented read.
vector<uint32_t> Reads::getRawPositions(OrientedReadId orientedReadId) const
{
- const ReadId readId = orientedReadId.getReadId();
- const ReadId strand = orientedReadId.getStrand();
- const auto repeatCounts = readRepeatCounts[readId];
- const uint64_t n = repeatCounts.size();
-
vector<uint32_t> v;
- uint32_t position = 0;
- for(uint64_t i=0; i<n; i++) {
- v.push_back(position);
- uint8_t count;
- if(strand == 0) {
- count = repeatCounts[i];
- } else {
- count = repeatCounts[n-1-i];
+ if(representation == 1) {
+ const ReadId readId = orientedReadId.getReadId();
+ const ReadId strand = orientedReadId.getStrand();
+ const auto repeatCounts = readRepeatCounts[readId];
+ const uint64_t n = repeatCounts.size();
+
+ uint32_t position = 0;
+ for(uint64_t i=0; i<n; i++) {
+ v.push_back(position);
+ uint8_t count;
+ if(strand == 0) {
+ count = repeatCounts[i];
+ } else {
+ count = repeatCounts[n-1-i];
+ }
+ position += count;
+ }
+ } else {
+
+ // If not using RLE, raw positions are the same as RLE positions.
+ const ReadId readId = orientedReadId.getReadId();
+ for(uint32_t i=0; i<reads[readId].baseCount; i++) {
+ v.push_back(i);
}
- position += count;
}
return v;
@@ -541,3 +550,113 @@ ReadId Reads::getReadId(const span<const char>& readName) const
}
}
+
+
+// Find duplicate reads, as determined by name (not sequence).
+// This also sets the isDuplicate and discardDueToDuplicates read flags
+// and summarizes what it found in DuplicateReads.csv.
+void Reads::findDuplicates(const string& handleDuplicates)
+{
+ const uint64_t readCount = reads.size();
+ SHASTA_ASSERT(readFlags.size() == readCount);
+ SHASTA_ASSERT(readNames.size() == readCount);
+
+ // Set bool variables corresponding to the permitted values of handleDuplicates.
+ bool useAllCopies = false;
+ bool useOneCopy = false;
+ bool useNone = false;
+ bool forbid = false;
+ if(handleDuplicates == "useAllCopies") {
+ useAllCopies = true;
+ } else if(handleDuplicates == "useOneCopy") {
+ useOneCopy = true;
+ } else if(handleDuplicates == "useNone") {
+ useNone = true;
+ } else if(handleDuplicates == "forbid") {
+ forbid = true;
+ } else {
+ throw runtime_error("Invalid value " + handleDuplicates + " specified for --Reads.handleDuplicates. "
+ "Must be one of: useAllCopies, useOneCopy, useNone, forbid.");
+ }
+
+ uint64_t discardedCount = 0;
+ vector<uint64_t> duplicatedReadIds;
+ for(uint64_t i=0; i<readCount; i++) {
+ const uint64_t readId = readIdsSortedByName[i];
+ const auto name = readNames[readId];
+
+ // Find out if the name is the same as the
+ // name of the previous read, in order sorted by name.
+ bool hasSameNameAsPrevious = false;
+ if(i != 0) {
+ const auto previousName = readNames[readIdsSortedByName[i - 1]];
+ hasSameNameAsPrevious = equal(
+ name.begin(), name.end(),
+ previousName.begin(), previousName.end());
+ }
+
+ // Find out if the name is the same as the
+ // name of the next read, in order sorted by name.
+ bool hasSameNameAsNext = false;
+ if(i < readCount - 1) {
+ const auto nextName = readNames[readIdsSortedByName[i + 1]];
+ hasSameNameAsNext = equal(
+ name.begin(), name.end(),
+ nextName.begin(), nextName.end());
+ }
+
+ // Set the isDuplicate flag for this read.
+ ReadFlags& flags = readFlags[readId];
+ flags.isDuplicate = uint8_t(hasSameNameAsPrevious or hasSameNameAsNext);
+
+ // Set the discardDueToDuplicates flag for this read.
+ if(useAllCopies) {
+ flags.discardDueToDuplicates = uint8_t(false);
+ } else if(useOneCopy) {
+ flags.discardDueToDuplicates = uint8_t(hasSameNameAsPrevious);
+ } else if(useNone) {
+ flags.discardDueToDuplicates = flags.isDuplicate;
+ } else if(forbid) {
+ // This does not really matter because in this case the assembly will stop.
+ flags.discardDueToDuplicates = flags.isDuplicate;
+ }
+
+ // Increment counts.
+ if(flags.isDuplicate) {
+ duplicatedReadIds.push_back(readId);
+ }
+ if(flags.discardDueToDuplicates) {
+ ++discardedCount;
+ }
+ }
+
+ cout << "Found " << duplicatedReadIds.size() << " reads with duplicate names." << endl;
+ cout << "Discarded from the assembly " << discardedCount << " reads with duplicate names." << endl;
+
+
+
+ // Write a csv file with details of the duplicate reads.
+ ofstream csv("DuplicateReads.csv");
+ csv << "Id,Discarded,Name,MetaData\n";
+ for(const uint64_t readId: duplicatedReadIds) {
+ const ReadFlags& flags = readFlags[readId];
+ if(flags.isDuplicate) {
+ csv << readId << ",";
+ csv << (flags.discardDueToDuplicates ? "Yes" : "No") << ",";
+
+ const auto name = readNames[readId];
+ copy(name.begin(), name.end(), ostream_iterator<char>(csv));
+ csv << ",";
+
+ const auto metaData = readMetaData[readId];
+ copy(metaData.begin(), metaData.end(), ostream_iterator<char>(csv));
+ csv << "\n";
+ }
+ }
+
+ // If there are duplicates, stop the assembly, if requested.
+ if(forbid and duplicatedReadIds.size() > 0) {
+ throw runtime_error("Stopping assembly because reads with duplicate names were found "
+ "and --Reads.handleDuplicates is set to forbid.");
+ }
+}
diff --git a/src/Reads.hpp b/src/Reads.hpp
index 5b9cc8d..06215b1 100644
--- a/src/Reads.hpp
+++ b/src/Reads.hpp
@@ -256,6 +256,11 @@ public:
uint64_t& discardedShortReadBases
);
+ // Find duplicate reads, as determined by name (not sequence).
+ // This also sets the isDuplicate and discardDueToDuplicates read flags
+// and summarizes what it found in DuplicateReads.csv.
+ void findDuplicates(const string& handleDuplicates);
+
void remove();
uint64_t representation; // 0 = raw sequence, 1 = RLE sequence
diff --git a/src/ShortBaseSequence.cpp b/src/ShortBaseSequence.cpp
index 7f18e72..b8fe346 100644
--- a/src/ShortBaseSequence.cpp
+++ b/src/ShortBaseSequence.cpp
@@ -9,7 +9,7 @@ using namespace shasta;
void shasta::testShortBaseSequence()
{
- ShortBaseSequence8 s;
+ ShortBaseSequence16 s;
s.set(0, Base::fromCharacter('T'));
s.set(1, Base::fromCharacter('C'));
s.set(2, Base::fromCharacter('G'));
@@ -19,13 +19,39 @@ void shasta::testShortBaseSequence()
cout << s << endl;
// const auto oldFill = cout.fill('0');
- for(const uint8_t x: s.data) {
+ for(const uint16_t x: s.data) {
cout << std::setw(2) << std::hex << int(x) << endl;
// cout << int(x) << endl;
}
// cout.fill(oldFill);
// Check that constructor from id does the inverse of function id().
- const ShortBaseSequence8 t(s.id(4), 4);
+ const ShortBaseSequence16 t(s.id(4), 4);
SHASTA_ASSERT(t == s);
+
+
+ // Verify that the KmerId for a k-mer of given length k is the
+ // same regardless of how the k-mer is stored.
+ {
+ const string sequenceString = "TCGAGCTTAG";
+ const uint64_t k = sequenceString.size();
+
+ ShortBaseSequence16 s16;
+ ShortBaseSequence32 s32;
+ ShortBaseSequence64 s64;
+ for(uint64_t i=0; i<k; i++) {
+ const Base base = Base::fromCharacter(sequenceString[i]);
+ s16.set(i, base);
+ s32.set(i, base);
+ s64.set(i, base);
+ }
+ const uint64_t kmerId16 = s16.id(k);
+ const uint64_t kmerId32 = s32.id(k);
+ const uint64_t kmerId64 = s64.id(k);
+
+ cout << kmerId16 << " " << kmerId32 << " " << kmerId64 << endl;
+ SHASTA_ASSERT(kmerId16 == kmerId32);
+ SHASTA_ASSERT(kmerId32 == kmerId64);
+
+ }
}
diff --git a/src/ShortBaseSequence.hpp b/src/ShortBaseSequence.hpp
index 7c848f7..39944be 100644
--- a/src/ShortBaseSequence.hpp
+++ b/src/ShortBaseSequence.hpp
@@ -3,6 +3,7 @@
// shasta.
#include "Base.hpp"
+#include "bitReversal.hpp"
// Standard library.
#include "array.hpp"
@@ -106,7 +107,7 @@ public:
}
// Return the reverse complement of the first n bases.
- ShortBaseSequence<Int> reverseComplement(uint64_t n) const
+ ShortBaseSequence<Int> reverseComplementSlow(uint64_t n) const
{
ShortBaseSequence<Int> reverseComplementedSequence;
for(size_t i=0; i<n; i++) {
@@ -116,11 +117,39 @@ public:
return reverseComplementedSequence;
}
+
+
+ // Return the reverse complement of the first n bases.
+ // Use bit reversal for speed. This avoids a loop over the n bases.
+ ShortBaseSequence<Int> reverseComplement(uint64_t n) const
+ {
+ const Int shift = Int(capacity - n);
+ const Int mask = Int(1ULL << n) - Int(1);
+ ShortBaseSequence<Int> reverseComplementedSequence;
+ reverseComplementedSequence.data[0] = Int(((~bitReversal(data[0])) & mask) << shift);
+ reverseComplementedSequence.data[1] = Int(((~bitReversal(data[1])) & mask) << shift);
+
+#if 0
+ // Testing.
+ SHASTA_ASSERT(reverseComplementedSequence == reverseComplementSlow(n));
+ SHASTA_ASSERT(reverseComplementedSequence.reverseComplementSlow(n) == *this);
+#endif
+
+ return reverseComplementedSequence;
+ }
+
+
+
bool operator==(const ShortBaseSequence<Int>& that) const
{
return data == that.data;
}
+ bool operator<(const ShortBaseSequence<Int>& that) const
+ {
+ return data < that.data;
+ }
+
// Write the first n bases.
ostream& write(ostream& s, uint64_t n) const
{
diff --git a/src/approximateTopologicalSort.hpp b/src/approximateTopologicalSort.hpp
index 00a0af9..effd036 100644
--- a/src/approximateTopologicalSort.hpp
+++ b/src/approximateTopologicalSort.hpp
@@ -51,6 +51,7 @@ Only the last edge processed will be classified as causing a cycle.
#include <boost/graph/iteration_macros.hpp>
#include <stack>
+#include "utility.hpp"
#include "vector.hpp"
namespace shasta {
diff --git a/src/assembleMarkerGraphPath.cpp b/src/assembleMarkerGraphPath.cpp
index ed140f7..e615e95 100644
--- a/src/assembleMarkerGraphPath.cpp
+++ b/src/assembleMarkerGraphPath.cpp
@@ -7,6 +7,7 @@ using namespace shasta;
void shasta::assembleMarkerGraphPath(
uint64_t readRepresentation,
uint64_t k,
+ const Reads& reads,
const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
const MarkerGraph& markerGraph,
const span<const MarkerGraph::EdgeId>& markerGraphPath,
@@ -53,9 +54,7 @@ void shasta::assembleMarkerGraphPath(
for(size_t i=0; i<assembledSegment.vertexCount; i++) {
// Get the sequence.
- const MarkerId firstMarkerId = markerGraph.getVertexMarkerIds(assembledSegment.vertexIds[i])[0];
- const CompressedMarker& firstMarker = markers.begin()[firstMarkerId];
- const KmerId kmerId = firstMarker.kmerId;
+ const KmerId kmerId = markerGraph.getVertexKmerId(assembledSegment.vertexIds[i], k, reads, markers);
const Kmer kmer(kmerId, k);
if(readRepresentation == 1) {
diff --git a/src/assembleMarkerGraphPath.hpp b/src/assembleMarkerGraphPath.hpp
index f3ded2f..a5ec3b6 100644
--- a/src/assembleMarkerGraphPath.hpp
+++ b/src/assembleMarkerGraphPath.hpp
@@ -8,10 +8,12 @@
namespace shasta {
class AssembledSegment;
+ class Reads;
void assembleMarkerGraphPath(
uint64_t readRepresentation,
uint64_t k,
+ const Reads& reads,
const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
const MarkerGraph&,
const span<const MarkerGraph::EdgeId>& markerGraphPath,
diff --git a/src/bitReversal.hpp b/src/bitReversal.hpp
new file mode 100644
index 0000000..04ed9c6
--- /dev/null
+++ b/src/bitReversal.hpp
@@ -0,0 +1,54 @@
+#ifndef SHASTA_BIT_REVERSAL_HPP
+#define SHASTA_BIT_REVERSAL_HPP
+
+// See https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
+
+#include "cstdint.hpp"
+
+namespace shasta {
+ inline uint16_t bitReversal(uint16_t);
+ inline uint32_t bitReversal(uint32_t);
+ inline uint64_t bitReversal(uint64_t);
+}
+
+
+
+inline uint16_t shasta::bitReversal(uint16_t x)
+{
+ const uint16_t m1 = uint16_t(0x5555);
+ const uint16_t m2 = uint16_t(0x3333);
+ const uint16_t m4 = uint16_t(0x0F0F);
+
+ x = ((x >> 1) & m1) | ((x & m1) << 1);
+ x = ((x >> 2) & m2) | ((x & m2) << 2);
+ x = ((x >> 4) & m4) | ((x & m4) << 4);
+ x = (x >> 8) | (x << 8);
+ return x;
+}
+
+
+
+inline uint32_t shasta::bitReversal(uint32_t x)
+{
+ x = ((x >> 1) & 0x55555555) | ((x & 0x55555555) << 1);
+ x = ((x >> 2) & 0x33333333) | ((x & 0x33333333) << 2);
+ x = ((x >> 4) & 0x0F0F0F0F) | ((x & 0x0F0F0F0F) << 4);
+ x = ((x >> 8) & 0x00FF00FF) | ((x & 0x00FF00FF) << 8);
+ x = ( x >> 16) | ( x << 16);
+ return x;
+}
+
+
+
+inline uint64_t shasta::bitReversal(uint64_t x)
+{
+ x = ((x >> 1) & 0x5555555555555555UL) | ((x & 0x5555555555555555UL) << 1 );
+ x = ((x >> 2) & 0x3333333333333333UL) | ((x & 0x3333333333333333UL) << 2 );
+ x = ((x >> 4) & 0x0F0F0F0F0F0F0F0FUL) | ((x & 0x0F0F0F0F0F0F0F0FUL) << 4 );
+ x = ((x >> 8) & 0x00FF00FF00FF00FFUL) | ((x & 0x00FF00FF00FF00FFUL) << 8 );
+ x = ((x >> 16) & 0x0000FFFF0000FFFFUL) | ((x & 0x0000FFFF0000FFFFUL) << 16);
+ x = (x >> 32) | (x << 32);
+ return x;
+}
+
+#endif
diff --git a/src/computeLayout.hpp b/src/computeLayout.hpp
index 90e3c84..7395ee7 100644
--- a/src/computeLayout.hpp
+++ b/src/computeLayout.hpp
@@ -79,6 +79,7 @@ namespace shasta {
const Graph&,
const std::map<typename Graph::edge_descriptor, double>& edgeLengthMap,
std::map<typename Graph::vertex_descriptor, array<double, 2> >& positionMap,
+ uint64_t quality,
double timeout);
}
@@ -214,6 +215,7 @@ template<class Graph> shasta::ComputeLayoutReturnCode shasta::computeLayoutCusto
const Graph& graph,
const std::map<typename Graph::edge_descriptor, double>& edgeLengthMap,
std::map<typename Graph::vertex_descriptor, array<double, 2> >& positionMap,
+ uint64_t quality,
double timeout)
{
using vertex_descriptor = typename Graph::vertex_descriptor;
@@ -251,7 +253,8 @@ template<class Graph> shasta::ComputeLayoutReturnCode shasta::computeLayoutCusto
// Invoke the custom graph layout program.
const string outputFileName = tmpDirectory() + uuid + "-output.txt";
- const string command = "customLayout -i " + inputFileName + " -o " + outputFileName ;
+ const string command = "customLayout -i " + inputFileName + " -o " + outputFileName +
+ " --quality " + to_string(quality);
bool timeoutTriggered = false;
bool signalOccurred = false;
int returnCode = 0;
diff --git a/src/copyNumber.hpp b/src/copyNumber.hpp
index f0d4ef2..9c33d07 100644
--- a/src/copyNumber.hpp
+++ b/src/copyNumber.hpp
@@ -2,7 +2,7 @@
#define SHASTA_COPY_NUMBER_HPP
#include "prefixLength.hpp"
-#include "span.hpp"
+#include "SHASTA_ASSERT.hpp"
#include "cstdint.hpp"
namespace shasta {
@@ -42,21 +42,6 @@ template<class Container> uint64_t shasta::isCopyNumberDifference(
}
SHASTA_ASSERT(nx < ny);
- // If the length difference is not a multiple of one of the allowed periods,
- // return 0.
- const uint64_t dn = ny - nx;
- bool found = false;
- for(uint64_t period=2; period<=maxPeriod; period++) {
- if((dn % period) == 0) {
- found = true;
- break;
- }
- }
- if(not found) {
- return 0;
- }
-
-
const uint64_t prefixLength = commonPrefixLength(x, y);
const uint64_t suffixLength = commonSuffixLength(x, y);
@@ -79,6 +64,7 @@ template<class Container> uint64_t shasta::isCopyNumberDifference(
// If getting here, x and y differ by an insertion in iy of range [iy, jy).
+ const uint64_t dn = ny - nx;
SHASTA_ASSERT(ix == jx);
SHASTA_ASSERT(jy - iy == dn);
@@ -86,7 +72,7 @@ template<class Container> uint64_t shasta::isCopyNumberDifference(
// Check for k base repeat.
// We kept the entire common prefix, so we can check just to the left of the insertion.
- for(uint64_t period=2; period<=maxPeriod; period++) {
+ for(uint64_t period=1; period<=maxPeriod; period++) {
if((dn % period) != 0) {
continue;
}
diff --git a/src/deduplicate.hpp b/src/deduplicate.hpp
index 932d1ae..b04a6a2 100644
--- a/src/deduplicate.hpp
+++ b/src/deduplicate.hpp
@@ -62,11 +62,108 @@ namespace shasta {
}
+
+ // Remove duplicate elements in a vector and count occurrences of each.
+ // Keep only the ones that occur at least minCount times.
+ template<class T, class Int> void deduplicateAndCountWithThreshold(
+ vector<T>& v,
+ vector<Int>& count,
+ Int minCount
+ )
+ {
+ // Clear the count vector.
+ count.clear();
+
+ // If the given vector is empty, return now.
+ if(v.empty()) {
+ return;
+ }
+
+ // Sort the vector.
+ sort(v.begin(), v.end());
+
+ // Add elements, keeping track of the number
+ // of occurrences of each.
+ typename vector<T>::iterator output = v.begin();
+ typename vector<T>::iterator input = v.begin();
+ while(input != v.end()) {
+
+
+ // Count how many there are.
+ typename vector<T>::iterator it = input;
+ while(it!=v.end() && *it==*input) {
+ ++it;
+ }
+ const Int n = Int(it - input);
+
+ if(n >= minCount) {
+
+ // Store this element.
+ *output = *input;
+ ++output;
+
+ // Store the count.
+ count.push_back(n);
+ }
+
+ // Update our input iterator.
+ input = it;
+
+ }
+ v.resize(count.size());
+ }
+
+
+
+ // Remove duplicate elements in a vector and count occurrences of each.
+ // Keep only the ones that occur exactly once.
+ template<class T> void deduplicateAndCountAndKeepUnique(
+ vector<T>& v)
+ {
+
+ // If the given vector is empty, return now.
+ if(v.empty()) {
+ return;
+ }
+
+ // Sort the vector.
+ sort(v.begin(), v.end());
+
+ // Add elements, keeping track of the number
+ // of occurrences of each.
+ typename vector<T>::iterator output = v.begin();
+ typename vector<T>::iterator input = v.begin();
+ while(input != v.end()) {
+
+
+ // Count how many there are.
+ typename vector<T>::iterator it = input;
+ while(it!=v.end() && *it==*input) {
+ ++it;
+ }
+ const uint64_t n = it - input;
+
+ if(n == 1) {
+
+ // Store this element.
+ *output = *input;
+ ++output;
+ }
+
+ // Update our input iterator.
+ input = it;
+
+ }
+ v.resize(output - v.begin());
+ }
+
+
+
inline void testDeduplicateAndCount()
{
vector<int> v = {7, 4, 5, 7, 4, 18, 2, 4};
vector<int> count;
- deduplicateAndCount(v, count);
+ deduplicateAndCountWithThreshold(v, count, 2);
SHASTA_ASSERT(v.size() == count.size());
for(uint64_t i=0; i<v.size(); i++) {
cout << v[i] << " " << count[i] << endl;
diff --git a/src/enumeratePaths.cpp b/src/enumeratePaths.cpp
new file mode 100644
index 0000000..1e7ca68
--- /dev/null
+++ b/src/enumeratePaths.cpp
@@ -0,0 +1,50 @@
+#include "enumeratePaths.hpp"
+using namespace shasta;
+
+#include <boost/graph/adjacency_list.hpp>
+
+#include "iostream.hpp"
+
+
+void shasta::testEnumeratePaths()
+{
+ using Graph = boost::adjacency_list<
+ boost::vecS,
+ boost::vecS,
+ boost::bidirectionalS>;
+ Graph graph(10);
+ using edge_descriptor = Graph::edge_descriptor;
+ using Path = vector<edge_descriptor>;
+
+ class PathInspector {
+ public:
+ const Graph& graph;
+ uint64_t length;
+ PathInspector(const Graph& graph, uint64_t length) : graph(graph), length(length) {}
+ void operator()(const Path& path)
+ {
+ if(path.size() == length) {
+ for(const edge_descriptor e: path) {
+ cout << source(e, graph) << "->" << target(e, graph) << " ";
+ }
+ cout << endl;
+ }
+ }
+ };
+ const uint64_t length = 4;
+ PathInspector pathInspector(graph, length);
+
+ add_edge(0, 1, graph);
+ add_edge(1, 2, graph);
+ add_edge(2, 3, graph);
+ add_edge(3, 4, graph);
+ add_edge(1, 5, graph);
+ add_edge(5, 6, graph);
+ add_edge(6, 7, graph);
+ add_edge(6, 3, graph);
+ add_edge(3, 8, graph);
+ add_edge(7, 9, graph);
+
+ enumeratePaths(graph, 0, length, pathInspector);
+
+}
diff --git a/src/enumeratePaths.hpp b/src/enumeratePaths.hpp
index beb14e5..4e77fa1 100644
--- a/src/enumeratePaths.hpp
+++ b/src/enumeratePaths.hpp
@@ -5,18 +5,62 @@
#include <boost/graph/iteration_macros.hpp>
#include "algorithm.hpp"
+#include "iostream.hpp"
#include <stack>
#include "tuple.hpp"
+#include "utility.hpp"
#include "vector.hpp"
namespace shasta {
-template<class G> void enumerateSelfAvoidingPaths(const G&,
- typename G::vertex_descriptor vA, typename G::vertex_descriptor vB,
- vector<vector<typename G::edge_descriptor> > &paths);
+ template<class G> void enumerateSelfAvoidingPaths(const G&,
+ typename G::vertex_descriptor vA, typename G::vertex_descriptor vB,
+ vector<vector<typename G::edge_descriptor> > &paths);
+
+ template<class G, class PathInspector> void enumeratePaths(
+ const G&,
+ typename G::vertex_descriptor v,
+ uint64_t pathLength,
+ PathInspector&);
+ template<class G, class PathInspector> void enumeratePathsRecursive(
+ const G&,
+ typename G::vertex_descriptor v,
+ uint64_t pathLength,
+ PathInspector&,
+ vector<typename G::edge_descriptor>& path);
+
+ // Same, but in the reverse direction (backward paths).
+ template<class G, class PathInspector> void enumeratePathsReverse(
+ const G&,
+ typename G::vertex_descriptor v,
+ uint64_t pathLength,
+ PathInspector&);
+ template<class G, class PathInspector> void enumeratePathsReverseRecursive(
+ const G&,
+ typename G::vertex_descriptor v,
+ uint64_t pathLength,
+ PathInspector&,
+ vector<typename G::edge_descriptor>& path);
+
+ // Similar to the above, but for paths of any length beginning at vA and ending at vB.
+ template<class G, class PathInspector> void enumeratePathsBetween(
+ const G&,
+ typename G::vertex_descriptor vA,
+ typename G::vertex_descriptor vB,
+ PathInspector&);
+ template<class G, class PathInspector> void enumeratePathsBetweenRecursive(
+ const G&,
+ typename G::vertex_descriptor vA,
+ typename G::vertex_descriptor vB,
+ PathInspector&,
+ vector<typename G::edge_descriptor>& path);
+
+ void testEnumeratePaths();
}
+
+
// Enumerate self-avoiding paths starting at v0 and ending at v1.
// Self-avoiding means that an edge cannot be used twice.
template<class G> void shasta::enumerateSelfAvoidingPaths(const G &g,
@@ -75,5 +119,101 @@ template<class G> void shasta::enumerateSelfAvoidingPaths(const G &g,
}
}
+
+
+// In a directed graph of type G,
+// enumerate all paths starting at v and with length (number of edges)
+// up to pathLength.
+// For each path found, apply the given function object by calling
+// functionObject(path), where path is a vector<G::edge_descriptor>
+template<class G, class PathInspector> void shasta::enumeratePaths(
+ const G& g,
+ typename G::vertex_descriptor v,
+ uint64_t maxPathLength,
+ PathInspector& pathInspector)
+{
+ vector<typename G::edge_descriptor> path;
+ enumeratePathsRecursive(g, v, maxPathLength, pathInspector, path);
+}
+template<class G, class PathInspector> void shasta::enumeratePathsRecursive(
+ const G& g,
+ typename G::vertex_descriptor v,
+ uint64_t maxPathLength,
+ PathInspector& pathInspector,
+ vector<typename G::edge_descriptor>& path)
+{
+ if(maxPathLength == 0) {
+ return;
+ }
+ BGL_FORALL_OUTEDGES_T(v, e, g, G) {
+ path.push_back(e);
+ pathInspector(path);
+ enumeratePathsRecursive(g, target(e, g), maxPathLength - 1, pathInspector, path);
+ path.pop_back();
+ }
+}
+
+
+
+template<class G, class PathInspector> void shasta::enumeratePathsReverse(
+ const G& g,
+ typename G::vertex_descriptor v,
+ uint64_t maxPathLength,
+ PathInspector& pathInspector)
+{
+ vector<typename G::edge_descriptor> path;
+ enumeratePathsReverseRecursive(g, v, maxPathLength, pathInspector, path);
+}
+template<class G, class PathInspector> void shasta::enumeratePathsReverseRecursive(
+ const G& g,
+ typename G::vertex_descriptor v,
+ uint64_t maxPathLength,
+ PathInspector& pathInspector,
+ vector<typename G::edge_descriptor>& path)
+{
+ if(maxPathLength == 0) {
+ return;
+ }
+ BGL_FORALL_INEDGES_T(v, e, g, G) {
+ path.push_back(e);
+ pathInspector(path);
+ enumeratePathsReverseRecursive(g, source(e, g), maxPathLength - 1, pathInspector, path);
+ path.pop_back();
+ }
+}
+
+
+// In a directed graph of type G,
+// enumerate all paths of any length starting at vA ending at vB.
+// For each path found, apply the given function object by calling
+// functionObject(path), where path is a vector<G::edge_descriptor>
+template<class G, class PathInspector> void shasta::enumeratePathsBetween(
+ const G& g,
+ typename G::vertex_descriptor vA,
+ typename G::vertex_descriptor vB,
+ PathInspector& pathInspector)
+{
+ vector<typename G::edge_descriptor> path;
+ enumeratePathsBetweenRecursive(g, vA, vB, pathInspector, path);
+}
+template<class G, class PathInspector> void shasta::enumeratePathsBetweenRecursive(
+ const G& g,
+ typename G::vertex_descriptor vA,
+ typename G::vertex_descriptor vB,
+ PathInspector& pathInspector,
+ vector<typename G::edge_descriptor>& path)
+{
+ BGL_FORALL_OUTEDGES_T(vA, e, g, G) {
+ path.push_back(e);
+ typename G::vertex_descriptor vC = target(e, g);
+ if(vC == vB) {
+ pathInspector(path);
+ } else {
+ enumeratePathsBetweenRecursive(g, vC, vB, pathInspector, path);
+ }
+ path.pop_back();
+ }
+}
+
#endif
diff --git a/src/findLinearChains.hpp b/src/findLinearChains.hpp
index 64fb35b..b260943 100644
--- a/src/findLinearChains.hpp
+++ b/src/findLinearChains.hpp
@@ -225,7 +225,15 @@ template<class Graph> void shasta::findLinearVertexChains(
// Check that all vertices were found.
- SHASTA_ASSERT(verticesFound.size() == num_vertices(graph));
+ // Just using num_vertices does not work if the graph is a filtered_graph.
+ // SHASTA_ASSERT(verticesFound.size() == num_vertices(graph));
+ uint64_t vertexCount = 0;
+ BGL_FORALL_VERTICES_T(v, graph, Graph) {
+ if(v != Graph::null_vertex()) { // Just to avoid compiler warning.
+ ++vertexCount;
+ }
+ }
+ SHASTA_ASSERT(verticesFound.size() == vertexCount);
}
diff --git a/src/globalMsa.cpp b/src/globalMsa.cpp
new file mode 100644
index 0000000..9831d75
--- /dev/null
+++ b/src/globalMsa.cpp
@@ -0,0 +1,471 @@
+// Shasta.
+#include "globalMsa.hpp"
+#include "Base.hpp"
+#include "deduplicate.hpp"
+#include "invalid.hpp"
+#include "orderPairs.hpp"
+#include "SHASTA_ASSERT.hpp"
+#include "ShortBaseSequence.hpp"
+
+// Spoa.
+#include "spoa/spoa.hpp"
+
+// Standard library.
+#include "algorithm.hpp"
+#include <map>
+#include "tuple.hpp"
+
+// See the comments in globalMsa.hpp.
+
+
+
+void shasta::globalMsa(
+ const vector< pair<vector<Base>, uint64_t> >& sequences,
+ uint64_t maxSpoaLength,
+ uint64_t kmerLength,
+ vector<Base>& consensus
+ )
+{
+ const bool debug = true;
+ if(debug) {
+ cout << "globalMsa called with " << sequences.size() << " sequences with (length,weight):" << endl;
+ uint64_t totalWeight = 0;
+ for(const auto& p: sequences) {
+ cout << "(" << p.first.size() << "," << p.second << ") ";
+ totalWeight += p.second;
+ }
+ cout << endl;
+ cout << "Total weight is " << totalWeight << endl;
+ }
+
+ // Sanity check.
+ SHASTA_ASSERT(not sequences.empty());
+
+ using Kmer = ShortBaseSequence64;
+ SHASTA_ASSERT(kmerLength <= Kmer::capacity);
+
+ // Trivial case.
+ if(sequences.size() == 1) {
+ consensus = sequences.front().first;
+ return;
+ }
+
+ // Compute the maximum length of the input sequences.
+ uint64_t maxLength = 0;
+ for(const auto& p: sequences) {
+ maxLength = max(maxLength, p.first.size());
+ }
+
+ // If short enough, use spoa.
+ if(maxLength <= maxSpoaLength) {
+ if(debug) {
+ cout << "Using spoa." << endl;
+ }
+ globalMsaSpoa(sequences, consensus);
+ return;
+ }
+
+
+
+ // Create a table of unique k-mers for each of the sequences.
+ class KmerInfo {
+ public:
+ Kmer kmer;
+ uint64_t position;
+ bool operator<(const KmerInfo& that) const
+ {
+ return kmer.data < that.kmer.data;
+ }
+ bool operator==(const KmerInfo& that) const
+ {
+ return kmer.data == that.kmer.data;
+ }
+ };
+ vector< vector<KmerInfo> > kmerTable1(sequences.size());
+
+ for(uint64_t i=0; i<sequences.size(); i++) {
+ const vector<Base>& sequence = sequences[i].first;
+ if(false) {
+ cout << "Finding unique k-mers for sequence of length " << sequence.size() << endl;
+ }
+ vector<KmerInfo>& kmerInfos = kmerTable1[i];
+
+ Kmer kmer;
+ for(uint64_t position=0; position<kmerLength; position++) {
+ kmer.set(position, sequence[position]);
+ }
+
+ for(uint64_t position=0; /* Check later */; position++) {
+ kmerInfos.push_back({kmer, position});
+
+ if(position + kmerLength == sequence.size()) {
+ break;
+ }
+
+ // Update the k-mer.
+ kmer.shiftLeft();
+ kmer.set(kmerLength - 1, sequence[position + kmerLength]);
+ }
+ SHASTA_ASSERT(kmerInfos.size() == sequence.size() - kmerLength + 1);
+
+ // Only keep the k-mers that appear once.
+ if(false) {
+ cout << kmerInfos.size() << " total kmers." << endl;
+ }
+ deduplicateAndCountAndKeepUnique(kmerInfos);
+ if(false) {
+ cout << kmerInfos.size() << " unique kmers." << endl;
+ }
+ }
+
+
+
+ // Create a global table of unique k-mers in all the sequences.
+ class KmerData {
+ public:
+ Kmer kmer;
+ uint64_t sequenceIndex;
+ uint64_t position;
+ bool operator<(const KmerData& that) const
+ {
+ return tie(kmer.data, sequenceIndex) < tie(that.kmer.data, that.sequenceIndex);
+ }
+ };
+ vector<KmerData> kmerTable2;
+ for(uint64_t sequenceIndex=0; sequenceIndex<sequences.size(); sequenceIndex++) {
+ const vector<KmerInfo>& kmerInfos = kmerTable1[sequenceIndex];
+ for(const KmerInfo& kmerInfo: kmerInfos) {
+ kmerTable2.push_back({kmerInfo.kmer, sequenceIndex, kmerInfo.position});
+ }
+ }
+ sort(kmerTable2.begin(), kmerTable2.end());
+
+
+
+ // Now construct a third table that for each unique k-mer
+ // gives the sequence indexes and positions the k-mer appears in.
+ class UniqueKmerInfo {
+ public:
+ Kmer kmer;
+ uint64_t totalWeight = 0;
+ uint64_t minDistanceFromEnds = invalid<uint64_t>;
+ class Occurrence {
+ public:
+ uint64_t sequenceIndex;
+ uint64_t position;
+ };
+ vector<Occurrence> occurrences;
+ bool operator<(const UniqueKmerInfo& that) const
+ {
+ return tie(totalWeight, minDistanceFromEnds) > tie(that.totalWeight, that.minDistanceFromEnds);
+ }
+ void write(ostream& s, uint64_t kmerLength) const
+ {
+ kmer.write(s, kmerLength);
+ s << " " << totalWeight;
+ s << " " << minDistanceFromEnds;
+ for(const auto& occurrence: occurrences) {
+ s << " (" << occurrence.sequenceIndex << "," <<
+ occurrence.position << ")";
+ }
+ s << endl;
+ }
+ };
+ vector<UniqueKmerInfo> kmerTable3;
+ for(auto it=kmerTable2.begin(); it!= kmerTable2.end(); /* Increment later */) {
+ const Kmer kmer = it->kmer;
+
+ // Find the end of the streak for the same kmer.
+ auto jt = it;
+ while(true) {
+ if(jt == kmerTable2.end()) {
+ break;
+ }
+ if(jt->kmer != kmer) {
+ break;
+ }
+ ++jt;
+ }
+
+ // Store this streak in kmerTable3.
+ UniqueKmerInfo uniqueKmerInfo;
+ uniqueKmerInfo.kmer = kmer;
+ for(; it!=jt; it++) {
+ const uint64_t sequenceIndex = it->sequenceIndex;
+ const uint64_t sequenceLength = sequences[sequenceIndex].first.size();
+ const uint64_t position = it->position;
+ const uint64_t distanceFromLeft = position;
+ const uint64_t distanceFromRight = sequenceLength - kmerLength - position;
+ const uint64_t distanceFromEnds = min(distanceFromLeft, distanceFromRight);
+ uniqueKmerInfo.occurrences.push_back({sequenceIndex, it->position});
+ uniqueKmerInfo.totalWeight += sequences[it->sequenceIndex].second;
+ uniqueKmerInfo.minDistanceFromEnds = min(uniqueKmerInfo.minDistanceFromEnds, distanceFromEnds);
+ }
+ kmerTable3.push_back(uniqueKmerInfo);
+ }
+ sort(kmerTable3.begin(), kmerTable3.end());
+
+
+ if(false) {
+ for(const auto& uniqueKmerInfo: kmerTable3) {
+ uniqueKmerInfo.write(cout, kmerLength);
+ }
+ }
+
+ // The first entry in kmerTable3 gives the optimal splitting kmer,
+ // the sequences involved (all of them, in most cases),
+ // and the position of the splitting k-mer in each of the sequences.
+ SHASTA_ASSERT(not kmerTable3.empty());
+ const UniqueKmerInfo& optimalSplitting = kmerTable3.front();
+ if(debug) {
+ cout << "Splitting at ";
+ optimalSplitting.write(cout, kmerLength);
+ }
+
+
+ // Prepare the sequences for the left and right MSA.
+ vector< pair<vector<Base>, uint64_t> > leftSequences;
+ vector< pair<vector<Base>, uint64_t> > rightSequences;
+ vector<Base> leftConsensus;
+ vector<Base> rightConsensus;
+ for(const auto& occurrence: optimalSplitting.occurrences) {
+ const uint64_t sequenceIndex = occurrence.sequenceIndex;
+ const auto& p = sequences[sequenceIndex];
+ const vector<Base>& sequence = p.first;
+ const uint64_t weight = p.second;
+ const uint64_t position = occurrence.position;
+ leftSequences.push_back(make_pair(vector<Base>(), weight));
+ rightSequences.push_back(make_pair(vector<Base>(), weight));
+ vector<Base>& leftSequence = leftSequences.back().first;
+ vector<Base>& rightSequence = rightSequences.back().first;
+ copy(sequence.begin(), sequence.begin() + position,
+ back_inserter(leftSequence));
+ copy(sequence.begin() + position + kmerLength, sequence.end(),
+ back_inserter(rightSequence));
+ }
+
+ // Recursive call to do the left and right MSA.
+ globalMsa(leftSequences , maxSpoaLength, kmerLength, leftConsensus);
+ globalMsa(rightSequences, maxSpoaLength, kmerLength, rightConsensus);
+
+ // Now stitch the pieces together.
+ consensus = leftConsensus;
+ for(uint64_t position=0; position<kmerLength; position++) {
+ consensus.push_back(optimalSplitting.kmer[position]);
+ }
+ copy(rightConsensus.begin(), rightConsensus.end(),
+ back_inserter(consensus));
+}
+
+
+
+// This just uses spoa.
+// It cannot be used for very long sequences due to quadratic
+// memory and time. Practical limit is a few thousand bases.
+// Computes a weighted, coverage-based consensus of the input sequences.
+void shasta::globalMsaSpoa(
+    const vector< pair<vector<Base>, uint64_t> >& sequences,
+    vector<Base>& consensus
+    )
+{
+    // Sanity check.
+    SHASTA_ASSERT(not sequences.empty());
+
+    // Trivial case: a single sequence is its own consensus.
+    if(sequences.size() == 1) {
+        consensus = sequences.front().first;
+        return;
+    }
+
+    // We want to enter the sequences in order of decreasing weight.
+    // Create a table of pairs (sequenceIndex, weight)
+    // where sequenceIndex is the index in the sequences vector.
+    // Then sort by decreasing weight.
+    vector< pair<uint64_t, uint64_t> > sequencesTable;
+    for(uint64_t sequenceIndex=0; sequenceIndex<sequences.size(); sequenceIndex++) {
+        const auto& p = sequences[sequenceIndex];
+        const uint64_t weight = p.second;
+        sequencesTable.push_back(make_pair(sequenceIndex, weight));
+    }
+    sort(sequencesTable.begin(), sequencesTable.end(),
+        OrderPairsBySecondOnlyGreater<uint64_t, uint64_t>());
+
+    // Create the spoa alignment engine and alignment graph.
+    const spoa::AlignmentType alignmentType = spoa::AlignmentType::kNW;
+    const int8_t match = 1;
+    const int8_t mismatch = -1;
+    const int8_t gap = -1;
+    auto spoaAlignmentEngine = spoa::AlignmentEngine::Create(alignmentType, match, mismatch, gap);
+    spoa::Graph spoaAlignmentGraph;
+
+    // Add the sequences to the MSA in order of decreasing weight.
+    string sequenceString;
+    for(uint64_t indexByWeight=0; indexByWeight<sequencesTable.size(); indexByWeight++) {
+        const auto& p = sequencesTable[indexByWeight];
+        const uint64_t sequenceIndex = p.first;
+        const uint64_t weight = p.second;
+        const auto& q = sequences[sequenceIndex];
+        SHASTA_ASSERT(q.second == weight);
+        const vector<Base>& sequence = q.first;
+
+        sequenceString.clear();
+        for(const Base base: sequence) {
+            sequenceString += base.character();
+        }
+        auto alignment = spoaAlignmentEngine->Align(sequenceString, spoaAlignmentGraph);
+        spoaAlignmentGraph.AddAlignment(alignment, sequenceString, uint32_t(weight));
+    }
+
+    // Get the MSA alignment.
+    // The false argument means that no additional row containing the
+    // spoa consensus is appended to the alignment.
+    vector<string> alignment = spoaAlignmentGraph.GenerateMultipleSequenceAlignment(false);
+    SHASTA_ASSERT(alignment.size() == sequencesTable.size());
+
+    // Compute coverage at each alignment position for each of the 5 AlignedBases.
+    // Coverage is weighted: each sequence contributes its weight, not 1.
+    const uint64_t alignmentLength = alignment.front().size();
+    vector< array<uint64_t, 5> > coverage(alignmentLength, {0, 0, 0, 0, 0});
+    for(uint64_t indexByWeight=0; indexByWeight<sequencesTable.size(); indexByWeight++) {
+        const string& alignmentRow = alignment[indexByWeight];
+        SHASTA_ASSERT(alignmentRow.size() == alignmentLength);
+        for(uint64_t position=0; position<alignmentLength; position++) {
+            const AlignedBase b = AlignedBase::fromCharacter(alignmentRow[position]);
+            coverage[position][b.value] += sequencesTable[indexByWeight].second;
+        }
+    }
+
+    // Compute coverage-based consensus at each alignment position.
+    // Ties are broken in favor of the AlignedBase with the lowest value
+    // (std::max_element returns the first maximum).
+    vector<AlignedBase> alignedConsensus;
+    for(const auto& c: coverage) {
+        const uint64_t iBase = std::max_element(c.begin(), c.end()) - c.begin();
+        alignedConsensus.push_back(AlignedBase::fromInteger(iBase));
+    }
+    SHASTA_ASSERT(alignedConsensus.size() == alignmentLength);
+
+    // Take out the gaps to obtain the final ungapped consensus.
+    consensus.clear();
+    for(const AlignedBase b: alignedConsensus) {
+        if(not b.isGap()) {
+            consensus.push_back(Base(b));
+        }
+    }
+}
+
+
+
+// This just uses spoa.
+// It cannot be used for very long sequences due to quadratic
+// memory and time. Practical limit is a few thousand bases.
+// Version that returns the alignment.
+// THE SEQUENCES MUST BE PASSED IN ORDER OF DECREASING WEIGHT.
+void shasta::globalMsaSpoa(
+    const vector< pair<vector<Base>, uint64_t> >& sequences,
+    vector< vector<AlignedBase> >& alignmentArgument
+    )
+{
+    // Sanity check.
+    SHASTA_ASSERT(not sequences.empty());
+
+    // Check that the sequences are ordered by decreasing weight.
+    for(uint64_t i=1; i<sequences.size(); i++) {
+        SHASTA_ASSERT(sequences[i-1].second >= sequences[i].second);
+    }
+
+    // Create the spoa alignment engine and alignment graph.
+    const spoa::AlignmentType alignmentType = spoa::AlignmentType::kNW;
+    const int8_t match = 1;
+    const int8_t mismatch = -1;
+    const int8_t gap = -1;
+    auto spoaAlignmentEngine = spoa::AlignmentEngine::Create(alignmentType, match, mismatch, gap);
+    spoa::Graph spoaAlignmentGraph;
+
+    // Add the sequences to the MSA in order of decreasing weight.
+    string sequenceString;
+    for(uint64_t i=0; i<sequences.size(); i++) {
+        const auto& p = sequences[i];
+        const vector<Base>& sequence = p.first;
+        const uint64_t weight = p.second;
+
+        sequenceString.clear();
+        for(const Base base: sequence) {
+            sequenceString += base.character();
+        }
+        auto alignment = spoaAlignmentEngine->Align(sequenceString, spoaAlignmentGraph);
+        spoaAlignmentGraph.AddAlignment(alignment, sequenceString, uint32_t(weight));
+    }
+
+    // Get the MSA alignment.
+    // The false argument means that no additional row containing the
+    // spoa consensus is appended to the alignment.
+    vector<string> alignment = spoaAlignmentGraph.GenerateMultipleSequenceAlignment(false);
+    SHASTA_ASSERT(alignment.size() == sequences.size());
+
+    // Copy it to alignmentArgument, converting each row from a string
+    // of characters to a vector of AlignedBase.
+    alignmentArgument.clear();
+    alignmentArgument.resize(alignment.size());
+    for(uint64_t i=0 ; i<alignment.size(); i++) {
+        const string& alignmentRow = alignment[i];
+        vector<AlignedBase>& alignmentArgumentRow = alignmentArgument[i];
+        alignmentArgumentRow.resize(alignmentRow.size());
+        for(uint64_t j=0; j<alignmentRow.size(); j++) {
+            alignmentArgumentRow[j] = AlignedBase::fromCharacter(alignmentRow[j]);
+        }
+    }
+
+}
+
+
+
+// Version that enforces a maximum MSA length and returns false if it is exceeded.
+// Note that the limit is checked against the lengths of the input sequences,
+// not against the length of the resulting MSA.
+bool shasta::globalMsaSpoa(
+    const vector< pair<vector<Base>, uint64_t> >& sequences,
+    vector<Base>& consensus,
+    uint64_t maximumMsaLength
+    )
+{
+    // Check the input sequence lengths, unless the MSA is trivial.
+    if(sequences.size() > 1) {
+        uint64_t maxLength = 0;
+        for(const auto& sequence: sequences) {
+            maxLength = max(maxLength, sequence.first.size());
+        }
+        if(maxLength > maximumMsaLength) {
+            return false;
+        }
+    }
+
+    // If getting here, the longest input sequence is no longer than the
+    // specified maximum length (or the MSA is trivial, consisting of just one
+    // sequence). The MSA itself can still be somewhat longer due to gaps.
+    globalMsaSpoa(sequences, consensus);
+    return true;
+}
+
+
+
+// Python-callable version.
+// Takes the input sequences as strings of base characters with their weights,
+// and returns the consensus sequence as a string.
+std::string shasta::globalMsaPython(
+    const vector< pair<string, uint64_t> >& sequenceStrings,
+    uint64_t maxSpoaLength,
+    uint64_t kmerLength)
+{
+    // Convert the input strings to vectors of Base, preserving the weights.
+    vector< pair<vector<Base>, uint64_t> > sequences;
+    sequences.reserve(sequenceStrings.size());
+    for(const auto& p: sequenceStrings) {
+        sequences.resize(sequences.size() + 1);
+        sequences.back().second = p.second;
+        const string& sequenceString = p.first;
+        vector<Base>& sequence = sequences.back().first;
+        for(const char c: sequenceString) {
+            sequence.push_back(Base::fromCharacter(c));
+        }
+    }
+
+    // Do the MSA.
+    vector<Base> consensus;
+    globalMsa(sequences, maxSpoaLength, kmerLength, consensus);
+
+    // Construct the consensus string and return it.
+    string consensusString;
+    for(const Base b: consensus) {
+        consensusString.push_back(b.character());
+    }
+    return consensusString;
+}
+
diff --git a/src/globalMsa.hpp b/src/globalMsa.hpp
new file mode 100644
index 0000000..700bf03
--- /dev/null
+++ b/src/globalMsa.hpp
@@ -0,0 +1,62 @@
+#ifndef SHASTA_GLOBAL_MSA_HPP
+#define SHASTA_GLOBAL_MSA_HPP
+
+/*******************************************************************************
+
+Global multiple sequence alignment.
+Global means constrained on both sides, aka Needleman–Wunsch.
+
+This supports sequences of arbitrary length.
+If all the sequences are at most maxSpoaLength long,
+this invokes spoa.
+
+Otherwise it finds a k-mer of length kmerLength common to the input sequences
+and splits the MSA at that location, invoking itself recursively
+to solve the two MSAs (left and right of the k-mer).
+
+Each of the input sequences is passed in as a pair.
+The second member of the pair is the "weight" of the sequence
+(that is, typically the number of reads with that sequence).
+
+*******************************************************************************/
+
+#include "cstdint.hpp"
+#include "utility.hpp"
+#include "string.hpp"
+#include "vector.hpp"
+
+namespace shasta {
+
+    class Base;
+    class AlignedBase;
+
+    // Recursive MSA that supports sequences of arbitrary length (see above).
+    void globalMsa(
+        const vector< pair<vector<Base>, uint64_t> >& sequences,
+        uint64_t maxSpoaLength,
+        uint64_t kmerLength,
+        vector<Base>& consensus
+        );
+
+    // Direct spoa MSA. Practical limit is a few thousand bases.
+    void globalMsaSpoa(
+        const vector< pair<vector<Base>, uint64_t> >& sequences,
+        vector<Base>& consensus
+        );
+    // Version that enforces a maximum length and returns false if exceeded.
+    bool globalMsaSpoa(
+        const vector< pair<vector<Base>, uint64_t> >& sequences,
+        vector<Base>& consensus,
+        uint64_t maximumMsaLength
+        );
+    // Version that returns the alignment instead of the consensus.
+    // The sequences must be passed in order of decreasing weight.
+    void globalMsaSpoa(
+        const vector< pair<vector<Base>, uint64_t> >& sequences,
+        vector< vector<AlignedBase> >& alignment
+        );
+
+    // Python-callable version.
+    string globalMsaPython(
+        const vector< pair<string, uint64_t> >& sequenceStrings,
+        uint64_t maxSpoaLength,
+        uint64_t kmerLength
+        );
+}
+
+#endif
diff --git a/src/html.cpp b/src/html.cpp
index c30c98e..83aa323 100644
--- a/src/html.cpp
+++ b/src/html.cpp
@@ -176,6 +176,7 @@ function zoomSvg(factor)
y = yCenter - 0.5 * height;
svg.setAttribute('viewBox', `${x} ${y} ${width} ${height}`);
+ svg.setAttribute('font-size', svg.getAttribute('font-size') / factor);
return false;
}
@@ -185,3 +186,10 @@ function zoomSvg(factor)
}
+
+
+// Write an information icon (&#9432;) that displays the given message
+// as a tooltip on hover.
+// NOTE(review): the message is inserted verbatim inside a double-quoted
+// HTML title attribute, so it must not contain double quote characters.
+void shasta::writeInformationIcon(ostream& html, const string& message)
+{
+    html << "<span style='color:Blue;font-weight:bold' title=\"" <<
+        message << "\">&#9432;</span>";
+}
diff --git a/src/html.hpp b/src/html.hpp
index 9702b86..aa90bdc 100644
--- a/src/html.hpp
+++ b/src/html.hpp
@@ -13,6 +13,8 @@ namespace shasta {
void writeStyle(ostream&);
void addSvgDragAndZoom(ostream& html);
+
+ void writeInformationIcon(ostream& html, const string& message);
}
#endif
diff --git a/src/invalid.hpp b/src/invalid.hpp
index 32bb79a..2f14c27 100644
--- a/src/invalid.hpp
+++ b/src/invalid.hpp
@@ -1,14 +1,14 @@
#ifndef SHASTA_INVALID_HPP
#define SHASTA_INVALID_HPP
-// In many contexts, we use invalid<uint64_t> (or similar for other integer types)
+// In many contexts, we use invalid<T>
// to indicate a value that is invalid, uninitialized, or unknown.
-#include <concepts>
#include <numeric>
namespace shasta {
- template<std::integral Int> static const Int invalid = std::numeric_limits<Int>::max();
+ template<class T> static const T invalid = std::numeric_limits<T>::max();
+ template<class T> static const T unlimited = std::numeric_limits<T>::max();
}
#endif
diff --git a/src/localTransitiveReduction.hpp b/src/localTransitiveReduction.hpp
new file mode 100644
index 0000000..b4eb07d
--- /dev/null
+++ b/src/localTransitiveReduction.hpp
@@ -0,0 +1,115 @@
+#ifndef SHASTA_LOCAL_TRANSITIVE_REDUCTION_HPP
+#define SHASTA_LOCAL_TRANSITIVE_REDUCTION_HPP
+
+// Boost libraries.
+#include <boost/graph/adjacency_list.hpp>
+#include <boost/graph/iteration_macros.hpp>
+
+// Standard library.
+#include <queue>
+#include "vector.hpp"
+
+namespace shasta {
+ template<class Graph> void localTransitiveReduction(
+ const Graph&,
+ uint64_t maxPathLength,
+ vector<typename Graph::edge_descriptor>& nonTransitiveReductionEdges);
+}
+
+
+
+// For each directed edge v0->v1, look for a path that:
+// - Starts at v0.
+// - Ends at v1.
+// - Has length at most maxPathLength.
+// - Does not use edge v0->v1.
+// If such a path is found, the edge descriptor is added to nonTransitiveReductionEdges.
+// The edges that are not in nonTransitiveReductionEdges form a sort of "local transitive reduction"
+// of the graph.
+template<class Graph> void shasta::localTransitiveReduction(
+    const Graph& graph,
+    uint64_t maxPathLength,
+    vector<typename Graph::edge_descriptor>& nonTransitiveReductionEdges)
+{
+
+    using namespace boost;
+    using vertex_descriptor = typename Graph::vertex_descriptor;
+    // using edge_descriptor = typename Graph::edge_descriptor;
+
+    // Check the Graph type.
+    // NOTE(review): the static_assert message says "shasta::transitiveReduction"
+    // but this function is localTransitiveReduction.
+    static_assert(
+        std::is_same<typename Graph::directed_selector, directedS>::value
+        or
+        std::is_same<typename Graph::directed_selector, bidirectionalS>::value,
+        "shasta::transitiveReduction requires an adjacency_list "
+        "with the third template argument set to boost::directedS or boost::bidirectionalS.");
+
+    // Loop over all edges v0->v1.
+    nonTransitiveReductionEdges.clear();
+    BGL_FORALL_EDGES_T(e01, graph, Graph) {
+        const vertex_descriptor v0 = source(e01, graph);
+        const vertex_descriptor v1 = target(e01, graph);
+
+        // Do a BFS starting at v0, up to a distance maxPathLength.
+        // Stop if we encounter v1.
+
+        // The BFS queue.
+        std::queue<vertex_descriptor> q;
+        q.push(v0);
+
+        // The vertices we encountered so far, with their distance from v0.
+        std::map<vertex_descriptor, uint64_t> m;
+        m.insert({v0, 0});
+
+        // BFS loop.
+        // cout << "BFS loop begins for " << v0 << "->" << v1 << endl;
+        while(not q.empty()) {
+
+            // Dequeue a vertex.
+            const vertex_descriptor vA = q.front();
+            q.pop();
+            const auto itA = m.find(vA);
+            SHASTA_ASSERT(itA != m.end());
+            const uint64_t distanceA = itA->second;
+            const uint64_t distanceB = distanceA + 1;
+            // cout << "Dequeued " << vA << " at distance " << distanceA << endl;
+
+            // Loop over the out-edges of vA.
+            bool endBfs = false;
+            BGL_FORALL_OUTEDGES_T(vA, eAB, graph, Graph) {
+
+                // Don't use e01 in the BFS.
+                if(eAB == e01) {
+                    continue;
+                }
+
+                // If we reached v1, mark e01 as a nonTransitiveReduction edge
+                // and stop the BFS.
+                const vertex_descriptor vB = target(eAB, graph);
+                if(vB == v1) {
+                    nonTransitiveReductionEdges.push_back(e01);
+                    endBfs = true;
+                    // cout << "Reached " << v1 << endl;
+                    break;
+                }
+
+                // If we already reached this vertex, do nothing.
+                if(m.contains(vB)) {
+                    continue;
+                }
+
+                // If not at maximum distance, enqueue vB.
+                if(distanceB < maxPathLength) {
+                    q.push(vB);
+                    m.insert({vB, distanceB});
+                    // cout << "Enqueued " << vB << " at distance " << distanceB << endl;
+                }
+            }
+            if(endBfs) {
+                break;
+            }
+        }
+    }
+}
+
+#endif
diff --git a/src/longestPath.cpp b/src/longestPath.cpp
new file mode 100644
index 0000000..ce66d74
--- /dev/null
+++ b/src/longestPath.cpp
@@ -0,0 +1,23 @@
+#include "longestPath.hpp"
+#include "iostream.hpp"
+using namespace shasta;
+
+// Simple sanity test for longestPath.
+// The test graph has edges 0->1, 1->2, 2->3, 4->1, 2->5, 6->4.
+// The expected output is "6 4 1 2 3" (the tie with the equally long
+// path 6 4 1 2 5 is broken in favor of the lower vertex index).
+void shasta::testLongestPath()
+{
+    using Graph = boost::adjacency_list<boost::listS, boost::vecS, boost::bidirectionalS>;
+    Graph graph(7);
+    add_edge(0, 1, graph);
+    add_edge(1, 2, graph);
+    add_edge(2, 3, graph);
+    add_edge(4, 1, graph);
+    add_edge(2, 5, graph);
+    add_edge(6, 4, graph);
+
+    vector<Graph::vertex_descriptor> longestPath;
+    shasta::longestPath(graph, longestPath);
+
+    for(const auto v: longestPath) {
+        cout << v << " ";
+    }
+    cout << endl;
+}
diff --git a/src/longestPath.hpp b/src/longestPath.hpp
new file mode 100644
index 0000000..0f92fa7
--- /dev/null
+++ b/src/longestPath.hpp
@@ -0,0 +1,114 @@
+#ifndef SHASTA_LONGEST_PATH_HPP
+#define SHASTA_LONGEST_PATH_HPP
+
+// Boost libraries.
+#include <boost/graph/adjacency_list.hpp>
+#include <boost/graph/iteration_macros.hpp>
+#include <boost/graph/topological_sort.hpp>
+
+// Standard library.
+#include "algorithm.hpp"
+#include <map>
+#include "utility.hpp"
+#include "vector.hpp"
+
+namespace shasta {
+ template<class Graph> void longestPath(
+ const Graph &graph,
+ vector<typename Graph::vertex_descriptor>& longestPath);
+ void testLongestPath();
+}
+
+
+
+// Find the longest path in a directed graph without cycles.
+// Class Graph must be a boost::adjacency_list with
+// the first three template arguments set to <listS, vecS, bidirectionalS>.
+// If the graph has cycles, this throws boost::not_a_dag.
+// This uses the algorithm described here:
+// https://en.wikipedia.org/wiki/Longest_path_problem#Acyclic_graphs
+// NOTE(review): behavior on an empty graph is unclear - v would remain
+// null_vertex and be pushed onto the path; confirm callers never pass one.
+template<class Graph> void shasta::longestPath(
+    const Graph &graph,
+    vector<typename Graph::vertex_descriptor>& longestPath)
+{
+    using namespace boost;
+    using vertex_descriptor = typename Graph::vertex_descriptor;
+    // using edge_descriptor = typename Graph::edge_descriptor;
+    // using edge_iterator = typename Graph::edge_iterator;
+
+    // Check the Graph type.
+    // Use C++20 concepts instead.
+    // NOTE(review): the static_assert messages say "shasta::transitiveReduction"
+    // but this function is longestPath.
+    static_assert(
+        std::is_same<typename Graph::out_edge_list_selector, listS>::value,
+        "shasta::transitiveReduction requires an adjacency_list "
+        "with the first template argument set to boost::listS.");
+    static_assert(
+        std::is_same<typename Graph::vertex_list_selector, vecS>::value,
+        "shasta::transitiveReduction requires an adjacency_list "
+        "with the second template argument set to boost::vecS.");
+    static_assert(
+        std::is_same<typename Graph::directed_selector, bidirectionalS>::value,
+        "shasta::transitiveReduction requires an adjacency_list "
+        "with the third template argument set to boost::bidirectionalS.");
+
+    // Use boost topological_sort to get a vector of vertex descriptors
+    // in topological order. The output from the boost call is in
+    // reverse topological order.
+    vector<vertex_descriptor> sortedVertices;
+    topological_sort(graph, back_inserter(sortedVertices));
+    std::reverse(sortedVertices.begin(), sortedVertices.end());
+
+    // Map to contain the length of the longest path ending at each vertex.
+    // Here, length counts vertices, not edges.
+    std::map<vertex_descriptor, uint64_t> lengthMap;
+    BGL_FORALL_VERTICES_T(v, graph, Graph) {
+        lengthMap.insert(make_pair(v, 0));
+    }
+
+    // Compute the maximum length of a path ending at each vertex.
+    // Vertices are processed in topological order, so all predecessors
+    // of v already have their final length when v is processed.
+    for(const vertex_descriptor v: sortedVertices) {
+        uint64_t maximumLength = 0;
+        BGL_FORALL_INEDGES_T(v, e, graph, Graph) {
+            maximumLength = max(maximumLength, lengthMap[source(e, graph)]);
+        }
+        lengthMap[v] = maximumLength + 1;
+    }
+
+    // Find the vertex with the longest length.
+    // This will be the end of the longest path.
+    // Ties are broken in favor of the vertex encountered first in map order.
+    vertex_descriptor v = Graph::null_vertex();
+    uint64_t maximumLength = 0;
+    for(const auto& p: lengthMap) {
+        if(p.second > maximumLength) {
+            v = p.first;
+            maximumLength = p.second;
+        }
+    }
+
+    // Construct the path, moving backward from here.
+    // At each step, follow the in-edge coming from the predecessor
+    // with the greatest path length.
+    longestPath.clear();
+    longestPath.push_back(v);
+    while(true) {
+        vertex_descriptor vPrevious = Graph::null_vertex();
+        uint64_t maximumLength = 0;
+        BGL_FORALL_INEDGES_T(v, e, graph, Graph) {
+            const vertex_descriptor v0 = source(e, graph);
+            const uint64_t length = lengthMap[v0];
+            if(length > maximumLength) {
+                vPrevious = v0;
+                maximumLength = length;
+            }
+        }
+        if(vPrevious == Graph::null_vertex()) {
+            break;
+        }
+        v = vPrevious;
+        longestPath.push_back(v);
+
+    }
+    std::reverse(longestPath.begin(), longestPath.end());
+
+}
+
+
+
+#endif
diff --git a/src/markerAccessFunctions.cpp b/src/markerAccessFunctions.cpp
new file mode 100644
index 0000000..7fff818
--- /dev/null
+++ b/src/markerAccessFunctions.cpp
@@ -0,0 +1,86 @@
+#include "markerAccessFunctions.hpp"
+#include "extractKmer.hpp"
+#include "Marker.hpp"
+#include "Reads.hpp"
+using namespace shasta;
+
+
+
+// Get the marker Kmer for an oriented read and ordinal,
+// dispatching to the appropriate strand-specific function.
+Kmer shasta::getOrientedReadMarkerKmer(
+    OrientedReadId orientedReadId,
+    uint32_t ordinal,
+    uint64_t k,
+    const Reads& reads,
+    const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers
+    )
+{
+    const ReadId readId = orientedReadId.getReadId();
+    const Strand strand = orientedReadId.getStrand();
+
+    if(strand == 0) {
+        return getOrientedReadMarkerKmerStrand0(readId, ordinal, k, reads, markers);
+    } else {
+        return getOrientedReadMarkerKmerStrand1(readId, ordinal, k, reads, markers);
+    }
+
+}
+
+
+
+// Get the marker Kmer at the given ordinal for a read on strand 0.
+// The k-mer is extracted directly from the stored read at the marker position.
+Kmer shasta::getOrientedReadMarkerKmerStrand0(
+    ReadId readId,
+    uint32_t ordinal0,
+    uint64_t k,
+    const Reads& reads,
+    const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers
+    )
+{
+    const auto read = reads.getRead(uint32_t(readId));
+    const OrientedReadId orientedReadId0(readId, 0);
+    const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()];
+
+    Kmer kmer0;
+    extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0);
+
+    return kmer0;
+}
+
+
+
+// Get the marker Kmer at the given ordinal for a read on strand 1.
+// Marker ordinals on strand 1 run in the opposite direction, so ordinal1
+// maps to ordinal0 = readMarkerCount - 1 - ordinal1 on strand 0.
+Kmer shasta::getOrientedReadMarkerKmerStrand1(
+    ReadId readId,
+    uint32_t ordinal1,
+    uint64_t k,
+    const Reads& reads,
+    const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers
+    )
+{
+
+    // We only have the read stored without reverse complement, so get it from there...
+    const auto read = reads.getRead(uint32_t(readId));
+    const OrientedReadId orientedReadId0(readId, 0);
+    const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()];
+    const uint64_t readMarkerCount = orientedReadMarkers0.size();
+    const uint64_t ordinal0 = readMarkerCount - 1 - ordinal1;
+    Kmer kmer0;
+    extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0);
+
+    // ... then do the reverse complement.
+    const Kmer kmer1 = kmer0.reverseComplement(k);
+    return kmer1;
+}
+
+
+
+// Get the marker KmerId for an oriented read and ordinal.
+// This is the same as getOrientedReadMarkerKmer, with the Kmer
+// converted to its KmerId.
+KmerId shasta::getOrientedReadMarkerKmerId(
+    OrientedReadId orientedReadId,
+    uint32_t ordinal,
+    uint64_t k,
+    const Reads& reads,
+    const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers
+    )
+{
+    const Kmer kmer = getOrientedReadMarkerKmer(orientedReadId, ordinal, k, reads, markers);
+    return KmerId(kmer.id(k));
+}
diff --git a/src/markerAccessFunctions.hpp b/src/markerAccessFunctions.hpp
new file mode 100644
index 0000000..0e9b7c9
--- /dev/null
+++ b/src/markerAccessFunctions.hpp
@@ -0,0 +1,53 @@
+#ifndef SHASTA_MARKER_ACCESS_FUNCTIONS_HPP
+#define SHASTA_MARKER_ACCESS_FUNCTIONS_HPP
+
+#include "Kmer.hpp"
+#include "ReadId.hpp"
+
+namespace shasta {
+
+    class CompressedMarker;
+    class Reads;
+    namespace MemoryMapped {
+        template<class T, class Int> class VectorOfVectors;
+    }
+
+    // Access functions for markers Kmers and KmerIds.
+    // There are similar member functions in class Assembler,
+    // but these are accessible anywhere else.
+
+    // Get the marker Kmer for an oriented read and ordinal,
+    // dispatching by strand.
+    Kmer getOrientedReadMarkerKmer(
+        OrientedReadId,
+        uint32_t ordinal,
+        uint64_t k,
+        const Reads&,
+        const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers
+        );
+
+    // Strand 0 version: extract the k-mer directly from the stored read.
+    Kmer getOrientedReadMarkerKmerStrand0(
+        ReadId,
+        uint32_t ordinal,
+        uint64_t k,
+        const Reads&,
+        const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers
+        );
+
+    // Strand 1 version: extract from strand 0, then reverse complement.
+    Kmer getOrientedReadMarkerKmerStrand1(
+        ReadId,
+        uint32_t ordinal,
+        uint64_t k,
+        const Reads&,
+        const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers
+        );
+
+    // Get the marker KmerId for an oriented read and ordinal.
+    KmerId getOrientedReadMarkerKmerId(
+        OrientedReadId,
+        uint32_t ordinal,
+        uint64_t k,
+        const Reads&,
+        const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers
+        );
+
+}
+
+#endif
diff --git a/src/mode3-AssemblyGraph.cpp b/src/mode3-AssemblyGraph.cpp
new file mode 100644
index 0000000..4b18a8e
--- /dev/null
+++ b/src/mode3-AssemblyGraph.cpp
@@ -0,0 +1,8499 @@
+// Shasta.
+#include "mode3-AssemblyGraph.hpp"
+#include "mode3-LocalAssembly.hpp"
+#include "mode3-PrimaryGraph.hpp"
+#include "mode3-PhasingTable.hpp"
+#include "Assembler.hpp"
+#include "AssemblerOptions.hpp"
+#include "copyNumber.hpp"
+#include "deduplicate.hpp"
+#include "diploidBayesianPhase.hpp"
+#include "dominatorTree.hpp"
+#include "enumeratePaths.hpp"
+#include "findLinearChains.hpp"
+#include "orderPairs.hpp"
+#include "performanceLog.hpp"
+#include "timestamp.hpp"
+using namespace shasta;
+using namespace mode3;
+
+// Boost libraries.
+#include <boost/archive/binary_oarchive.hpp>
+#include <boost/archive/binary_iarchive.hpp>
+#include <boost/graph/adj_list_serialize.hpp>
+#include <boost/graph/filtered_graph.hpp>
+#include <boost/pending/disjoint_sets.hpp>
+#include <boost/graph/reverse_graph.hpp>
+#include <boost/graph/strong_components.hpp>
+
+// Standard library.
+#include "fstream.hpp"
+#include <queue>
+#include "tuple.hpp"
+
+// Explicit instantiation.
+#include "MultithreadedObject.tpp"
+template class MultithreadedObject<AssemblyGraph>;
+
+
+// Create from a connected component of the PrimaryGraph, then call run.
+// The AssemblyGraph is serialized to disk immediately after creation,
+// before any processing, so failed runs can be restored for debugging.
+AssemblyGraph::AssemblyGraph(
+    const PrimaryGraph& graph,
+    uint64_t componentId,
+    const Assembler& assembler,
+    uint64_t threadCount,
+    const Mode3AssemblyOptions& options,
+    bool assembleSequence,
+    bool debug) :
+    MultithreadedObject<AssemblyGraph>(*this),
+    componentId(componentId),
+    assembler(assembler),
+    options(options)
+{
+    // Adjust the numbers of threads, if necessary.
+    // A threadCount of 0 means "use all hardware threads".
+    if(threadCount == 0) {
+        threadCount = std::thread::hardware_concurrency();
+    }
+
+    performanceLog << timestamp << "Creating the assembly graph for component " << componentId << endl;
+    create(graph, debug);
+
+    // Serialize it so we can restore it to facilitate debugging.
+    save("AssemblyGraph-" + to_string(componentId) + ".data");
+
+    performanceLog << timestamp << "Processing the assembly graph for component " << componentId << endl;
+    run(threadCount, assembleSequence, debug);
+    performanceLog << timestamp << "Done with the assembly graph for component " << componentId << endl;
+}
+
+
+
+// Load it from a binary archive, then call run.
+AssemblyGraph::AssemblyGraph(
+ const string& fileName,
+ const Assembler& assembler,
+ uint64_t threadCount,
+ const Mode3AssemblyOptions& options,
+ bool assembleSequence,
+ bool debug) :
+ MultithreadedObject<AssemblyGraph>(*this),
+ assembler(assembler),
+ options(options)
+{
+ // Adjust the numbers of threads, if necessary.
+ if(threadCount == 0) {
+ threadCount = std::thread::hardware_concurrency();
+ }
+
+ load(fileName);
+ run(threadCount, assembleSequence, debug);
+}
+
+
+
+// Run the assembly graph processing pipeline:
+// bubble cleanup, superbubble cleanup/removal, phasing, detangling,
+// and (optionally) sequence assembly.
+// When debug is set, intermediate snapshots are written with
+// stage labels "A" through "G".
+void AssemblyGraph::run(
+    uint64_t threadCount,
+    bool assembleSequence,
+    bool debug)
+{
+    const bool useBayesianModel = true;
+    // const uint64_t detangleWithSearchToleranceLow = 1;
+    // const uint64_t detangleWithSearchToleranceHigh = 6;
+    // const uint64_t optimizeChainsMinCommon = 3;
+    // const uint64_t optimizeChainsK = 100;
+
+    if(debug) write("A");
+
+    // Don't do any detangling before cleanup of bubbles and superbubbles and phasing.
+
+    // Cleanup bubbles and superbubbles.
+    // Must do compress to make sure all bubbles are in bubble chains.
+    // Iterate until a pass cleans up no bubbles.
+    compress();
+    for(uint64_t iteration=0; ; iteration ++) {
+        performanceLog << timestamp << "Iteration " << iteration <<
+            " of bubble cleanup begins." << endl;
+        const uint64_t cleanedUpBubbleCount = cleanupBubbles(
+            false,
+            options.assemblyGraphOptions.bubbleCleanupMaxOffset,
+            options.assemblyGraphOptions.chainTerminalCommonThreshold,
+            threadCount);
+        if(cleanedUpBubbleCount == 0) {
+            break;
+        }
+        if(debug) {
+            cout << "Cleaned up " << cleanedUpBubbleCount << " bubbles probably caused by errors." << endl;
+        }
+        compressBubbleChains();
+        compress();
+    }
+    if(debug) write("B");
+    cleanupSuperbubbles(false,
+        options.assemblyGraphOptions.superbubbleLengthThreshold1,
+        options.assemblyGraphOptions.chainTerminalCommonThreshold);
+    compress();
+
+    // Remove short superbubbles.
+    removeShortSuperbubbles(false,
+        options.assemblyGraphOptions.superbubbleLengthThreshold2,
+        options.assemblyGraphOptions.superbubbleLengthThreshold3);
+    compress();
+
+    // Phase.
+    compressBubbleChains();
+    if(debug) write("C");
+    phaseBubbleChainsUsingPhasingTable(
+        debug ? "C" : "",
+        options.assemblyGraphOptions.phaseErrorThreshold,
+        options.assemblyGraphOptions.bubbleErrorThreshold,
+        options.assemblyGraphOptions.longBubbleThreshold);
+    compress();
+
+    // For detangling, expand all bubble chains.
+    expand();
+
+    // Detangle: edges, then vertices, then edges again, then short superbubbles.
+    if(debug) write("D");
+    performanceLog << timestamp << "Detangling begins." << endl;
+    while(compressSequentialEdges());
+    compressBubbleChains();
+    detangleEdges(false,
+        options.assemblyGraphOptions.detangleToleranceLow,
+        options.assemblyGraphOptions.detangleToleranceHigh,
+        useBayesianModel,
+        options.assemblyGraphOptions.epsilon,
+        options.assemblyGraphOptions.minLogP);
+    while(compressSequentialEdges());
+    compressBubbleChains();
+    detangleVertices(false,
+        options.assemblyGraphOptions.detangleToleranceLow,
+        options.assemblyGraphOptions.detangleToleranceHigh,
+        useBayesianModel,
+        options.assemblyGraphOptions.epsilon,
+        options.assemblyGraphOptions.minLogP);
+    while(compressSequentialEdges());
+    compressBubbleChains();
+    detangleEdges(false,
+        options.assemblyGraphOptions.detangleToleranceLow,
+        options.assemblyGraphOptions.detangleToleranceHigh,
+        useBayesianModel,
+        options.assemblyGraphOptions.epsilon,
+        options.assemblyGraphOptions.minLogP);
+    detangleShortSuperbubbles(false,
+        options.assemblyGraphOptions.superbubbleLengthThreshold4,
+        options.assemblyGraphOptions.detangleToleranceLow,
+        options.assemblyGraphOptions.detangleToleranceHigh,
+        useBayesianModel,
+        options.assemblyGraphOptions.epsilon,
+        options.assemblyGraphOptions.minLogP);
+    performanceLog << timestamp << "Detangling ends." << endl;
+
+    compress();
+    compressBubbleChains();
+    if(debug) write("E");
+
+#if 0
+    // Optimize the chains.
+    optimizeChains(
+        false,
+        optimizeChainsMinCommon,
+        optimizeChainsK);
+#endif
+
+    // Before final output, renumber the edges contiguously.
+    renumberEdges();
+    if(debug) write("F");
+
+    if(assembleSequence) {
+
+        // Assemble sequence.
+        assembleAllChainsMultithreaded(
+            options.assemblyGraphOptions.chainTerminalCommonThreshold,
+            threadCount);
+        writeAssemblyDetails();
+
+        if(debug) write("G", true);
+
+    } else {
+
+        // Skip sequence assembly.
+        write("Final");
+    }
+
+
+}
+
+
+
+// Initial creation from the PrimaryGraph.
+// Each linear chain of edges in the PrimaryGraph after transitive reduction generates
+// an AssemblyGraphEdge (BubbleChain) consisting of a single haploid bubble.
+void AssemblyGraph::create(const PrimaryGraph& graph, bool debug)
+{
+    AssemblyGraph& cGraph = *this;
+
+    // Create a filtered version of the PathGraph, containing only the
+    // transitive reduction edges.
+    class EdgePredicate {
+    public:
+        bool operator()(const PrimaryGraph::edge_descriptor e) const
+        {
+            return not (*graph)[e].isNonTransitiveReductionEdge;
+        }
+        EdgePredicate(const PrimaryGraph& graph) : graph(&graph) {}
+        // Default constructor required by boost::filtered_graph.
+        EdgePredicate() : graph(0) {}
+    private:
+        const PrimaryGraph* graph;
+    };
+    using FilteredPrimaryGraph = boost::filtered_graph<PrimaryGraph, EdgePredicate>;
+    FilteredPrimaryGraph filteredGraph(graph, EdgePredicate(graph));
+
+    // Find linear chains in the PathGraph after transitive reduction.
+    vector< vector<PrimaryGraph::edge_descriptor> > inputChains;
+    findLinearChains(filteredGraph, 0, inputChains);
+
+    // Each chain generates an edge.
+    // Vertices are added as needed.
+    std::map<MarkerGraphEdgeId, vertex_descriptor> vertexMap;
+    for(const vector<PrimaryGraph::edge_descriptor>& inputChain: inputChains) {
+        const PrimaryGraph::vertex_descriptor v0 = source(inputChain.front(), graph);
+        const PrimaryGraph::vertex_descriptor v1 = target(inputChain.back(), graph);
+        const MarkerGraphEdgeId markerGraphEdgeId0 = graph[v0].edgeId;
+        const MarkerGraphEdgeId markerGraphEdgeId1 = graph[v1].edgeId;
+        const vertex_descriptor cv0 = getVertex(markerGraphEdgeId0, vertexMap);
+        const vertex_descriptor cv1 = getVertex(markerGraphEdgeId1, vertexMap);
+
+        // Create an edge for this input chain.
+        edge_descriptor ce;
+        tie(ce, ignore) = add_edge(cv0, cv1, cGraph);
+        AssemblyGraphEdge& edge = cGraph[ce];
+        edge.id = nextEdgeId++;
+
+        // The edge is a degenerate BubbleChain consisting of a single haploid bubble.
+        edge.resize(1);     // BubbleChain has length 1.
+        Bubble& bubble = edge.front();
+        bubble.resize(1);   // Bubble is haploid.
+
+        // Store the chain: the MarkerGraphEdgeId of the source vertex of
+        // each chain edge, plus that of the target of the last edge.
+        Chain& chain = bubble.front();
+        for(const PrimaryGraph::edge_descriptor e: inputChain) {
+            const PrimaryGraph::vertex_descriptor v = source(e, graph);
+            chain.push_back(graph[v].edgeId);
+        }
+        const PrimaryGraph::edge_descriptor eLast = inputChain.back();
+        const PrimaryGraph::vertex_descriptor vLast = target(eLast, graph);
+        chain.push_back(graph[vLast].edgeId);
+    }
+}
+
+
+
+// Return the vertex corresponding to a given MarkerGraphEdgeId,
+// creating it if it is not in the given vertexMap.
+AssemblyGraph::vertex_descriptor AssemblyGraph::getVertex(
+    MarkerGraphEdgeId markerGraphEdgeId,
+    std::map<MarkerGraphEdgeId, vertex_descriptor>& vertexMap)
+{
+    AssemblyGraph& cGraph = *this;
+
+    auto it = vertexMap.find(markerGraphEdgeId);
+    if(it == vertexMap.end()) {
+        // Not seen before: create the vertex and record it in the map.
+        const vertex_descriptor cv = add_vertex({markerGraphEdgeId}, cGraph);
+        vertexMap.insert({markerGraphEdgeId, cv});
+        return cv;
+    } else {
+        return it->second;
+    }
+}
+
+
+
+// Create a new vertex with a given MarkerGraphEdgeId.
+// Unlike getVertex, this always creates a new vertex
+// and does not use a vertexMap.
+AssemblyGraph::vertex_descriptor AssemblyGraph::createVertex(
+    MarkerGraphEdgeId markerGraphEdgeId)
+{
+    return add_vertex({markerGraphEdgeId}, *this);
+}
+
+
+
+// Remove a vertex from the graph.
+// The vertex must be isolated (no in-edges or out-edges).
+void AssemblyGraph::removeVertex(vertex_descriptor cv)
+{
+    AssemblyGraph& cGraph = *this;
+
+    SHASTA_ASSERT(in_degree(cv, cGraph) == 0);
+    SHASTA_ASSERT(out_degree(cv, cGraph) == 0);
+
+    boost::remove_vertex(cv, cGraph);
+}
+
+
+
+// Compute vertexIndex for every vertex.
+// This numbers vertices consecutively starting at zero.
+// This numbering becomes invalid as soon as a vertex is added or removed.
+void AssemblyGraph::numberVertices()
+{
+    AssemblyGraph& cGraph = *this;
+    uint64_t index = 0;
+    BGL_FORALL_VERTICES(cv, cGraph, AssemblyGraph) {
+        cGraph[cv].index = index++;
+    }
+}
+
+
+
+// Invalidate the vertex numbering computed by numberVertices,
+// setting every vertex index back to the invalid marker value.
+void AssemblyGraph::clearVertexNumbering()
+{
+    AssemblyGraph& cGraph = *this;
+    BGL_FORALL_VERTICES(cv, cGraph, AssemblyGraph) {
+        // Mark this vertex as not numbered.
+        cGraph[cv].index = invalid<uint64_t>;
+    }
+}
+
+
+// Reassign consecutive ids, starting at zero, to all edges,
+// resetting nextEdgeId in the process.
+// Ids are assigned in the order the edge iterator visits the edges.
+void AssemblyGraph::renumberEdges()
+{
+    AssemblyGraph& cGraph = *this;
+    nextEdgeId = 0;
+
+    BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+        AssemblyGraphEdge& edge = cGraph[ce];
+        edge.id = nextEdgeId;
+        ++nextEdgeId;
+    }
+}
+
+
+
+// Compress parallel edges into bubbles, where possible.
+// Two or more parallel edges v0->v1, each consisting of a BubbleChain of
+// length 1 (a single Bubble), are replaced by a single edge whose one
+// Bubble contains all of the Chains of the replaced edges (deduplicated).
+// Returns true if any changes were made.
+bool AssemblyGraph::compressParallelEdges()
+{
+    AssemblyGraph& cGraph = *this;
+    bool changesWereMade = false;
+
+    // Look for sets of parallel edges v0->v1.
+    // These work vectors are reused across iterations to avoid reallocation.
+    vector<vertex_descriptor> childrenVertices;
+    vector<edge_descriptor> edgesToBeRemoved;
+    Bubble newBubble;
+    BGL_FORALL_VERTICES(v0, cGraph, AssemblyGraph) {
+        if(out_degree(v0, cGraph) < 2) {
+            continue;
+        }
+
+        // Find distinct children vertices of v0.
+        childrenVertices.clear();
+        BGL_FORALL_OUTEDGES(v0, e, cGraph, AssemblyGraph) {
+            childrenVertices.push_back(target(e, cGraph));
+        }
+        deduplicate(childrenVertices);
+
+        // Handle the children vertices one at a time.
+        // NOTE(review): edges are added/removed below while the outer
+        // BGL_FORALL_VERTICES iteration is active - assumed safe for this
+        // graph type's vertex iterators; confirm against the adjacency_list
+        // selectors used by AssemblyGraph.
+        for(const vertex_descriptor v1: childrenVertices) {
+
+            // Create the new bubble using parallel edges v0->v1.
+            newBubble.clear();
+            edgesToBeRemoved.clear();
+            BGL_FORALL_OUTEDGES(v0, e, cGraph, AssemblyGraph) {
+                if(target(e, cGraph) != v1) {
+                    continue;
+                }
+                AssemblyGraphEdge& edge = cGraph[e];
+
+                // The BubbleChain must have length 1.
+                // Longer BubbleChains are left alone.
+                if(edge.size() > 1) {
+                    continue;
+                }
+                const Bubble& oldBubble = edge.front();
+
+                // Accumulate this edge's Chains into the new Bubble.
+                copy(oldBubble.begin(), oldBubble.end(), back_inserter(newBubble));
+                edgesToBeRemoved.push_back(e);
+            }
+            // Nothing to compress unless at least two edges qualified.
+            if(edgesToBeRemoved.size() < 2) {
+                continue;
+            }
+
+            // Create the new edge.
+            changesWereMade = true;
+            edge_descriptor eNew;
+            tie(eNew, ignore) = add_edge(v0, v1, cGraph);
+            AssemblyGraphEdge& newEdge = cGraph[eNew];
+            newEdge.id = nextEdgeId++;
+            newEdge.resize(1); // Make it a single bubble.
+            Bubble& newEdgeBubble = newEdge.front();
+            newEdgeBubble = newBubble;
+            newEdgeBubble.deduplicate();
+
+            // Remove the old edges.
+            for(const edge_descriptor e: edgesToBeRemoved) {
+                boost::remove_edge(e, cGraph);
+            }
+
+        }
+    }
+    return changesWereMade;
+}
+
+
+
+// Remove duplicate chains from this Bubble.
+// Delegates to shasta::deduplicate on the underlying container;
+// NOTE(review): this presumably reorders the Chains (sort + unique) -
+// callers must not rely on Chain order being preserved; confirm
+// against the shasta::deduplicate implementation.
+void Bubble::deduplicate()
+{
+    shasta::deduplicate(*this);
+}
+
+
+
+// Compress linear sequences of edges (BubbleChains) into longer BubbleChains.
+// Each linear chain of two or more edges is replaced by a single edge
+// whose BubbleChain is the concatenation of the BubbleChains of the
+// replaced edges. The internal vertices of each linear chain are removed.
+// Returns true if any changes were made.
+bool AssemblyGraph::compressSequentialEdges()
+{
+    AssemblyGraph& cGraph = *this;
+    bool changesWereMade = false;
+
+    // Find linear chains of edges.
+    vector< vector<edge_descriptor> > linearChains;
+    findLinearChains(cGraph, 0, linearChains);
+
+
+
+    // Each linear chain of more than one edge gets compressed into a single edge (BubbleChain).
+    vector<vertex_descriptor> internalVertices;
+    for(const vector<edge_descriptor>& linearChain: linearChains) {
+        if(linearChain.size() < 2) {
+            continue;
+        }
+
+        // Create the new edge.
+        changesWereMade = true;
+        const vertex_descriptor v0 = source(linearChain.front(), cGraph);
+        const vertex_descriptor v1 = target(linearChain.back(), cGraph);
+        edge_descriptor ceNew;
+        tie(ceNew, ignore) = add_edge(v0, v1, cGraph);
+        AssemblyGraphEdge& newEdge = cGraph[ceNew];
+        newEdge.id = nextEdgeId++;
+        for(const edge_descriptor ce: linearChain) {
+            const AssemblyGraphEdge& oldEdge = cGraph[ce];
+            copy(oldEdge.begin(), oldEdge.end(), back_inserter(newEdge));
+        }
+
+        // Gather the vertices internal to this linear chain BEFORE removing
+        // the old edges: calling source() on an edge descriptor after
+        // remove_edge has destroyed the edge is undefined behavior.
+        internalVertices.clear();
+        for(uint64_t i=1; i<linearChain.size(); i++) {
+            internalVertices.push_back(source(linearChain[i], cGraph));
+        }
+
+        // Remove the old edges.
+        for(const edge_descriptor ce: linearChain) {
+            boost::remove_edge(ce, cGraph);
+        }
+
+        // Remove the vertices internal to the old edge.
+        // They are now isolated, as required by removeVertex.
+        for(const vertex_descriptor cv: internalVertices) {
+            cGraph.removeVertex(cv);
+        }
+    }
+    return changesWereMade;
+}
+
+
+
+// Iterate compressBubbleChains, compressParallelEdges, and
+// compressSequentialEdges until an iteration makes no changes.
+// Returns true if anything changed at all.
+bool AssemblyGraph::compress()
+{
+    bool changesWereMade = false;
+
+    bool changedThisIteration = true;
+    while(changedThisIteration) {
+
+        // Run all three compression steps unconditionally, so each
+        // gets a chance to act at every iteration.
+        const bool bubbleChainChanges = compressBubbleChains();
+        const bool parallelChanges = compressParallelEdges();
+        const bool sequentialChanges = compressSequentialEdges();
+
+        changedThisIteration =
+            bubbleChainChanges or parallelChanges or sequentialChanges;
+        if(changedThisIteration) {
+            changesWereMade = true;
+        }
+    }
+
+    return changesWereMade;
+}
+
+
+
+// Call compress on the BubbleChain of every edge,
+// merging adjacent haploid bubbles within each BubbleChain.
+// Returns true if any BubbleChain changed.
+bool AssemblyGraph::compressBubbleChains()
+{
+    AssemblyGraph& cGraph = *this;
+
+    bool changesWereMade = false;
+    BGL_FORALL_EDGES(e, cGraph, AssemblyGraph) {
+        // Evaluate compress() first so it runs for every edge.
+        changesWereMade = cGraph[e].compress() or changesWereMade;
+    }
+
+    return changesWereMade;
+}
+
+
+
+// This does the opposite of compress. All bubble chains that
+// consist of more than one simple haploid bubble are expanded into one
+// edge for each edge of each bubble.
+// For optimal results it is best to call compressBubbleChains before expand.
+void AssemblyGraph::expand()
+{
+    AssemblyGraph& cGraph = *this;
+
+    // Gather all edges that exist at this point, because we will be
+    // adding and removing edges while looping.
+    vector<edge_descriptor> initialEdges;
+    BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+        initialEdges.push_back(ce);
+    }
+
+
+
+    // Loop over the initial edges.
+    for(const edge_descriptor ce: initialEdges) {
+        BubbleChain& bubbleChain = cGraph[ce];
+
+        // If this bubbleChain consists of a single haploid bubble, don't do anything.
+        if(bubbleChain.isSimpleChain()) {
+            continue;
+        }
+
+        // Prepare a vector of the vertices that will be the sources and targets
+        // of the edges we will create.
+        // The internal vertices are created from the first MarkerGraphEdgeId of the
+        // first Chain of each Bubble, which is the MarkerGraphEdgeId where all
+        // Chains of that Bubble begin.
+        vector<vertex_descriptor> newVertices;
+        newVertices.push_back(source(ce, cGraph));
+        for(uint64_t positionInBubbleChain=1; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) {
+            const vertex_descriptor cv = createVertex(bubbleChain[positionInBubbleChain].front().front());
+            newVertices.push_back(cv);
+        }
+        newVertices.push_back(target(ce, cGraph));
+
+        // Create a new edge for each chain of each bubble in this bubble chain.
+        for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) {
+            Bubble& bubble = bubbleChain[positionInBubbleChain];
+            const vertex_descriptor cv0 = newVertices[positionInBubbleChain];
+            const vertex_descriptor cv1 = newVertices[positionInBubbleChain + 1];
+
+            for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) {
+                Chain& chain = bubble[indexInBubble];
+
+                // Create a new edge for this chain.
+                edge_descriptor ceNew;
+                tie(ceNew, ignore) = add_edge(cv0, cv1, cGraph);
+                AssemblyGraphEdge& edge = cGraph[ceNew];
+                edge.id = nextEdgeId++;
+
+                // Store this Chain in the new edge, as a BubbleChain
+                // consisting of a single haploid Bubble.
+                // The swap avoids copying the Chain; the old edge is
+                // about to be removed anyway.
+                BubbleChain& newBubbleChain = cGraph[ceNew];
+                newBubbleChain.resize(1);
+                Bubble& newBubble = newBubbleChain.front();
+                newBubble.resize(1);
+                Chain& newChain = newBubble.front();
+                newChain.swap(chain);
+            }
+        }
+
+        // Now we can remove the BubbleChain.
+        boost::remove_edge(ce, cGraph);
+    }
+}
+
+
+
+// Write all output files for this AssemblyGraph:
+// csv summaries, graphviz (with and without labels), gfa,
+// expanded gfa, and optionally expanded fasta.
+// The file name prefix combines the given name with the componentId.
+void AssemblyGraph::write(const string& name, bool writeSequence) const
+{
+    const string fileNamePrefix = name + "-" + to_string(componentId);
+
+    cout << fileNamePrefix << ": " << num_vertices(*this) <<
+        " vertices, " << num_edges(*this) << " edges. Next edge id " << nextEdgeId << endl;
+
+    writeCsv(fileNamePrefix);
+    writeGraphviz(fileNamePrefix, true);
+    writeGraphviz(fileNamePrefix, false);
+    writeGfa(fileNamePrefix);
+    // The expanded writers take the bare name: they append the
+    // componentId themselves (see the file-opening overloads below).
+    writeGfaExpanded(name, writeSequence, writeSequence);
+    if(writeSequence) {
+        writeFastaExpanded(name);
+    }
+}
+
+
+
+// Write the four csv files describing this AssemblyGraph:
+// chain details, chains, bubbles, and bubble chains.
+void AssemblyGraph::writeCsv(const string& fileNamePrefix) const
+{
+    writeChainsDetailsCsv(fileNamePrefix);
+    writeChainsCsv(fileNamePrefix);
+    writeBubblesCsv(fileNamePrefix);
+    writeBubbleChainsCsv(fileNamePrefix);
+}
+
+
+
+// Write one csv line per BubbleChain (that is, per edge of the AssemblyGraph),
+// with its endpoints, bubble count, and estimated offsets.
+void AssemblyGraph::writeBubbleChainsCsv(const string& fileNamePrefix) const
+{
+    const AssemblyGraph& cGraph = *this;
+
+    ofstream csv(fileNamePrefix + "-BubbleChains.csv");
+    csv << "Id,ComponentId,BubbleChainId,v0,v1,BubbleCount,AverageOffset,MinOffset,MaxOffset,\n";
+
+    BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+        const vertex_descriptor cv0 = source(ce, cGraph);
+        const vertex_descriptor cv1 = target(ce, cGraph);
+        const BubbleChain& bubbleChain = cGraph[ce];
+
+        // Estimated base offsets over the whole BubbleChain.
+        uint64_t averageOffset;
+        uint64_t minOffset;
+        uint64_t maxOffset;
+        bubbleChainOffset(bubbleChain, averageOffset, minOffset, maxOffset);
+
+        csv << bubbleChainStringId(ce) << ",";
+        csv << componentId << ",";
+        csv << cGraph[ce].id << ",";
+        // v0 and v1 are reported via the MarkerGraphEdgeId of the vertices.
+        csv << cGraph[cv0].edgeId << ",";
+        csv << cGraph[cv1].edgeId << ",";
+        csv << bubbleChain.size() << ",";
+        csv << averageOffset << ",";
+        csv << minOffset << ",";
+        csv << maxOffset << ",";
+        csv << "\n";
+    }
+}
+
+
+
+
+// Write one csv line per Bubble, with its position in its BubbleChain,
+// its endpoints (shared by all of its Chains), ploidy, and offsets.
+void AssemblyGraph::writeBubblesCsv(const string& fileNamePrefix) const
+{
+    const AssemblyGraph& cGraph = *this;
+
+    ofstream csv(fileNamePrefix + "-Bubbles.csv");
+    csv << "Id,ComponentId,BubbleChainId,Position in bubble chain,v0,v1,Ploidy,AverageOffset,MinOffset,MaxOffset,\n";
+
+    BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+        const BubbleChain& bubbleChain = cGraph[ce];
+
+        for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) {
+            const Bubble& bubble = bubbleChain[positionInBubbleChain];
+            const Chain& firstChain = bubble.front();
+
+            // Check that all the chains begins/end in the same place.
+            // This is an invariant of Bubbles: all Chains of a Bubble
+            // share their first and last MarkerGraphEdgeId.
+            for(const Chain& chain: bubble) {
+                SHASTA_ASSERT(chain.front() == firstChain.front());
+                SHASTA_ASSERT(chain.back() == firstChain.back());
+            }
+
+            uint64_t averageOffset;
+            uint64_t minOffset;
+            uint64_t maxOffset;
+            bubbleOffset(bubble, averageOffset, minOffset, maxOffset);
+
+            csv << bubbleStringId(ce, positionInBubbleChain) << ",";
+            csv << componentId << ",";
+            csv << cGraph[ce].id << ",";
+            csv << positionInBubbleChain << ",";
+            // v0 and v1 are the shared endpoints of all Chains of the Bubble.
+            csv << firstChain.front() << ",";
+            csv << firstChain.back() << ",";
+            csv << bubble.size() << ",";
+            csv << averageOffset << ",";
+            csv << minOffset << ",";
+            csv << maxOffset << ",";
+            csv << "\n";
+        }
+    }
+
+}
+
+
+// Write one csv line per Chain, with its location in the BubbleChain,
+// its length (number of MarkerGraphEdgeIds), and its estimated base offset.
+void AssemblyGraph::writeChainsCsv(const string& fileNamePrefix) const
+{
+    const AssemblyGraph& cGraph = *this;
+
+    ofstream csv(fileNamePrefix + "-Chains.csv");
+    csv << "Id,ComponentId,BubbleChainId,Position in bubble chain,Index in bubble,Length,Offset\n";
+
+    BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+        const BubbleChain& bubbleChain = cGraph[ce];
+
+        for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) {
+            const Bubble& bubble = bubbleChain[positionInBubbleChain];
+            const uint64_t ploidy = bubble.size();
+
+            for(uint64_t indexInBubble=0; indexInBubble<ploidy; indexInBubble++) {
+                const Chain& chain = bubble[indexInBubble];
+                // Every Chain contains at least its two endpoint MarkerGraphEdgeIds.
+                SHASTA_ASSERT(chain.size() >= 2);
+
+                csv << chainStringId(ce, positionInBubbleChain, indexInBubble) << ",";
+                csv << componentId << ",";
+                csv << cGraph[ce].id << ",";
+                csv << positionInBubbleChain << ",";
+                csv << indexInBubble << ",";
+                csv << chain.size() << ",";
+                csv << chainOffset(chain) << ",";
+                csv << "\n";
+            }
+        }
+    }
+
+}
+
+
+
+// Write a csv file with one line per MarkerGraphEdgeId of every Chain.
+// The per-edge work is delegated to writeChainDetailsCsv;
+// the header is written once here, so writeHeader is passed as false.
+void AssemblyGraph::writeChainsDetailsCsv(const string& fileNamePrefix) const
+{
+    const AssemblyGraph& cGraph = *this;
+
+    ofstream csv(fileNamePrefix + "-ChainsDetails.csv");
+    csv << "Id,ComponentId,BubbleChainId,Position in bubble chain,"
+        "Index in bubble,Position in chain,MarkerGraphEdgeId,Coverage,Common,Offset\n";
+
+    BGL_FORALL_EDGES(e, cGraph, AssemblyGraph) {
+        writeChainDetailsCsv(csv, e, false);
+    }
+}
+
+
+
+// Write csv details for all Chains of a single edge (BubbleChain):
+// one line per MarkerGraphEdgeId of each Chain, with its coverage and,
+// for positions after the first, the number of common reads and base
+// offset relative to the previous MarkerGraphEdgeId.
+// If writeHeader is true, the csv header line is written first.
+void AssemblyGraph::writeChainDetailsCsv(
+    ostream& csv,
+    edge_descriptor e,
+    bool writeHeader) const
+{
+    const AssemblyGraph& cGraph = *this;
+    const BubbleChain& bubbleChain = cGraph[e];
+
+    if(writeHeader) {
+        csv << "Id,ComponentId,BubbleChainId,Position in bubble chain,"
+            "Index in bubble,Position in chain,MarkerGraphEdgeId,Coverage,Common,Offset\n";
+    }
+
+    for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) {
+        const Bubble& bubble = bubbleChain[positionInBubbleChain];
+        const uint64_t ploidy = bubble.size();
+
+        for(uint64_t indexInBubble=0; indexInBubble<ploidy; indexInBubble++) {
+            const Chain& chain = bubble[indexInBubble];
+            SHASTA_ASSERT(chain.size() >= 2);
+
+            for(uint64_t positionInChain=0; positionInChain<chain.size(); positionInChain++) {
+                const MarkerGraphEdgeId markerGraphEdgeId = chain[positionInChain];
+                const uint64_t coverage = assembler.markerGraph.edgeCoverage(markerGraphEdgeId);
+                csv << chainStringId(e, positionInBubbleChain, indexInBubble) << ",";
+                csv << componentId << ",";
+                csv << cGraph[e].id << ",";
+                csv << positionInBubbleChain << ",";
+                csv << indexInBubble << ",";
+                csv << positionInChain << ",";
+                csv << markerGraphEdgeId << ",";
+                csv << coverage << ",";
+
+                // The Common and Offset columns are only filled in for
+                // positions after the first, and Offset only when there
+                // are common reads. Otherwise the line ends early.
+                if(positionInChain != 0) {
+                    const MarkerGraphEdgeId previousMarkerGraphEdgeId = chain[positionInChain - 1];
+                    MarkerGraphEdgePairInfo info;
+                    SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(
+                        previousMarkerGraphEdgeId, markerGraphEdgeId, info));
+                    csv << info.common << ",";
+                    if(info.common != 0) {
+                        csv << info.offsetInBases << ",";
+                    }
+                }
+                csv << "\n";
+            }
+        }
+    }
+}
+
+
+
+// Write the AssemblyGraph in Graphviz dot format.
+// Vertices are labeled with their MarkerGraphEdgeId and its coverage.
+// If labels is true, each edge is labeled with its BubbleChain string id
+// and average offset, plus extra detail for simple haploid chains;
+// the output file name then ends in ".dot" instead of "-NoLabels.dot".
+void AssemblyGraph::writeGraphviz(
+    const string& fileNamePrefix,
+    bool labels) const
+{
+    const AssemblyGraph& cGraph = *this;
+
+    ofstream dot;
+    if(labels) {
+        dot.open(fileNamePrefix + ".dot");
+    } else {
+        dot.open(fileNamePrefix + "-NoLabels.dot");
+    }
+
+    dot << "digraph Component_" << componentId << "{\n";
+
+    // Write the vertices. Each vertex is identified by its MarkerGraphEdgeId.
+    BGL_FORALL_VERTICES(cv, cGraph, AssemblyGraph) {
+        const MarkerGraphEdgeId edgeId = cGraph[cv].edgeId;
+        const uint64_t coverage = assembler.markerGraph.edgeCoverage(edgeId);
+        dot << edgeId << "[label=\"" << edgeId << "\\n" << coverage << "\"];\n";
+    }
+
+
+
+    // Write the edges.
+    BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+        const BubbleChain& bubbleChain = cGraph[ce];
+        const vertex_descriptor cv0 = source(ce, cGraph);
+        const vertex_descriptor cv1 = target(ce, cGraph);
+
+        uint64_t averageOffset;
+        uint64_t minOffset;
+        uint64_t maxOffset;
+        bubbleChainOffset(cGraph[ce], averageOffset, minOffset, maxOffset);
+
+        dot << cGraph[cv0].edgeId << "->" << cGraph[cv1].edgeId;
+
+        if(labels) {
+            dot << " [label=\"";
+            dot << bubbleChainStringId(ce) << "\\noff=" << averageOffset;
+
+            // Additional annotation if this BubbleChain consists of a single
+            // haploid bubble: chain length, average internal coverage,
+            // and the second and second-to-last MarkerGraphEdgeIds.
+            const uint64_t bubbleCount = bubbleChain.size();
+            if(bubbleCount == 1) {
+                const Bubble& bubble = bubbleChain.front();
+                const uint64_t ploidy = bubble.size();
+                if(ploidy == 1) {
+                    const Chain& chain = bubble.front();
+                    dot << "\\nlen=" << chain.size();
+                    if(chain.size() > 2) {
+                        // Compute average coverage for the internal edges.
+                        uint64_t coverageSum = 0;
+                        for(uint64_t i=1; i<chain.size()-1; i++) {
+                            coverageSum += assembler.markerGraph.edgeCoverage(chain[i]);
+                        }
+                        const double averageCoverage = double(coverageSum) / double(chain.size() - 2);
+                        dot << "\\ncov=" << uint64_t(std::round(averageCoverage));
+
+                        dot << "\\n" << chain.second();
+                        if(chain.size() > 3) {
+                            dot << "\\n" << chain.secondToLast();
+                        }
+                    }
+                }
+            }
+
+            dot << "\"]";
+        }
+        dot << ";\n";
+    }
+
+    dot << "}\n";
+}
+
+
+
+// Write the AssemblyGraph in GFA 1.0 format, one segment per edge
+// (BubbleChain). Sequence is not written; the segment length is the
+// average offset estimate for the BubbleChain.
+void AssemblyGraph::writeGfa(const string& fileNamePrefix) const
+{
+    const AssemblyGraph& cGraph = *this;
+
+    ofstream gfa(fileNamePrefix + ".gfa");
+
+    // Write the header line.
+    gfa << "H\tVN:Z:1.0\n";
+
+    // Write a segment for each edge.
+    BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+
+        uint64_t averageOffset;
+        uint64_t minOffset;
+        uint64_t maxOffset;
+        bubbleChainOffset(cGraph[ce], averageOffset, minOffset, maxOffset);
+
+        // Record type.
+        gfa << "S\t";
+
+        // Name.
+        gfa << bubbleChainStringId(ce) << "\t";
+
+        // Sequence ("*" = not available).
+        gfa << "*\t";
+
+        // Sequence length in bases (the average offset estimate).
+        gfa << "LN:i:" << averageOffset << "\n";
+    }
+
+    // For each vertex, write links between each pair of incoming/outgoing edges.
+    BGL_FORALL_VERTICES(cv, cGraph, AssemblyGraph) {
+        BGL_FORALL_INEDGES(cv, ceIn, cGraph, AssemblyGraph) {
+            BGL_FORALL_OUTEDGES(cv, ceOut, cGraph, AssemblyGraph) {
+                gfa <<
+                    "L\t" <<
+                    bubbleChainStringId(ceIn) << "\t+\t" <<
+                    bubbleChainStringId(ceOut) << "\t+\t*\n";
+            }
+        }
+    }
+}
+
+
+
+// Write the expanded GFA representation to a stream:
+// header, one segment per Chain, then the links between Chains.
+void AssemblyGraph::writeGfaExpanded(
+    ostream& gfa,
+    bool includeSequence,
+    bool useSequenceLength) const
+{
+    writeGfaHeader(gfa);
+    writeGfaSegmentsExpanded(gfa, includeSequence, useSequenceLength);
+    writeGfaLinksExpanded(gfa);
+}
+
+
+
+// Write one GFA segment for each Chain of each Bubble of each BubbleChain.
+// If includeSequence is true, the assembled sequence is written
+// (useSequenceLength must then also be true).
+// Otherwise the segment carries "*" for sequence, and its LN tag is either
+// the assembled sequence length (useSequenceLength) or the offset estimate.
+void AssemblyGraph::writeGfaSegmentsExpanded(
+    ostream& gfa,
+    bool includeSequence,
+    bool useSequenceLength
+    ) const
+{
+    // Writing sequence only makes sense if lengths come from the sequence.
+    if(includeSequence) {
+        SHASTA_ASSERT(useSequenceLength);
+    }
+
+    const AssemblyGraph& graph = *this;
+
+    // Loop over BubbleChains. Each Chain of each Bubble generates a GFA segment.
+    BGL_FORALL_EDGES(ce, graph, AssemblyGraph) {
+        const BubbleChain& bubbleChain = graph[ce];
+
+        // Loop over Bubbles of this chain.
+        for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size();
+            ++positionInBubbleChain) {
+            const Bubble& bubble = bubbleChain[positionInBubbleChain];
+
+            // Loop over chains of this bubble.
+            for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) {
+                const Chain& chain = bubble[indexInBubble];
+
+                // Record type.
+                gfa << "S\t";
+
+                // Name.
+                gfa << chainStringId(ce, positionInBubbleChain, indexInBubble) << "\t";
+
+                if(includeSequence) {
+                    using shasta::Base;
+                    const vector<Base>& sequence = chain.sequence;
+
+                    // Sequence.
+                    copy(sequence.begin(), sequence.end(), ostream_iterator<Base>(gfa));
+                    gfa << "\t";
+
+                    // Sequence length in bases.
+                    gfa << "LN:i:" << sequence.size() << "\n";
+
+                } else {
+
+                    // Sequence ("*" = not written).
+                    gfa << "*\t";
+
+                    // Sequence length in bases.
+                    if(useSequenceLength) {
+                        gfa << "LN:i:" << chain.sequence.size() << "\n";
+                    } else {
+                        // No assembled sequence: fall back to the offset estimate.
+                        const uint64_t offset = chainOffset(chain);
+                        gfa << "LN:i:" << offset << "\n";
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+
+// This writes a csv summary with one line for each assembled segment (Chain).
+// Columns: chain id, connectivity, componentId, bubble chain id,
+// position in bubble chain, index in bubble, sequence length,
+// internal coverage (blank for chains of length 2), P-value,
+// Bandage display color, preceding segments, following segments.
+void AssemblyGraph::writeCsvSummary(ostream& csv) const
+{
+    const AssemblyGraph& assemblyGraph = *this;
+
+    // Loop over BubbleChains. Each Chain of each Bubble generates a GFA segment.
+    BGL_FORALL_EDGES(e, assemblyGraph, AssemblyGraph) {
+        const AssemblyGraphEdge& edge = assemblyGraph[e];
+        const BubbleChain& bubbleChain = edge;
+        const vertex_descriptor v0 = source(e, assemblyGraph);
+        const vertex_descriptor v1 = target(e, assemblyGraph);
+
+        // Loop over Bubbles of this chain.
+        for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size();
+            ++positionInBubbleChain) {
+            const Bubble& bubble = bubbleChain[positionInBubbleChain];
+
+            // Loop over chains of this bubble.
+            for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) {
+                const Chain& chain = bubble[indexInBubble];
+                const uint64_t pValue = chainPValue(e, positionInBubbleChain, indexInBubble);
+
+                // Define connectivity string.
+                // pValue 0 means this is the only Chain of its BubbleChain;
+                // otherwise pValue is the ploidy of the containing Bubble.
+                string connectivity;
+                if(pValue == 0) {
+                    const bool danglingAtBeginning = (in_degree(v0, assemblyGraph) == 0);
+                    const bool danglingAtEnd = (out_degree(v1, assemblyGraph) == 0);
+                    const bool isDangling = (danglingAtBeginning or danglingAtEnd);
+                    const bool isIsolated = (danglingAtBeginning and danglingAtEnd);
+                    if(isIsolated) {
+                        connectivity = "Isolated";
+                    } else if(isDangling) {
+                        connectivity = "Dangling";
+                    } else {
+                        connectivity = "Complex";
+                    }
+
+                } else if(pValue == 1) {
+                    connectivity = "Haploid";
+                } else if(pValue == 2) {
+                    connectivity = "Diploid";
+                } else {
+                    connectivity = "Ploidy-" + to_string(pValue);
+                }
+
+                // Set the color for display in Bandage.
+                // The colors below are constructed using HSV(hue,75%,750%).
+                // Bandage support for HSV appears to be buggy.
+                string color;
+                switch(pValue) {
+
+                case 0:
+                    {
+                        // The only Chain of this BubbleChain.
+                        // Figure out if it is dangling.
+                        // These v0/v1 intentionally shadow the outer ones
+                        // (same values).
+                        const vertex_descriptor v0 = source(e, assemblyGraph);
+                        const vertex_descriptor v1 = target(e, assemblyGraph);
+                        const bool isDanglingBackward = (in_degree(v0, assemblyGraph) == 0);
+                        const bool isDanglingForward = (out_degree(v1, assemblyGraph) == 0);
+                        const bool isIsolated = (isDanglingBackward and isDanglingForward);
+                        const bool isDangling = (isDanglingBackward or isDanglingForward);
+
+                        if(isIsolated) {
+                            color = "#3030bf"; // Blue
+                        } else if(isDangling) {
+                            color = "#30bfbf"; // Cyan
+                        } else {
+                            color = "#bf30bf"; // Purple
+                        }
+                    }
+                    break;
+
+                case 1:
+                    // Haploid Chain in a non-trivial BubbleChain.
+                    color = "#bf3030"; // Red
+                    break;
+                case 2:
+                    // Diploid segment.
+                    color = "#30bf30"; // Green
+                    break;
+                default:
+                    // Ploidy > 2.
+                    color = "#bfbf30"; // Yellow
+                    break;
+                }
+
+                csv << chainStringId(e, positionInBubbleChain, indexInBubble) << ",";
+                csv << connectivity << ",";
+                csv << componentId << ",";
+                csv << edge.id << ",";
+                csv << positionInBubbleChain << ",";
+                csv << indexInBubble << ",";
+                csv << chain.sequence.size() << ",";
+                // Coverage is only defined for chains with internal edges.
+                if(chain.size() > 2) {
+                    csv << std::fixed << std::setprecision(1) << primaryCoverage(chain);
+                }
+                csv << ",";
+                csv << pValue << ",";
+                csv << color << ",";
+
+
+
+                // Write the preceding segments.
+                if(positionInBubbleChain == 0) {
+
+                    // The preceding segments are the Chains of the last Bubble
+                    // of each previous BubbleChain.
+                    // Note: the loop variables below intentionally shadow the
+                    // outer e/edge/bubbleChain/positionInBubbleChain/bubble.
+                    bool isFirst = true;
+                    BGL_FORALL_INEDGES(v0, e, assemblyGraph, AssemblyGraph) {
+                        const AssemblyGraphEdge& edge = assemblyGraph[e];
+                        const BubbleChain& bubbleChain = edge;
+                        const uint64_t positionInBubbleChain = bubbleChain.size() - 1;
+                        const Bubble& bubble = bubbleChain[positionInBubbleChain];
+                        for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) {
+                            if(isFirst) {
+                                isFirst = false;
+                            } else {
+                                csv << " ";
+                            }
+                            csv << chainStringId(e, positionInBubbleChain, indexInBubble);
+                        }
+                    }
+                } else {
+
+                    // The preceding segments are the Chains of the previous Bubble
+                    // in this BubbleChain.
+                    const Bubble& bubble = bubbleChain[positionInBubbleChain - 1];
+                    for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) {
+                        if(indexInBubble != 0) {
+                            csv << " ";
+                        }
+                        csv << chainStringId(e, positionInBubbleChain - 1, indexInBubble);
+                    }
+                }
+                csv << ",";
+
+
+
+                // Write the following segments.
+                if(positionInBubbleChain == bubbleChain.size() - 1) {
+
+                    // The following segments are the Chains of the first Bubble
+                    // of each next BubbleChain.
+                    bool isFirst = true;
+                    BGL_FORALL_OUTEDGES(v1, e, assemblyGraph, AssemblyGraph) {
+                        const AssemblyGraphEdge& edge = assemblyGraph[e];
+                        const BubbleChain& bubbleChain = edge;
+                        const uint64_t positionInBubbleChain = 0;
+                        const Bubble& bubble = bubbleChain[positionInBubbleChain];
+                        for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) {
+                            if(isFirst) {
+                                isFirst = false;
+                            } else {
+                                csv << " ";
+                            }
+                            csv << chainStringId(e, positionInBubbleChain, indexInBubble);
+                        }
+                    }
+                } else {
+
+                    // The following segments are the Chains of the next Bubble
+                    // in this BubbleChain.
+                    const Bubble& bubble = bubbleChain[positionInBubbleChain + 1];
+                    for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) {
+                        if(indexInBubble != 0) {
+                            csv << " ";
+                        }
+                        csv << chainStringId(e, positionInBubbleChain + 1, indexInBubble);
+                    }
+                }
+                csv << ",";
+
+                csv << "\n";
+            }
+        }
+    }
+}
+
+
+
+// Write the GFA links for the expanded representation:
+// links between Chains of adjacent Bubbles within each BubbleChain,
+// then links between the last Bubble of each incoming BubbleChain and
+// the first Bubble of each outgoing BubbleChain at every vertex.
+// The overlap length is the length of the shared MarkerGraphEdgeId sequence.
+void AssemblyGraph::writeGfaLinksExpanded(ostream& gfa) const
+{
+    const AssemblyGraph& graph = *this;
+
+    // Write links between adjacent Chains of each BubbleChain.
+    BGL_FORALL_EDGES(ce, graph, AssemblyGraph) {
+        const BubbleChain& bubbleChain = graph[ce];
+
+        // Loop over Bubbles of this chain.
+        for(uint64_t positionInBubbleChain=1; positionInBubbleChain<bubbleChain.size();
+            ++positionInBubbleChain) {
+            const Bubble& bubble0 = bubbleChain[positionInBubbleChain - 1];
+            const Bubble& bubble1 = bubbleChain[positionInBubbleChain];
+            // Adjacent Chains overlap by the marker graph edge at which they meet:
+            // the first MarkerGraphEdgeId of bubble1's first Chain.
+            const uint64_t overlapLength = assembler.markerGraph.edgeSequence[bubble1.front().front()].size();
+
+            // Link every Chain of bubble0 to every Chain of bubble1.
+            for(uint64_t indexInBubble0=0; indexInBubble0<bubble0.size(); indexInBubble0++) {
+                const string chain0StringId = chainStringId(ce, positionInBubbleChain-1, indexInBubble0);
+
+                for(uint64_t indexInBubble1=0; indexInBubble1<bubble1.size(); indexInBubble1++) {
+                    const string chain1StringId = chainStringId(ce, positionInBubbleChain, indexInBubble1);
+
+                    gfa <<
+                        "L\t" <<
+                        chain0StringId << "\t+\t" <<
+                        chain1StringId << "\t+\t" << overlapLength << "M\n";
+                }
+            }
+        }
+    }
+
+
+
+    // Write links between Chains in different bubble chains.
+    BGL_FORALL_VERTICES(cv, graph, AssemblyGraph) {
+        // The overlap is the marker graph edge associated with this vertex.
+        const uint64_t overlapLength = assembler.markerGraph.edgeSequence[graph[cv].edgeId].size();
+
+        BGL_FORALL_INEDGES(cv, ce0, graph, AssemblyGraph) {
+            const BubbleChain& bubbleChain0 = graph[ce0];
+            const Bubble& bubble0 = bubbleChain0.back();
+            BGL_FORALL_OUTEDGES(cv, ce1, graph, AssemblyGraph) {
+                const BubbleChain& bubbleChain1 = graph[ce1];
+                const Bubble& bubble1 = bubbleChain1.front();
+
+                // Link every Chain of the last Bubble of the incoming chain
+                // to every Chain of the first Bubble of the outgoing chain.
+                for(uint64_t indexInBubble0=0; indexInBubble0<bubble0.size(); indexInBubble0++) {
+                    const string chain0StringId = chainStringId(ce0, bubbleChain0.size()-1, indexInBubble0);
+
+                    for(uint64_t indexInBubble1=0; indexInBubble1<bubble1.size(); indexInBubble1++) {
+                        const string chain1StringId = chainStringId(ce1, 0, indexInBubble1);
+
+                        gfa <<
+                            "L\t" <<
+                            chain0StringId << "\t+\t" <<
+                            chain1StringId << "\t+\t" << overlapLength << "M\n";
+                    }
+                }
+            }
+        }
+    }
+
+
+}
+
+
+
+// Write the GFA 1.0 header line to the given stream.
+void AssemblyGraph::writeGfaHeader(ostream& gfa)
+{
+    gfa << "H\tVN:Z:1.0\n";
+}
+
+
+// This version writes each chain as a segment, so it shows the
+// details of the BubbleChains.
+// Opens the output file (prefix + componentId + "-Expanded.gfa")
+// and delegates to the stream overload.
+void AssemblyGraph::writeGfaExpanded(
+    const string& fileNamePrefix,
+    bool includeSequence,
+    bool useSequenceLength) const
+{
+    ofstream gfa(fileNamePrefix + "-" + to_string(componentId) + "-Expanded.gfa");
+    writeGfaExpanded(gfa, includeSequence, useSequenceLength);
+}
+
+
+
+
+// Write the assembled sequence of every Chain in FASTA format.
+// Opens the output file (prefix + componentId + "-Expanded.fasta")
+// and delegates to the stream overload.
+void AssemblyGraph::writeFastaExpanded(const string& fileNamePrefix) const
+{
+    ofstream fasta(fileNamePrefix + "-" + to_string(componentId) + "-Expanded.fasta");
+    writeFastaExpanded(fasta);
+}
+
+
+
+// Write the assembled sequence of every Chain in FASTA format.
+// Each Chain of each Bubble of each BubbleChain becomes one FASTA record;
+// the header carries the chain string id and the sequence length.
+void AssemblyGraph::writeFastaExpanded(ostream& fasta) const
+{
+    const AssemblyGraph& cGraph = *this;
+
+
+    // Loop over BubbleChains. Each Chain of each Bubble generates a FASTA record.
+    BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+        const BubbleChain& bubbleChain = cGraph[ce];
+
+        // Loop over Bubbles of this chain.
+        for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size();
+            ++positionInBubbleChain) {
+            const Bubble& bubble = bubbleChain[positionInBubbleChain];
+
+            // Loop over chains of this bubble.
+            for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) {
+                const Chain& chain = bubble[indexInBubble];
+
+                using shasta::Base;
+                const vector<Base>& sequence = chain.sequence;
+
+                fasta << ">" << chainStringId(ce, positionInBubbleChain, indexInBubble) <<
+                    " " << sequence.size() << "\n";
+                copy(sequence.begin(), sequence.end(), ostream_iterator<Base>(fasta));
+                fasta << "\n";
+
+
+
+            }
+        }
+    }
+}
+
+
+
+// Write a numbered snapshot of the AssemblyGraph, incrementing
+// snapshotNumber so successive snapshots get distinct names.
+// NOTE(review): write(name) already calls writeGfaExpanded internally,
+// so the explicit call below appears to rewrite the same expanded GFA
+// file (assuming write's writeSequence parameter defaults to false) -
+// confirm against the declaration in the header.
+void AssemblyGraph::writeSnapshot(uint64_t& snapshotNumber) const
+{
+    const string name = to_string(snapshotNumber++);
+    write(name);
+    writeGfaExpanded(name, false, false);
+}
+
+
+
+// Construct the string id of a BubbleChain: "<componentId>-<edgeId>".
+string AssemblyGraph::bubbleChainStringId(edge_descriptor ce) const
+{
+    const AssemblyGraph& cGraph = *this;
+
+    string id = to_string(componentId);
+    id += "-";
+    id += to_string(cGraph[ce].id);
+    return id;
+}
+
+
+
+// Construct the string id of a Bubble:
+// "<componentId>-<edgeId>-<positionInBubbleChain>".
+string AssemblyGraph::bubbleStringId(
+    edge_descriptor ce,
+    uint64_t positionInBubbleChain) const
+{
+    const AssemblyGraph& cGraph = *this;
+
+    string id = to_string(componentId);
+    id += "-";
+    id += to_string(cGraph[ce].id);
+    id += "-";
+    id += to_string(positionInBubbleChain);
+    return id;
+}
+
+
+
+// Construct the string id of a Chain:
+// "<componentId>-<edgeId>-<positionInBubbleChain>-<indexInBubble>-P<pValue>",
+// where the P-value is computed by chainPValue.
+string AssemblyGraph::chainStringId(
+    edge_descriptor e,
+    uint64_t positionInBubbleChain,
+    uint64_t indexInBubble) const
+{
+    const AssemblyGraph& cGraph = *this;
+
+    // Get the P-value for the Chain.
+    const uint64_t pValue = chainPValue(e, positionInBubbleChain, indexInBubble);
+
+    string id = to_string(componentId);
+    id += "-";
+    id += to_string(cGraph[e].id);
+    id += "-";
+    id += to_string(positionInBubbleChain);
+    id += "-";
+    id += to_string(indexInBubble);
+    id += "-P";
+    id += to_string(pValue);
+    return id;
+}
+
+
+
+// This returns a "P-value" for a Chain defined as follows:
+// If the Chain is the only chain of a BubbleChain, the P-value is 0.
+// Otherwise, the P-value is the ploidy of the Bubble that the Chain belongs to.
+// The P-value is used to create the -P suffix in the name (stringId) of the Chain.
+uint64_t AssemblyGraph::chainPValue(
+    edge_descriptor e,
+    uint64_t positionInBubbleChain,
+    uint64_t indexInBubble) const
+{
+    // Locate the bubble containing the chain.
+    const AssemblyGraph& cGraph = *this;
+    const BubbleChain& bubbleChain = cGraph[e];
+    const Bubble& bubble = bubbleChain[positionInBubbleChain];
+
+    // A BubbleChain with a single haploid Bubble contains exactly one Chain.
+    const bool isOnlyChain = (bubbleChain.size() == 1) and (bubble.size() == 1);
+    if(isOnlyChain) {
+        return 0;
+    }
+
+    // Otherwise, the P-value is the ploidy of the containing Bubble.
+    return bubble.size();
+}
+
+
+
+// Get the lengths of Chains assembled sequence for each Chain P-value.
+// On return, chainLengths[pValue] contains the lengths of all
+// Chains with that pValue, sorted in decreasing order.
+// This can be used for N50 statistics.
+void AssemblyGraph::getChainLengthsByPValue(vector< vector<uint64_t> >& chainLengths) const
+{
+    const AssemblyGraph& assemblyGraph = *this;
+    chainLengths.clear();
+
+    // Loop over all BubbleChains.
+    BGL_FORALL_EDGES(e, assemblyGraph, AssemblyGraph) {
+        const BubbleChain& bubbleChain = assemblyGraph[e];
+
+        // Loop over all Bubbles in this BubbleChain.
+        for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) {
+            const Bubble& bubble = bubbleChain[positionInBubbleChain];
+            const uint64_t ploidy = bubble.size();
+
+            // Loop over all Chains in this Bubble.
+            for(uint64_t indexInBubble=0; indexInBubble<ploidy; indexInBubble++) {
+                const Chain& chain = bubble[indexInBubble];
+                const uint64_t pValue = chainPValue(e, positionInBubbleChain, indexInBubble);
+
+                // Make sure we have a vector for this pValue.
+                // Intermediate pValues that never occur end up as empty vectors.
+                if(pValue >= chainLengths.size()) {
+                    chainLengths.resize(pValue + 1);
+                }
+
+                // Store the sequence length of this chain.
+                chainLengths[pValue].push_back(chain.sequence.size());
+            }
+        }
+    }
+
+    // Sort by decreasing Chain lengths, as required by n50().
+    for(auto& v: chainLengths) {
+        sort(v.begin(), v.end(), std::greater<uint64_t>());
+    }
+}
+
+
+
+// Get the estimated lengths of all non-trivial bubble chains,
+// sorted in decreasing order (suitable for N50 computations).
+// Trivial bubble chains (simple haploid chains) are excluded.
+void AssemblyGraph::getBubbleChainLengths(vector<uint64_t>& bubbleChainLengths) const
+{
+    const AssemblyGraph& assemblyGraph = *this;
+    bubbleChainLengths.clear();
+
+    BGL_FORALL_EDGES(e, assemblyGraph, AssemblyGraph) {
+        const BubbleChain& bubbleChain = assemblyGraph[e];
+        if(bubbleChain.isSimpleChain()) {
+            continue;   // Skip trivial bubble chains.
+        }
+        bubbleChainLengths.push_back(bubbleChain.totalLength());
+    }
+
+    // Sort in decreasing order.
+    sort(bubbleChainLengths.begin(), bubbleChainLengths.end(), std::greater<uint64_t>());
+}
+
+
+
+// Return the total length of this bubble chain, estimated as the sum
+// over its Bubbles of the average sequence length of each Bubble's Chains.
+uint64_t BubbleChain::totalLength() const
+{
+    double totalEstimate = 0.;
+    for(const Bubble& bubble: *this) {
+        // Average the sequence lengths of the Chains of this Bubble.
+        uint64_t sumOfChainLengths = 0;
+        for(const Chain& chain: bubble) {
+            sumOfChainLengths += chain.sequence.size();
+        }
+        totalEstimate += double(sumOfChainLengths) / double(bubble.size());
+    }
+    return uint64_t(std::round(totalEstimate));
+}
+
+
+
+// Given a vector of lengths sorted in decreasing order,
+// compute the total length and N50.
+// Returns {totalLength, n50}; for an empty vector returns {0, 0}.
+// Asserts (after writing diagnostics to Assertion.csv) if the input
+// is not sorted as required.
+pair<uint64_t, uint64_t> AssemblyGraph::n50(const vector<uint64_t>& lengths)
+{
+    // Handle the trivial case.
+    if(lengths.empty()) {
+        return {0, 0};
+    }
+
+    // Compute the total length.
+    // The initial value must be an explicit uint64_t: the previous 0UL
+    // is a 32-bit type on LLP64 platforms (e.g. Windows), which would
+    // silently truncate the accumulated sum.
+    const uint64_t totalLength = accumulate(lengths.begin(), lengths.end(), uint64_t(0));
+
+    // Compute the N50: the first length at which the cumulative length
+    // reaches half of the total.
+    uint64_t cumulativeLength = 0;
+    for(const uint64_t length: lengths) {
+        cumulativeLength += length;
+        if(2 * cumulativeLength >= totalLength) {
+            return {totalLength, length};
+        }
+    }
+
+
+
+    // We should never get here if the input is sorted as required.
+    // Before asserting, write some diagnostics.
+    ofstream csv("Assertion.csv");
+    csv << "N," << lengths.size() << endl;
+    csv << "Total length," << totalLength << endl;
+
+    // Check that it is sorted in decreasing order.
+    if(lengths.size() > 1) {
+        for(uint64_t i1=1; i1<lengths.size(); i1++) {
+            const uint64_t i0 = i1 - 1;
+            if(lengths[i0] < lengths[i1]) {
+                csv << "Not sorted at," << i0 << "," << i1 << "," <<
+                    lengths[i0] << "," << lengths[i1] << endl;
+            }
+        }
+    }
+
+    // Write it all out.
+    for(uint64_t i=0; i<lengths.size(); i++) {
+        csv << i << "," << lengths[i] << endl;
+    }
+
+    SHASTA_ASSERT(0);
+    return {0, 0};  // Unreachable; avoids falling off the end of a non-void function.
+}
+
+
+
+
+// Estimate the base offset of a Chain by summing the estimated base
+// offsets of consecutive pairs of its MarkerGraphEdgeIds.
+// Pairs for which no estimate is available are skipped, so the result
+// is a lower bound and this function never returns invalid<uint64_t>.
+uint64_t AssemblyGraph::chainOffset(const Chain& chain) const
+{
+ const uint64_t length = chain.size();
+ SHASTA_ASSERT(length >= 2);
+
+ uint64_t offset = 0;
+ for(uint64_t i=1; i<length; i++) {
+ const MarkerGraphEdgeId edgeId0 = chain[i-1];
+ const MarkerGraphEdgeId edgeId1 = chain[i];
+
+ const uint64_t offsetThisPair = assembler.estimateBaseOffsetUnsafe(edgeId0, edgeId1);
+
+ // Skip pairs with no usable offset estimate.
+ if(offsetThisPair != invalid<uint64_t>) {
+ offset += offsetThisPair;
+ }
+ }
+ return offset;
+}
+
+
+
+// Return average coverage for the internal MarkerGraphEdgeIds of a Chain
+// (that is, excluding the first and last position of the Chain).
+// For a chain of length 2, which has no internal edges, this returns 0.
+double AssemblyGraph::primaryCoverage(const Chain& chain) const
+{
+ // No internal edges: avoid a division by zero below.
+ if(chain.size() < 3) {
+ return 0.;
+ }
+
+ // Sum marker graph edge coverage over the internal positions.
+ uint64_t sum = 0;
+ for(uint64_t positionInChain=1; positionInChain<chain.size()-1; positionInChain++) {
+ const MarkerGraphEdgeId markerGraphEdgeId = chain[positionInChain];
+ const uint64_t coverage = assembler.markerGraph.edgeCoverage(markerGraphEdgeId);
+ sum += coverage;
+ }
+
+ return double(sum) / double(chain.size() - 2);
+}
+
+
+
+// Compute the average, minimum, and maximum chainOffset
+// over the Chains of a Bubble.
+void AssemblyGraph::bubbleOffset(
+ const Bubble& bubble,
+ uint64_t& averageOffset,
+ uint64_t& minOffset,
+ uint64_t& maxOffset
+ ) const
+{
+ averageOffset = 0;
+ minOffset = std::numeric_limits<uint64_t>::max();
+ maxOffset = 0;
+
+ for(const Chain& chain: bubble) {
+ const uint64_t offset = chainOffset(chain);
+
+ // averageOffset accumulates the sum here; it is divided below.
+ averageOffset += offset;
+ minOffset = min(minOffset, offset);
+ maxOffset = max(maxOffset, offset);
+ }
+ averageOffset /= bubble.size();
+}
+
+
+
+// Same as bubbleOffset, but returns false instead of proceeding
+// when a Chain offset comes back as invalid<uint64_t>.
+// NOTE(review): chainOffset as written above skips invalid pair offsets
+// and never returns invalid<uint64_t>, so the false branch below looks
+// unreachable — confirm whether chainOffset was meant to propagate invalid.
+bool AssemblyGraph::bubbleOffsetNoException(
+ const Bubble& bubble,
+ uint64_t& averageOffset,
+ uint64_t& minOffset,
+ uint64_t& maxOffset
+ ) const
+{
+ averageOffset = 0;
+ minOffset = std::numeric_limits<uint64_t>::max();
+ maxOffset = 0;
+
+ for(const Chain& chain: bubble) {
+ const uint64_t offset = chainOffset(chain);
+ if(offset == invalid<uint64_t>) {
+ return false;
+ }
+
+ // averageOffset accumulates the sum here; it is divided below.
+ averageOffset += offset;
+ minOffset = min(minOffset, offset);
+ maxOffset = max(maxOffset, offset);
+ }
+ averageOffset /= bubble.size();
+ return true;
+}
+
+
+
+// Compute offsets for a BubbleChain by summing, over its Bubbles,
+// the per-Bubble average/min/max offsets computed by bubbleOffset.
+void AssemblyGraph::bubbleChainOffset(
+ const BubbleChain& bubbleChain,
+ uint64_t& averageOffset,
+ uint64_t& minOffset,
+ uint64_t& maxOffset
+ ) const
+{
+ averageOffset = 0;
+ minOffset = 0;
+ maxOffset = 0;
+
+ for(const Bubble& bubble: bubbleChain) {
+ uint64_t bubbleAverageOffset;
+ uint64_t bubbleMinOffset;
+ uint64_t bubbleMaxOffset;
+ bubbleOffset(bubble, bubbleAverageOffset, bubbleMinOffset, bubbleMaxOffset);
+
+ averageOffset += bubbleAverageOffset;
+ minOffset += bubbleMinOffset;
+ maxOffset += bubbleMaxOffset;
+ }
+}
+
+
+
+// Compute superbubbles as connected components of the subgraph induced
+// by edges with average offset up to maxOffset1.
+// Entrances (exits) of each superbubble are then found as vertices with
+// in-edges (out-edges) crossing the superbubble boundary.
+// Vertex numbering set up here is undone by the destructor.
+AssemblyGraph::Superbubbles::Superbubbles(
+ AssemblyGraph& cGraph,
+ uint64_t maxOffset1 // Used to define superbubbles
+ ) :
+ cGraph(cGraph)
+{
+ cGraph.numberVertices();
+ const uint64_t vertexCount = num_vertices(cGraph);
+
+ vector<uint64_t> rank(vertexCount);
+ vector<uint64_t> parent(vertexCount);
+ boost::disjoint_sets<uint64_t*, uint64_t*> disjointSets(&rank[0], &parent[0]);
+
+ // Compute connected components, using only edges with average offset up to maxOffset1.
+ for(uint64_t i=0; i<vertexCount; i++) {
+ disjointSets.make_set(i);
+ }
+ BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+ uint64_t averageOffset;
+ uint64_t minOffset;
+ uint64_t maxOffset;
+ cGraph.bubbleChainOffset(cGraph[ce], averageOffset, minOffset, maxOffset);
+ if(averageOffset <= maxOffset1) {
+ const vertex_descriptor cv0 = source(ce, cGraph);
+ const vertex_descriptor cv1 = target(ce, cGraph);
+ disjointSets.union_set(cGraph[cv0].index, cGraph[cv1].index);
+ }
+ }
+
+ // Gather the vertices in each connected component.
+ vector< vector<vertex_descriptor> > components(vertexCount);
+ BGL_FORALL_VERTICES(cv, cGraph, AssemblyGraph) {
+ const uint64_t componentId = disjointSets.find_set(cGraph[cv].index);
+ components[componentId].push_back(cv);
+ }
+
+ // The superbubbles are the components with size at least 2.
+ for(uint64_t componentId=0; componentId<components.size(); componentId++) {
+ // Bind by reference to avoid a needless deep copy of the component.
+ const vector<vertex_descriptor>& component = components[componentId];
+ if(component.size() > 1) {
+ superbubbles.emplace_back(Superbubble(component));
+ }
+ }
+
+ // Store superbubble ids in the vertices.
+ BGL_FORALL_VERTICES(cv, cGraph, AssemblyGraph) {
+ cGraph[cv].superbubbleId = invalid<uint64_t>;
+ }
+ for(uint64_t superbubbleId=0; superbubbleId<superbubbles.size(); superbubbleId++) {
+ const vector<vertex_descriptor>& superbubble = getSuperbubble(superbubbleId);
+ for(const vertex_descriptor cv: superbubble) {
+ cGraph[cv].superbubbleId = superbubbleId;
+ }
+ }
+
+
+
+ // Find entrances and exits of each superbubble.
+ for(uint64_t superbubbleId=0; superbubbleId<superbubbles.size(); superbubbleId++) {
+ Superbubble& superbubble = getSuperbubble(superbubbleId);
+
+ // Find entrances. These are superbubble vertices with in-edges
+ // from outside the superbubble.
+ for(const vertex_descriptor cv0: superbubble) {
+ BGL_FORALL_INEDGES(cv0, ce, cGraph, AssemblyGraph) {
+ const vertex_descriptor cv1 = source(ce, cGraph);
+ if(not isInSuperbubble(superbubbleId, cv1)) {
+ superbubble.entrances.push_back(cv0);
+ break;
+ }
+ }
+ }
+
+ // Find exits. These are superbubble vertices with out-edges
+ // to outside the superbubble.
+ // (An unused local vector<vertex_descriptor> exits was removed here.)
+ for(const vertex_descriptor cv0: superbubble) {
+ BGL_FORALL_OUTEDGES(cv0, ce, cGraph, AssemblyGraph) {
+ const vertex_descriptor cv1 = target(ce, cGraph);
+ if(not isInSuperbubble(superbubbleId, cv1)) {
+ superbubble.exits.push_back(cv0);
+ break;
+ }
+ }
+ }
+ }
+
+}
+
+
+
+// This uses dominator trees.
+// It only finds superbubbles with one entrance and one exit.
+// Outline:
+// - Compute forward dominator trees rooted at each zero-in-degree vertex,
+//   and backward dominator trees on the reverse graph rooted at each
+//   zero-out-degree vertex.
+// - (entrance, exit) pairs that appear in both tree sets are superbubble
+//   candidates, filtered by degree and by strong-component membership.
+AssemblyGraph::Superbubbles::Superbubbles(
+ AssemblyGraph& cGraph) :
+ cGraph(cGraph)
+{
+ const bool debug = false;
+
+ // Map vertices to integers.
+ std::map<vertex_descriptor, uint64_t> indexMap;
+ uint64_t vertexIndex = 0;
+ BGL_FORALL_VERTICES(v, cGraph, AssemblyGraph) {
+ indexMap.insert({v, vertexIndex++});
+ }
+ auto associativeIndexMap = boost::make_assoc_property_map(indexMap);
+ const uint64_t vertexCount = vertexIndex;
+
+ // Vectors used below to compute the dominator tree.
+ vector<uint64_t> dfNum(vertexCount);
+ vector<vertex_descriptor> parent(vertexCount);
+ vector<vertex_descriptor> verticesByDFNum(vertexCount);
+
+ // Tree pairs found on forward and backward dominator tree.
+ vector< pair<vertex_descriptor, vertex_descriptor> > forwardPairs;
+ vector< pair<vertex_descriptor, vertex_descriptor> > backwardPairs;
+
+
+
+ // Compute dominator trees using as entrance each of the
+ // vertices with zero in-degree.
+ BGL_FORALL_VERTICES(entrance, cGraph, AssemblyGraph) {
+ if(in_degree(entrance, cGraph) != 0) {
+ continue;
+ }
+
+ // Compute the dominator tree.
+ fill(dfNum.begin(), dfNum.end(), invalid<uint64_t>);
+ fill(parent.begin(), parent.end(), null_vertex());
+ fill(verticesByDFNum.begin(), verticesByDFNum.end(), null_vertex());
+ std::map<vertex_descriptor, vertex_descriptor> predecessorMap;
+
+ boost::lengauer_tarjan_dominator_tree(
+ cGraph,
+ entrance,
+ boost::make_assoc_property_map(indexMap),
+ boost::make_iterator_property_map(dfNum.begin(), associativeIndexMap),
+ boost::make_iterator_property_map(parent.begin(), associativeIndexMap),
+ verticesByDFNum,
+ boost::make_assoc_property_map(predecessorMap));
+
+ if(debug) {
+ cout << "Forward dominator tree with entrance at " << cGraph[entrance].edgeId << endl;
+ }
+ // Store (dominator, dominated) pairs of the forward tree.
+ for(const auto& p: predecessorMap) {
+ const vertex_descriptor cv0 = p.second;
+ const vertex_descriptor cv1 = p.first;
+ forwardPairs.push_back({cv0, cv1});
+ if(debug) {
+ cout << "F " << cGraph[cv0].edgeId << "->" << cGraph[cv1].edgeId << endl;
+ }
+ }
+ }
+
+
+
+ // Compute dominator trees on the reverse graph using as entrance each of the
+ // vertices with zero in-degree on the reverse graph
+ // (that is, zero out-degree on the AssemblyGraph).
+ using ReverseAssemblyGraph = boost::reverse_graph<AssemblyGraph>;
+ ReverseAssemblyGraph reverseGraph(cGraph);
+ BGL_FORALL_VERTICES(entrance, reverseGraph, ReverseAssemblyGraph) {
+ if(in_degree(entrance, reverseGraph) != 0) {
+ continue;
+ }
+
+ // Compute the dominator tree.
+ fill(dfNum.begin(), dfNum.end(), invalid<uint64_t>);
+ fill(parent.begin(), parent.end(), null_vertex());
+ fill(verticesByDFNum.begin(), verticesByDFNum.end(), null_vertex());
+ std::map<vertex_descriptor, vertex_descriptor> predecessorMap;
+
+ boost::lengauer_tarjan_dominator_tree(
+ reverseGraph,
+ entrance,
+ boost::make_assoc_property_map(indexMap),
+ boost::make_iterator_property_map(dfNum.begin(), associativeIndexMap),
+ boost::make_iterator_property_map(parent.begin(), associativeIndexMap),
+ verticesByDFNum,
+ boost::make_assoc_property_map(predecessorMap));
+
+ if(debug) {
+ cout << "Backward dominator tree with exit at " << cGraph[entrance].edgeId << endl;
+ }
+ // Store pairs in forward orientation (first, second reversed with
+ // respect to the forward tree) so they can be intersected below.
+ for(const auto& p: predecessorMap) {
+ const vertex_descriptor cv0 = p.first;
+ const vertex_descriptor cv1 = p.second;
+ backwardPairs.push_back({cv0, cv1});
+ if(debug) {
+ cout << "B " << cGraph[cv0].edgeId << "->" << cGraph[cv1].edgeId << endl;
+ }
+ }
+ }
+
+ // Compute strongly connected components.
+ std::map<vertex_descriptor, uint64_t> componentMap;
+ boost::strong_components(
+ cGraph,
+ boost::make_assoc_property_map(componentMap),
+ boost::vertex_index_map(boost::make_assoc_property_map(indexMap)));
+
+ // Gather the vertices in each strong component.
+ vector< vector<vertex_descriptor> > strongComponents(vertexCount);
+ for(const auto& p: componentMap) {
+ const vertex_descriptor v = p.first;
+ const uint64_t componentId = p.second;
+ SHASTA_ASSERT(componentId < vertexCount);
+ strongComponents[componentId].push_back(v);
+ }
+
+
+
+ // The pairs that appear both in forwardPairs and backwardPairs define our superbubbles.
+ // Deduplication also sorts, as required by set_intersection.
+ deduplicate(forwardPairs);
+ deduplicate(backwardPairs);
+ vector< pair<vertex_descriptor, vertex_descriptor> > bidirectionalPairs;
+ std::set_intersection(
+ forwardPairs.begin(), forwardPairs.end(),
+ backwardPairs.begin(), backwardPairs.end(),
+ back_inserter(bidirectionalPairs)
+ );
+
+ if(debug) {
+ cout << "Bidirectional pairs:" << endl;
+ for(const auto& p: bidirectionalPairs) {
+ const vertex_descriptor cv0 = p.first;
+ const vertex_descriptor cv1 = p.second;
+ cout << cGraph[cv0].edgeId << "->" << cGraph[cv1].edgeId << endl;
+ }
+ }
+
+ // Each bidirectional pair generates a superbubble if
+ // the out-degree of the entrance and
+ // the in-degree of the exit are greater than 1,
+ // unless the entrance or exit or any of the
+ // superbubble vertices are in a non-trivial strong component.
+ for(const auto& p: bidirectionalPairs) {
+ const vertex_descriptor cv0 = p.first;
+ const vertex_descriptor cv1 = p.second;
+ if(out_degree(cv0, cGraph) <= 1) {
+ continue;
+ }
+ if(in_degree(cv1, cGraph) <= 1) {
+ continue;
+ }
+ if(strongComponents[componentMap[cv0]].size() > 1) {
+ // The entrance is in a non-trivial strong component.
+ continue;
+ }
+ if(strongComponents[componentMap[cv1]].size() > 1) {
+ // The exit is in a non-trivial strong component.
+ continue;
+ }
+ superbubbles.resize(superbubbles.size() + 1);
+ Superbubble& superbubble = superbubbles.back();
+ superbubble.entrances.push_back(cv0);
+ superbubble.exits.push_back(cv1);
+ superbubble.fillInFromEntranceAndExit(cGraph);
+
+ if(debug) {
+ cout << "Tentative superbubble with entrance " << cGraph[cv0].edgeId <<
+ " exit " << cGraph[cv1].edgeId << " and " << superbubble.size() <<
+ " vertices total." << endl;
+ }
+
+ // If any vertices in the superbubble are in a non-trivial
+ // strong component, remove it.
+ for(const vertex_descriptor cv: superbubble) {
+ if(strongComponents[componentMap[cv]].size() > 1) {
+ superbubbles.pop_back();
+ if(debug) {
+ cout << "This superbubble will not be stored because some vertices are in a non-trivial strong component." << endl;
+ }
+ break;
+ }
+ }
+ }
+
+ if(debug) {
+ cout << "Superbubble entrance/exit pairs:" << endl;
+ for(const Superbubble& superbubble: superbubbles) {
+ const vertex_descriptor cv0 = superbubble.entrances.front();
+ const vertex_descriptor cv1 = superbubble.exits.front();;
+ cout << cGraph[cv0].edgeId << "->" << cGraph[cv1].edgeId << endl;
+ }
+ }
+}
+
+
+
+// Fill in the superbubble given a single entrance and exit.
+// Performs a BFS from the entrance that does not continue past the exit;
+// the superbubble then consists of the entrance, the vertices found
+// internally, and the exit, in that order.
+// NOTE(review): the BFS is only bounded by the exit, so it assumes every
+// path out of the entrance reaches the exit (i.e. the exit dominates) —
+// confirm the callers guarantee this.
+void AssemblyGraph::Superbubble::fillInFromEntranceAndExit(const AssemblyGraph& cGraph)
+{
+ SHASTA_ASSERT(empty());
+ SHASTA_ASSERT(entrances.size() == 1);
+ SHASTA_ASSERT(exits.size() == 1);
+
+ const vertex_descriptor entrance = entrances.front();
+ const vertex_descriptor exit = exits.front();
+
+ // Do a BFS starting at the entrance and stopping at the exit.
+ std::set<vertex_descriptor> internalVertices;
+ std::queue<vertex_descriptor> q;
+ q.push(entrance);
+ while(not q.empty()) {
+ const vertex_descriptor cv0 = q.front();
+ q.pop();
+ BGL_FORALL_OUTEDGES(cv0, e, cGraph, AssemblyGraph) {
+ const vertex_descriptor cv1 = target(e, cGraph);
+ if(cv1 != exit) {
+ if(not internalVertices.contains(cv1)) {
+ internalVertices.insert(cv1);
+ q.push(cv1);
+ }
+ }
+ }
+ }
+
+ // Store entrance, internal vertices, exit, in this order.
+ push_back(entrance);
+ copy(internalVertices.begin(), internalVertices.end(), back_inserter(*this));
+ push_back(exit);
+
+}
+
+
+
+// Undo the vertex numbering set up when the Superbubbles were computed.
+AssemblyGraph::Superbubbles::~Superbubbles()
+{
+ cGraph.clearVertexNumbering();
+}
+
+
+
+// Remove short superbubbles with one entry and one exit.
+// Each removed superbubble is replaced by a single edge between its
+// entrance and exit, containing a single haploid Bubble.
+// Returns true if any superbubble was removed.
+bool AssemblyGraph::removeShortSuperbubbles(
+ bool debug,
+ uint64_t maxOffset1, // Used to define superbubbles
+ uint64_t maxOffset2) // Compared against the offset between entry and exit
+{
+ AssemblyGraph& cGraph = *this;
+ bool changesWereMade = false;
+
+ // Find the superbubbles.
+ Superbubbles superbubbles(cGraph, maxOffset1);
+
+ // Loop over the superbubbles.
+ for(uint64_t superbubbleId=0; superbubbleId<superbubbles.size(); superbubbleId++) {
+ Superbubble& superbubble = superbubbles.getSuperbubble(superbubbleId);
+ SHASTA_ASSERT(superbubble.size() > 1);
+
+ if(debug) {
+ cout << "Found a superbubble with " << superbubble.size() << " vertices:";
+ for(const vertex_descriptor v: superbubble) {
+ cout << " " << cGraph[v].edgeId;
+ }
+ cout << endl;
+ }
+
+ // Skip it if it has more than one entrance or exit.
+ if(not(superbubble.entrances.size()==1 and superbubble.exits.size()==1)) {
+ if(debug) {
+ cout << "This superbubble will not be removed because it has " <<
+ superbubble.entrances.size() << " entrances and " <<
+ superbubble.exits.size() << " exits." << endl;
+ }
+ continue;
+ }
+
+ const vertex_descriptor entrance = superbubble.entrances.front();
+ const vertex_descriptor exit = superbubble.exits.front();
+ if(entrance == exit) {
+ if(debug) {
+ cout << "This superbubble will not be removed because it the entrance vertex"
+ " is the same as the exit vertex." << endl;
+ }
+ continue;
+ }
+
+ // Check the base offset between the entrance and the exit.
+ MarkerGraphEdgePairInfo info;
+ SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(cGraph[entrance].edgeId, cGraph[exit].edgeId, info));
+ if(info.common == 0) {
+ if(debug) {
+ cout << "This superbubble will not be removed because "
+ "there are no common oriented reads between the entrance and the exit." << endl;
+ }
+ continue;
+ }
+ // Only remove "short" superbubbles: entrance-to-exit offset up to maxOffset2.
+ if(info.offsetInBases > int64_t(maxOffset2)) {
+ if(debug) {
+ cout << "This superbubble will not be removed because offsetInBases is " <<
+ info.offsetInBases << endl;
+ }
+ continue;
+ }
+
+#if 1
+ // If a trivial superbubble, skip it.
+ // Trivial means:
+ // - Has two vertices of which one is the entrance and one is the exit.
+ // - There is only one edge between the two.
+ if(superbubble.size() == 2) {
+ uint64_t edgeCount = 0;
+ BGL_FORALL_OUTEDGES(entrance, e, cGraph, AssemblyGraph) {
+ if(target(e, cGraph) == exit) {
+ ++edgeCount;
+ }
+ }
+ if(edgeCount == 1) {
+ if(debug) {
+ cout << "This superbubble will not be removed because it is trivial." << endl;
+ }
+ continue;
+ }
+ }
+#endif
+ if(debug) {
+ cout << "This superbubble will be removed." << endl;
+ }
+
+ // Remove all vertices and edges internal to the superbubble.
+ for(const vertex_descriptor cv: superbubble) {
+ if(cv!=entrance and cv!=exit) {
+ boost::clear_vertex(cv, cGraph);
+ cGraph.removeVertex(cv);
+ }
+ }
+ // We must also remove edges between the entrance and the exit.
+ vector<edge_descriptor> entranceToExitEdges;
+ BGL_FORALL_OUTEDGES(entrance, ce, cGraph, AssemblyGraph) {
+ if(target(ce, cGraph) == exit) {
+ entranceToExitEdges.push_back(ce);
+ }
+ }
+ for(const edge_descriptor ce: entranceToExitEdges) {
+ boost::remove_edge(ce, cGraph);
+ }
+ // And edges in the opposite direction, from the exit to the entrance.
+ vector<edge_descriptor> exitToEntranceEdges;
+ BGL_FORALL_OUTEDGES(exit, ce, cGraph, AssemblyGraph) {
+ if(target(ce, cGraph) == entrance) {
+ exitToEntranceEdges.push_back(ce);
+ }
+ }
+ for(const edge_descriptor ce: exitToEntranceEdges) {
+ boost::remove_edge(ce, cGraph);
+ }
+
+ // Generate an edge between the entrance and the exit.
+ // This will be a BubbleChain consisting of a single haploid Bubble.
+ edge_descriptor eNew;
+ tie(eNew, ignore) = add_edge(entrance, exit, cGraph);
+ AssemblyGraphEdge& newEdge = cGraph[eNew];
+ newEdge.id = nextEdgeId++;
+ BubbleChain& bubbleChain = newEdge;
+ bubbleChain.resize(1);
+ Bubble& bubble = bubbleChain.front();
+ bubble.resize(1);
+ Chain& chain = bubble.front();
+ chain.push_back(cGraph[entrance].edgeId);
+ chain.push_back(cGraph[exit].edgeId);
+
+ changesWereMade = true;
+ }
+
+ return changesWereMade;
+}
+
+
+
+// Cleanup/simplify superbubbles that are likely to be caused by errors,
+// completely or in part.
+// This version defines superbubbles via connected components of edges
+// with average offset up to maxOffset1 (see the Superbubbles constructor),
+// then delegates each one to cleanupSuperbubble.
+void AssemblyGraph::cleanupSuperbubbles(
+ bool debug,
+ uint64_t maxOffset1, // Used to define superbubbles
+ uint64_t maxOffset2, // Compared against the offset between entry and exit
+ uint64_t chainTerminalCommonThreshold)
+{
+ AssemblyGraph& cGraph = *this;
+
+ if(debug) {
+ cout << "cleanupSuperbubbles begins." << endl;
+ }
+
+ // Find the superbubbles.
+ Superbubbles superbubbles(cGraph, maxOffset1);
+
+ // The bubbles constructed in this way are guaranteed to not overlap,
+ // so we don't have to worry about overlapping bubbles.
+ // cleanupSuperbubble still requires this set; it stays empty of overlaps.
+ std::set<vertex_descriptor> previousSuperbubblesVertices;
+
+ // Loop over the superbubbles.
+ for(uint64_t superbubbleId=0; superbubbleId<superbubbles.size(); superbubbleId++) {
+ cleanupSuperbubble(debug, superbubbles, superbubbleId,
+ maxOffset2, chainTerminalCommonThreshold, previousSuperbubblesVertices);
+ }
+ if(debug) {
+ cout << "cleanupSuperbubbles ends." << endl;
+ }
+}
+
+
+
+// This version of superbubble cleanup uses dominator trees to define superbubbles,
+// instead of computing connected components using edges of length uo tp maxOffset1.
+void AssemblyGraph::cleanupSuperbubbles(
+ bool debug,
+ uint64_t maxOffset2, // Compared against the offset between entry and exit
+ uint64_t chainTerminalCommonThreshold)
+{
+ performanceLog << timestamp << "AssemblyGraph::cleanupSuperbubbles begins." << endl;
+ AssemblyGraph& cGraph = *this;
+
+ if(debug) {
+ cout << "cleanupSuperbubbles begins." << endl;
+ }
+
+ // Find the superbubbles using dominator trees.
+ Superbubbles superbubbles(cGraph);
+
+ // The superbubbles found in this way can have overlaps.
+ // To deal with this, we process superbubbles in order of increasing size
+ // and keep track of the vertices.
+ // If a bubble contains a previously encountered vertex, don't process it.
+ // Note cleanupSuperbubble does not create any new vertices,
+ // so keeping track of the vertex descriptors that were removed is save.
+ std::set<vertex_descriptor> previousSuperbubblesVertices;
+
+ // Sort the superbubbles in order of increasing size.
+ vector< pair<uint64_t, uint64_t> > superbubbleTable; // (superbubbleId, size)
+ for(uint64_t superbubbleId=0; superbubbleId<superbubbles.size(); superbubbleId++) {
+ const Superbubble& superbubble = superbubbles.getSuperbubble(superbubbleId);
+ superbubbleTable.push_back({superbubbleId, superbubble.size()});
+ }
+ sort(superbubbleTable.begin(), superbubbleTable.end(),
+ OrderPairsBySecondOnly<uint64_t, uint64_t>());
+
+ // Loop over the superbubbles in order of increasing size.
+ for(const auto& p: superbubbleTable) {
+ const uint64_t superbubbleId = p.first;
+ cleanupSuperbubble(debug, superbubbles, superbubbleId, maxOffset2,
+ chainTerminalCommonThreshold, previousSuperbubblesVertices);
+ }
+ if(debug) {
+ cout << "cleanupSuperbubbles ends." << endl;
+ }
+ performanceLog << timestamp << "AssemblyGraph::cleanupSuperbubbles ends." << endl;
+
+}
+
+
+
+// Cleanup/simplify a superbubble that is likely to be caused by errors,
+// completely or in part.
+// This handles superbubbles caused by two marker graph bubbles with
+// no primary edges in between.
+// The superbubble is replaced by either its "entrance bubble", its
+// "exit bubble" (defined below), or a single haploid chain, depending
+// on which of the two candidate bubbles survives the checks below.
+// previousSuperbubblesVertices accumulates processed vertices so that
+// overlapping superbubbles (possible with the dominator-tree version)
+// are processed at most once.
+void AssemblyGraph::cleanupSuperbubble(
+ bool debug,
+ const Superbubbles& superbubbles,
+ uint64_t superbubbleId,
+ uint64_t maxOffset2, // Compared against the offset between entry and exit
+ uint64_t chainTerminalCommonThreshold,
+ std::set<vertex_descriptor>& previousSuperbubblesVertices)
+{
+ AssemblyGraph& cGraph = *this;
+ const Superbubble& superbubble = superbubbles.getSuperbubble(superbubbleId);
+
+#if 0
+ // Debugging aid: restrict debug output to specific superbubbles.
+ debug = (superbubble.entrances.size() == 1 and
+ (cGraph[superbubble.entrances.front()].edgeId == 16093908 or
+ cGraph[superbubble.entrances.front()].edgeId == 9555933));
+#endif
+
+ if(debug) {
+ cout << "Working on a superbubble with " << superbubble.size() << " vertices:";
+ for(const vertex_descriptor v: superbubble) {
+ cout << " " << cGraph[v].edgeId;
+ }
+ cout << endl;
+ }
+
+ // See if it overlaps any vertices of previous superbubbles.
+ bool overlaps = false;
+ for(const vertex_descriptor v: superbubble) {
+ if(previousSuperbubblesVertices.contains(v)) {
+ if(debug) {
+ cout << "This superbubble ignored because it contains vertex " << cGraph[v].edgeId <<
+ " which is in a previously processed superbubble." << endl;
+ }
+ overlaps = true;
+ break;
+ }
+ }
+ // Mark these vertices as seen even if we return early below.
+ for(const vertex_descriptor v: superbubble) {
+ previousSuperbubblesVertices.insert(v);
+ }
+ if(overlaps) {
+ return;
+ }
+
+ // Skip it if it has more than one entrance or exit.
+ if(not(superbubble.entrances.size()==1 and superbubble.exits.size()==1)) {
+ if(debug) {
+ cout << "This superbubble will be skipped because it has " <<
+ superbubble.entrances.size() << " entrances and " <<
+ superbubble.exits.size() << " exits." << endl;
+ }
+ return;
+ }
+
+ const vertex_descriptor entrance = superbubble.entrances.front();
+ const vertex_descriptor exit = superbubble.exits.front();
+ if(debug) {
+ cout << "Entrance " << cGraph[entrance].edgeId << endl;
+ cout << "Exit " << cGraph[exit].edgeId << endl;
+ }
+
+ if(entrance == exit) {
+ if(debug) {
+ cout << "This superbubble will be skipped because the entrance vertex"
+ " is the same as the exit vertex." << endl;
+ }
+ return;
+ }
+
+
+
+ // Check the base offset between the entrance and the exit.
+ MarkerGraphEdgePairInfo info;
+ SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(cGraph[entrance].edgeId, cGraph[exit].edgeId, info));
+ if(info.common == 0) {
+ if(debug) {
+ cout << "This superbubble will be skipped because "
+ "there are no common oriented reads between the entrance and the exit." << endl;
+ }
+ return;
+ }
+ if(info.offsetInBases > int64_t(maxOffset2)) {
+ if(debug) {
+ cout << "This superbubble will be skipped because offsetInBases is " <<
+ info.offsetInBases << endl;
+ }
+ return;
+ }
+
+ // If a trivial superbubble, skip it.
+ // Trivial means:
+ // - Has two vertices of which one is the entrance and one is the exit.
+ // - There is only one edge between the two.
+ if(superbubble.size() == 2) {
+ uint64_t edgeCount = 0;
+ BGL_FORALL_OUTEDGES(entrance, e, cGraph, AssemblyGraph) {
+ if(target(e, cGraph) == exit) {
+ ++edgeCount;
+ }
+ }
+ if(edgeCount == 1) {
+ if(debug) {
+ cout << "This superbubble be skipped because it is trivial." << endl;
+ }
+ return;
+ }
+ }
+
+ // Find the out-edges of the entrance that go inside the superbubble.
+ vector<edge_descriptor> entranceOutEdges;
+ BGL_FORALL_OUTEDGES(entrance, ce, cGraph, AssemblyGraph) {
+ const vertex_descriptor cv = target(ce, cGraph);
+ if(superbubbles.isInSuperbubble(superbubbleId, cv)) {
+ entranceOutEdges.push_back(ce);
+ }
+ }
+ // Sorted, as required by set_intersection below.
+ sort(entranceOutEdges.begin(), entranceOutEdges.end());
+
+ // Find the in-edges of the exit that come from inside the superbubble.
+ vector<edge_descriptor> exitInEdges;
+ BGL_FORALL_INEDGES(exit, ce, cGraph, AssemblyGraph) {
+ const vertex_descriptor cv = source(ce, cGraph);
+ if(superbubbles.isInSuperbubble(superbubbleId, cv)) {
+ exitInEdges.push_back(ce);
+ }
+ }
+ sort(exitInEdges.begin(), exitInEdges.end());
+
+ if(debug) {
+ cout << "Entrance out-edges to inside the superbubble:";
+ for(const edge_descriptor ce: entranceOutEdges) {
+ cout << " " << bubbleChainStringId(ce);
+ }
+ cout << endl;
+ cout << "Exit in-edges from inside the superbubble:";
+ for(const edge_descriptor ce: exitInEdges) {
+ cout << " " << bubbleChainStringId(ce);
+ }
+ cout << endl;
+ }
+
+ // If there are common edges between the entranceOutEdges and exitInEdges,
+ // skip this superbubble.
+ {
+ vector<edge_descriptor> commonEdges;
+ std::set_intersection(
+ entranceOutEdges.begin(), entranceOutEdges.end(),
+ exitInEdges.begin(), exitInEdges.end(),
+ back_inserter(commonEdges));
+
+ if(not commonEdges.empty()) {
+ if(debug) {
+ cout << "This superbubble will be skipped because there are " <<
+ commonEdges.size() << " common edges between the out-edges of the entrance "
+ "and the in-edges of the exit." << endl;
+ }
+ return;
+ }
+ }
+
+
+ // We will consider replacing this superbubble with either its "entrance bubble"
+ // or its "exit bubble":
+ // - The "entrance bubble" is obtained by removing all edges
+ // except for the out-edges of the entrance, and joining them directly with the exit.
+ // - The "exit bubble" is obtained by removing all edges
+ // except for the in-edges of the exit, and joining the entry directly with them.
+
+
+
+ // If there are exactly two entranceOutEdges, construct the entrance bubble.
+ // This can only be done if the two entranceOutEdges consist of simple chains.
+ Bubble entranceBubble;
+ if(entranceOutEdges.size() == 2) {
+
+ // See if the two entranceOutEdges consist of simple chains.
+ bool canDo = true;
+ for(const edge_descriptor ce: entranceOutEdges) {
+ if(not cGraph[ce].isSimpleChain()) {
+ canDo = false;
+ break;
+ }
+ }
+
+ // Only continue creating the entranceBubble if both entranceOutEdges
+ // consist of single chains.
+ if(canDo) {
+
+ // Construct the two chains of the entranceBubble and assemble their sequence.
+ entranceBubble.resize(2);
+ ofstream noCsv;
+ for(uint64_t i=0; i<2; i++) {
+ const edge_descriptor entranceOutEdge = entranceOutEdges[i];
+ Chain& chain = entranceBubble[i];
+ chain = cGraph[entranceOutEdge].getOnlyChain();
+ // Join the chain directly with the exit.
+ chain.push_back(cGraph[exit].edgeId);
+ assembleChain(chain, chainTerminalCommonThreshold);
+ }
+
+ if(debug) {
+ cout << "Entrance bubble:" << endl;
+ for(uint64_t i=0; i<2; i++) {
+ const Chain& chain = entranceBubble[i];
+ cout << "Entrance bubble chain " << i << ":";
+ for (const MarkerGraphEdgeId edgeId: chain) {
+ cout << " " << edgeId;
+ }
+ cout << endl;
+ }
+ for(uint64_t i=0; i<2; i++) {
+ const Chain& chain = entranceBubble[i];
+ cout << ">Entrance-" << i << " " << chain.sequence.size() << "\n";
+ copy(chain.sequence.begin(), chain.sequence.end(), ostream_iterator<shasta::Base>(cout));
+ cout << "\n";
+ }
+ }
+
+ // If the sequences differ just by a copy number of short periodicity,
+ // the entrance bubble is probably caused by errors and so we don't want to use it.
+ const uint64_t period = isCopyNumberDifference(entranceBubble[0].sequence, entranceBubble[1].sequence, 4);
+ if(debug) {
+ cout << "Period " << period << "\n";
+ }
+ if(period != 0) {
+ entranceBubble.clear();
+ }
+ }
+ }
+
+
+
+ // If there are exactly two exitEdges, construct the exit bubble.
+ // This can only be done if the two exitInEdges consist of simple chains.
+ Bubble exitBubble;
+ if(exitInEdges.size() == 2) {
+
+ // See if the two exitInEdges consist of simple chains.
+ bool canDo = true;
+ for(const edge_descriptor ce: exitInEdges) {
+ if(not cGraph[ce].isSimpleChain()) {
+ canDo = false;
+ break;
+ }
+ }
+
+ // Only continue creating the exitBubble if both exitInEdges
+ // consist of single chains.
+ if(canDo) {
+
+ // Construct the two chains of the exitBubble and assemble their sequence.
+ exitBubble.resize(2);
+ ofstream noCsv;
+ for(uint64_t i=0; i<2; i++) {
+ const edge_descriptor exitInEdge = exitInEdges[i];
+ Chain& chain = exitBubble[i];
+ // Join the entrance directly with the chain.
+ chain.push_back(cGraph[entrance].edgeId);
+ const Chain& exitChain = cGraph[exitInEdge].getOnlyChain();
+ copy(exitChain.begin(), exitChain.end(), back_inserter(chain));
+ assembleChain(chain, chainTerminalCommonThreshold);
+ }
+
+ if(debug) {
+ cout << "Exit bubble:" << endl;
+ for(uint64_t i=0; i<2; i++) {
+ const Chain& chain = exitBubble[i];
+ cout << "Exit bubble chain " << i << ":";
+ for (const MarkerGraphEdgeId edgeId: chain) {
+ cout << " " << edgeId;
+ }
+ cout << endl;
+ }
+ for(uint64_t i=0; i<2; i++) {
+ const Chain& chain = exitBubble[i];
+ cout << ">Exit-" << i << " " << chain.sequence.size() << "\n";
+ copy(chain.sequence.begin(), chain.sequence.end(), ostream_iterator<shasta::Base>(cout));
+ cout << "\n";
+ }
+ }
+
+ // If the sequences differ just by a copy number of short periodicity,
+ // the exit bubble is probably caused by errors and so we don't want to use it.
+ const uint64_t period = isCopyNumberDifference(exitBubble[0].sequence, exitBubble[1].sequence, 4);
+ if(debug) {
+ cout << "Period " << period << "\n";
+ }
+ if(period != 0) {
+ exitBubble.clear();
+ }
+ }
+ }
+
+
+ // Handle the case where both the entrance and the exit bubble look usable.
+ if(entranceBubble.size() == 2 and exitBubble.size() == 2) {
+
+ // If the entrance and exit bubbles have the same assembled sequences, we can just keep one of them.
+ const auto& entrance0 = entranceBubble[0].sequence;
+ const auto& entrance1 = entranceBubble[1].sequence;
+ const auto& exit0 = exitBubble[0].sequence;
+ const auto& exit1 = exitBubble[1].sequence;
+ if(
+ (entrance0 == exit0 and entrance1 == exit1)
+ or
+ (entrance0 == exit1 and entrance1 == exit0)) {
+ if(debug) {
+ cout << "The entrance and exit bubbles are equivalent." << endl;
+ cout << "Keeping only the entrance bubble." << endl;
+ }
+ exitBubble.clear();
+ } else {
+
+ // In other cases it is difficult to pick which bubble is best to keep,
+ // so we remove both of them.
+ // This is no worse than letting removeShortBubbles remove it.
+ // The sequence assembly process will still pick the best sequence
+ // for each haplotype, but these bubbles are excluded from the
+ // phasing/detangling process.
+ entranceBubble.clear();
+ exitBubble.clear();
+
+ if(debug) {
+ cout << "Both the entrance and the exit bubble are usable but both will be removed." << endl;
+ }
+
+ }
+ }
+
+
+
+ // Figure out which ones of the entrance/exit bubbles is usable.
+ // After the block above, at most one of the two can still have size 2.
+ SHASTA_ASSERT(entranceBubble.size() == 0 or entranceBubble.size() == 2);
+ SHASTA_ASSERT(exitBubble.size() == 0 or exitBubble.size() == 2);
+ const bool entranceBubbleIsGood = (entranceBubble.size() == 2);
+ const bool exitBubbleIsGood = (exitBubble.size() == 2);
+
+
+ if(entranceBubbleIsGood) {
+ if(exitBubbleIsGood) {
+ if(debug) {
+ cout << "Both the entrance bubble and the exit bubble are good." << endl;
+ }
+ // Unreachable: this case was resolved above by clearing one or both.
+ SHASTA_ASSERT(0);
+ } else {
+ if(debug) {
+ cout << "Only the entrance bubble is good." << endl;
+ }
+ }
+ } else {
+ if(exitBubbleIsGood) {
+ if(debug) {
+ cout << "Only the exit bubble is good." << endl;
+ }
+ } else {
+ if(debug) {
+ cout << "Neither the entrance bubble nor the exit bubble are good." << endl;
+ }
+ }
+ }
+
+
+ // Remove all vertices and edges internal to the superbubble.
+ for(const vertex_descriptor cv: superbubble) {
+ if(cv != entrance and cv != exit) {
+ clear_vertex(cv, cGraph);
+ remove_vertex(cv, cGraph);
+ }
+ }
+
+ // Create the new edge and bubble chain between the entrance and the exit that will replace
+ // the superbubble.
+ edge_descriptor ce;
+ tie(ce, ignore) = add_edge(entrance, exit, cGraph);
+ AssemblyGraphEdge& edge = cGraph[ce];
+ edge.id = nextEdgeId++;
+ BubbleChain& bubbleChain = edge;
+ SHASTA_ASSERT(not (entranceBubbleIsGood and exitBubbleIsGood));
+ if(entranceBubbleIsGood or exitBubbleIsGood) {
+ // Keep whichever candidate bubble survived the checks above.
+ const Bubble& newBubble = entranceBubbleIsGood ? entranceBubble : exitBubble;
+ SHASTA_ASSERT(newBubble.size() == 2);
+ bubbleChain.push_back(newBubble);
+ } else {
+ // Neither bubble is usable: replace the superbubble with a
+ // single haploid chain joining the entrance and the exit.
+ Chain newChain;
+ newChain.push_back(cGraph[entrance].edgeId);
+ newChain.push_back(cGraph[exit].edgeId);
+ Bubble newBubble;
+ newBubble.push_back(newChain);
+ bubbleChain.push_back(newBubble);
+ }
+
+}
+
+
+
+#if 0
+// Disabled driver for strict vertex detangling (detangleVertexStrict),
+// kept for reference only. Returns true if at least one vertex was detangled.
+bool AssemblyGraph::detangleVerticesStrict(bool debug)
+{
+    if(debug) {
+        cout << "Detangling vertices." << endl;
+    }
+    AssemblyGraph& cGraph = *this;
+
+    // Detangling removes vertices, so gather all vertex descriptors first
+    // instead of iterating over the graph while modifying it.
+    vector<vertex_descriptor> allVertices;
+    BGL_FORALL_VERTICES(cv, cGraph, AssemblyGraph) {
+        allVertices.push_back(cv);
+    }
+
+    uint64_t detangledCount = 0;
+    for(const vertex_descriptor cv: allVertices) {
+        if(detangleVertexStrict(cv, debug)) {
+            ++detangledCount;
+        }
+    }
+
+    if(debug) {
+        cout << "Detangled " << detangledCount << " vertices." << endl;
+
+    }
+
+    return detangledCount > 0;
+}
+#endif
+
+
+
+// Attempt to detangle every vertex of the assembly graph using detangleVertex.
+// The tolerance/Bayesian parameters are forwarded unchanged to detangleVertex.
+// Returns true if at least one vertex was detangled.
+bool AssemblyGraph::detangleVertices(
+    bool debug,
+    uint64_t detangleToleranceLow,
+    uint64_t detangleToleranceHigh,
+    bool useBayesianModel,
+    double epsilon,
+    double minLogP)
+{
+    AssemblyGraph& cGraph = *this;
+
+    if(debug) {
+        cout << "Detangling vertices." << endl;
+    }
+
+    // Snapshot the vertices up front: detangling removes vertices,
+    // so we cannot iterate over the graph while modifying it.
+    vector<vertex_descriptor> vertexList;
+    BGL_FORALL_VERTICES(v, cGraph, AssemblyGraph) {
+        vertexList.push_back(v);
+    }
+
+    // Attempt to detangle each vertex, counting the successes.
+    uint64_t successCount = 0;
+    for(const vertex_descriptor v: vertexList) {
+        const bool wasDetangled = detangleVertex(
+            v, debug, detangleToleranceLow, detangleToleranceHigh,
+            useBayesianModel, epsilon, minLogP);
+        if(wasDetangled) {
+            successCount++;
+        }
+    }
+
+    if(debug) {
+        cout << "Detangled " << successCount << " vertices." << endl;
+    }
+
+    return successCount > 0;
+}
+
+
+
+// Attempt to detangle every vertex of the assembly graph using
+// detangleVertexGeneral, which can also handle non-haploid bubbles
+// adjacent to the vertex. Returns true if at least one vertex was detangled.
+bool AssemblyGraph::detangleVerticesGeneral(
+    bool debug,
+    uint64_t detangleToleranceLow,
+    uint64_t detangleToleranceHigh,
+    bool useBayesianModel,
+    double epsilon,
+    double minLogP)
+{
+    AssemblyGraph& cGraph = *this;
+
+    if(debug) {
+        cout << "Detangling vertices (general detangling)." << endl;
+    }
+
+    // Snapshot the vertices up front: detangling removes vertices,
+    // so we cannot iterate over the graph while modifying it.
+    vector<vertex_descriptor> vertexList;
+    BGL_FORALL_VERTICES(v, cGraph, AssemblyGraph) {
+        vertexList.push_back(v);
+    }
+
+    // Attempt general detangling on each vertex, counting the successes.
+    uint64_t successCount = 0;
+    for(const vertex_descriptor v: vertexList) {
+        const bool wasDetangled = detangleVertexGeneral(
+            v, debug, detangleToleranceLow, detangleToleranceHigh,
+            useBayesianModel, epsilon, minLogP);
+        if(wasDetangled) {
+            successCount++;
+        }
+    }
+
+    if(debug) {
+        cout << "Detangled " << successCount << " vertices." << endl;
+    }
+
+    return successCount > 0;
+}
+
+
+// Compute the tangle matrix given in-edges and out-edges.
+// The last bubble of each in-edge and the first bubble
+// of each out-edge must be haploid (asserted below).
+// tangleMatrix[i0][i1] is the info.common value computed by
+// Assembler::analyzeMarkerGraphEdgePair for the pair consisting of the
+// next-to-last MarkerGraphEdgeId of in-edge i0 and the second
+// MarkerGraphEdgeId of out-edge i1.
+// If setToZeroForComplementaryPairs is true, entries whose two marker graph
+// edges are reverse complements of each other are set to zero instead.
+void AssemblyGraph::computeTangleMatrix(
+    const vector<edge_descriptor>& inEdges,
+    const vector<edge_descriptor>& outEdges,
+    vector< vector<uint64_t> >& tangleMatrix,
+    bool setToZeroForComplementaryPairs
+    ) const
+{
+    const AssemblyGraph& cGraph = *this;
+
+    // Size the tangle matrix as inEdges.size() by outEdges.size().
+    tangleMatrix.clear();
+    tangleMatrix.resize(inEdges.size(), vector<uint64_t>(outEdges.size()));
+
+    for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+        const edge_descriptor ce0 = inEdges[i0];
+        const BubbleChain& bubbleChain0 = cGraph[ce0];
+        const Bubble& bubble0 = bubbleChain0.lastBubble();
+        SHASTA_ASSERT(bubble0.isHaploid());
+        const Chain& chain0 = bubble0.front();
+        SHASTA_ASSERT(chain0.size() >= 2);
+        const MarkerGraphEdgeId markerGraphEdgeId0 = chain0[chain0.size() - 2];  // Exclude last
+
+        for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+            const edge_descriptor ce1 = outEdges[i1];
+            const BubbleChain& bubbleChain1 = cGraph[ce1];
+            const Bubble& bubble1 = bubbleChain1.firstBubble();
+            SHASTA_ASSERT(bubble1.isHaploid());
+            const Chain& chain1 = bubble1.front();
+            SHASTA_ASSERT(chain1.size() >= 2);
+            const MarkerGraphEdgeId markerGraphEdgeId1 = chain1[1];  // Exclude first
+
+            if(setToZeroForComplementaryPairs and
+                assembler.markerGraph.reverseComplementEdge[markerGraphEdgeId0] == markerGraphEdgeId1) {
+                tangleMatrix[i0][i1] = 0;
+            } else {
+                MarkerGraphEdgePairInfo info;
+                SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(markerGraphEdgeId0, markerGraphEdgeId1, info));
+                tangleMatrix[i0][i1] = info.common;
+            }
+        }
+    }
+}
+
+
+
+#if 0
+// Disabled earlier implementation, kept for reference only.
+// This works if the following is true:
+// - For all incoming edges (bubble chains) of cv, the last bubble is haploid.
+// - For all outgoing edges (bubble chains) of cv, the first bubble is haploid.
+bool AssemblyGraph::detangleVertexStrict(
+    vertex_descriptor cv, bool debug)
+{
+    AssemblyGraph& cGraph = *this;
+
+    // Gather the in-edges and check that the last bubble is haploid.
+    vector<edge_descriptor> inEdges;
+    BGL_FORALL_INEDGES(cv, ce, cGraph, AssemblyGraph) {
+        const BubbleChain& bubbleChain = cGraph[ce];
+        if(not bubbleChain.lastBubble().isHaploid()) {
+            return false;
+        }
+        inEdges.push_back(ce);
+    }
+
+    // Gather the out-edges and check that the first bubble is haploid.
+    vector<edge_descriptor> outEdges;
+    BGL_FORALL_OUTEDGES(cv, ce, cGraph, AssemblyGraph) {
+        const BubbleChain& bubbleChain = cGraph[ce];
+        if(not bubbleChain.firstBubble().isHaploid()) {
+            return false;
+        }
+        outEdges.push_back(ce);
+    }
+
+    // A 1 by 1 "tangle" is just a linear connection; nothing to detangle.
+    if(inEdges.size() == 1 and outEdges.size() == 1) {
+        return false;
+    }
+
+    // Compute the tangle matrix.
+    vector< vector<uint64_t> > tangleMatrix;
+    computeTangleMatrix(inEdges, outEdges, tangleMatrix, false);
+
+    if(debug) {
+        cout << "Tangle matrix for vertex " << cGraph[cv].edgeId << endl;
+        for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+            for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+                cout << bubbleChainStringId(inEdges[i0]) << " " <<
+                    bubbleChainStringId(outEdges[i1]) << " " <<
+                    tangleMatrix[i0][i1] << endl;
+            }
+        }
+    }
+
+    // If the tangle matrix contains no zeros, there is nothing to do.
+    bool foundZero = false;
+    for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+        for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+            if(tangleMatrix[i0][i1] == 0) {
+                foundZero = true;
+                break;
+            }
+        }
+        if(foundZero) {
+            break;
+        }
+    }
+    if(not foundZero) {
+        return false;
+    }
+
+    // To avoid breaking contiguity, we require each column and each row of the
+    // tangle matrix to have at least one non-zero element.
+    // This means that each in-edge will be "merged" with at least one out-edge,
+    // and each out-edge will be "merged" with at least one in-edge.
+    for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+        bool foundNonZero = false;
+        for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+            if(tangleMatrix[i0][i1] != 0) {
+                foundNonZero = true;
+                break;
+            }
+        }
+        if(not foundNonZero) {
+            return false;
+        }
+    }
+    for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+        bool foundNonZero = false;
+        for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+            if(tangleMatrix[i0][i1] != 0) {
+                foundNonZero = true;
+                break;
+            }
+        }
+        if(not foundNonZero) {
+            return false;
+        }
+    }
+
+    if(debug) {
+        cout << "This vertex will be detangled " << inEdges.size() << " by " << outEdges.size() << endl;
+    }
+
+
+
+    // Each non-zero element of the tangle matrix generates a new edge,
+    // obtained by "merging" an in-edge with an out-edge.
+    for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+        const edge_descriptor ce0 = inEdges[i0];
+        const BubbleChain& bubbleChain0 = cGraph[ce0];
+        const Bubble& bubble0 = bubbleChain0.lastBubble();
+        SHASTA_ASSERT(bubble0.isHaploid());
+        const Chain& chain0 = bubble0.front();
+        SHASTA_ASSERT(chain0.size() >= 2);
+        for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+            if(tangleMatrix[i0][i1] == 0) {
+                continue;
+            }
+            const edge_descriptor ce1 = outEdges[i1];
+            const BubbleChain& bubbleChain1 = cGraph[ce1];
+            const Bubble& bubble1 = bubbleChain1.firstBubble();
+            SHASTA_ASSERT(bubble1.isHaploid());
+            const Chain& chain1 = bubble1.front();
+            SHASTA_ASSERT(chain1.size() >= 2);
+
+            // NOTE(review): "graph" below looks like a typo for "cGraph".
+            // This code is disabled (#if 0) and would not compile as written.
+            edge_descriptor eNew;
+            tie(eNew, ignore) = add_edge(source(ce0, cGraph), target(ce1, graph), cGraph);
+            AssemblyGraphEdge& newEdge = cGraph[eNew];
+            newEdge.id = nextEdgeId++;
+            BubbleChain& newBubbleChain = newEdge;
+
+            if(debug) {
+                cout << "Merging " <<
+                    bubbleChainStringId(ce0) << " " <<
+                    bubbleChainStringId(ce1) << " into " <<
+                    bubbleChainStringId(eNew) << endl;
+            }
+
+            // Create the new BubbleChain. It is obtained by joining
+            // bubbleChain0 and bubbleChain1, with vertex cv
+            // removed from the end of bubbleChain0
+            // and from the beginning of bubbleChain1.
+            // Here we use the above assumption that
+            // the last bubble of bubbleChain0 and the first bubble of bubbleChain1
+            // are haploid.
+            newBubbleChain = bubbleChain0;
+
+            // Remove cv from the end.
+            Bubble& newBubbleLast = newBubbleChain.back();
+            SHASTA_ASSERT(newBubbleLast.size() == 1);
+            Chain& newChainLast = newBubbleLast.front();
+            SHASTA_ASSERT(newChainLast.back() == cGraph[cv].edgeId);
+            newChainLast.resize(newChainLast.size() - 1);
+
+            // Append chain1, except for cv.
+            SHASTA_ASSERT(chain1.front() == cGraph[cv].edgeId);
+            copy(chain1.begin() + 1, chain1.end(), back_inserter(newChainLast));
+
+            // Append the rest of bubbleChain1.
+            copy(bubbleChain1.begin() + 1, bubbleChain1.end(), back_inserter(newBubbleChain));
+        }
+
+    }
+
+    // Now we can remove cv and all of its in-edges and out-edges.
+    clear_vertex(cv, cGraph);
+    cGraph.removeVertex(cv);
+
+    return true;
+}
+#endif
+
+
+
+// Detangle a single vertex cv using the tangle matrix between its in-edges
+// and out-edges.
+// Requires the last bubble of every in-edge and the first bubble of every
+// out-edge to be haploid; otherwise returns false without changes.
+// If useBayesianModel is true and the tangle is 2 by 2, the diploid Bayesian
+// phasing model (epsilon, minLogP) decides between in-phase and out-of-phase
+// connections; otherwise tangle matrix elements are classified against
+// detangleToleranceLow/detangleToleranceHigh.
+// Returns true if the vertex was detangled, in which case cv and all of its
+// in-edges and out-edges have been removed and replaced by new connections.
+bool AssemblyGraph::detangleVertex(
+    vertex_descriptor cv,
+    bool debug,
+    uint64_t detangleToleranceLow,
+    uint64_t detangleToleranceHigh,
+    bool useBayesianModel,
+    double epsilon,
+    double minLogP)
+{
+    AssemblyGraph& cGraph = *this;
+
+    if(debug) {
+        cout << "Attempting to detangle vertex " << cGraph[cv].edgeId << endl;
+    }
+
+
+    // Gather the in-edges and check that the last bubble is haploid.
+    vector<edge_descriptor> inEdges;
+    BGL_FORALL_INEDGES(cv, ce, cGraph, AssemblyGraph) {
+        const BubbleChain& bubbleChain = cGraph[ce];
+        if(not bubbleChain.lastBubble().isHaploid()) {
+            if(debug) {
+                cout << "Not detangled because the last bubble of in-edge " <<
+                    bubbleChainStringId(ce) << " is not haploid." << endl;
+            }
+            return false;
+        }
+        inEdges.push_back(ce);
+    }
+
+    // Gather the out-edges and check that the first bubble is haploid.
+    vector<edge_descriptor> outEdges;
+    BGL_FORALL_OUTEDGES(cv, ce, cGraph, AssemblyGraph) {
+        const BubbleChain& bubbleChain = cGraph[ce];
+        if(not bubbleChain.firstBubble().isHaploid()) {
+            if(debug) {
+                cout << "Not detangled because the first bubble of out-edge " <<
+                    bubbleChainStringId(ce) << " is not haploid." << endl;
+            }
+            return false;
+        }
+        outEdges.push_back(ce);
+    }
+
+    // A vertex with no in-edges or no out-edges cannot be detangled.
+    if(inEdges.size() == 0 or outEdges.size() == 0) {
+        if(debug) {
+            cout << "Not detangling due to degree (case 1)." << endl;
+        }
+        return false;
+    }
+    // A 1 by 1 tangle is just a linear connection and is not detangled here.
+    if(inEdges.size() < 2 and outEdges.size() < 2) {
+        if(debug) {
+            cout << "Not detangling due to degree (case 2)." << endl;
+        }
+        return false;
+    }
+
+
+
+    // If a MarkerGraphEdgeId appears both in the inEdges and in the outEdges,
+    // detangling could generate a chain with two consecutive copies of the same
+    // MarkerGraphEdgeId. Don't detangle.
+    for(const edge_descriptor ce0: inEdges) {
+        const BubbleChain& bubbleChain0 = cGraph[ce0];
+        const Bubble& bubble0 = bubbleChain0.lastBubble();
+        SHASTA_ASSERT(bubble0.isHaploid());
+        const Chain& chain0 = bubble0.front();
+        SHASTA_ASSERT(chain0.size() >= 2);
+        const MarkerGraphEdgeId markerGraphEdgeId0 = chain0[chain0.size() - 2];  // Exclude last
+
+        for(const edge_descriptor ce1: outEdges) {
+            const BubbleChain& bubbleChain1 = cGraph[ce1];
+            const Bubble& bubble1 = bubbleChain1.firstBubble();
+            SHASTA_ASSERT(bubble1.isHaploid());
+            const Chain& chain1 = bubble1.front();
+            SHASTA_ASSERT(chain1.size() >= 2);
+            const MarkerGraphEdgeId markerGraphEdgeId1 = chain1[1];  // Exclude first
+
+            if(markerGraphEdgeId0 == markerGraphEdgeId1) {
+                if(debug) {
+                    cout << "Not detangling due to cycle." << endl;
+                }
+                return false;
+            }
+        }
+    }
+
+
+
+    // Compute the tangle matrix.
+    vector< vector<uint64_t> > tangleMatrix;
+    computeTangleMatrix(inEdges, outEdges, tangleMatrix, false);
+
+    if(debug) {
+        cout << "Tangle matrix for vertex " << cGraph[cv].edgeId << endl;
+
+        cout << "In-edges: ";
+        for(const edge_descriptor ce: inEdges) {
+            cout << " " << bubbleChainStringId(ce);
+        }
+        cout << endl;
+
+        cout << "Out-edges: ";
+        for(const edge_descriptor ce: outEdges) {
+            cout << " " << bubbleChainStringId(ce);
+        }
+        cout << endl;
+
+        for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+            for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+                cout << bubbleChainStringId(inEdges[i0]) << " " <<
+                    bubbleChainStringId(outEdges[i1]) << " " <<
+                    tangleMatrix[i0][i1] << endl;
+            }
+        }
+    }
+
+
+
+    // Do the detangling based on the tangle matrix.
+    if(useBayesianModel and inEdges.size() == 2 and outEdges.size() == 2) {
+
+        // Use the 2 by 2 Bayesian model for detangling.
+        array< array<uint64_t, 2>, 2> tangleMatrix22;
+        for(uint64_t i=0; i<2; i++) {
+            for(uint64_t j=0; j<2; j++) {
+                tangleMatrix22[i][j] = tangleMatrix[i][j];
+            }
+        }
+
+        // Compute logarithmic probability ratio of in-phase and out-of-phase
+        // against random.
+        double logPin;
+        double logPout;
+        tie(logPin, logPout) = diploidBayesianPhase(tangleMatrix22, epsilon);
+        if(debug) {
+            cout << "logPin = " << logPin << ", logPout = " << logPout << endl;
+        }
+
+        // const bool isInPhase = (logPin >= minLogP) and ((logPin - logPout) >= minLogP);
+        // const bool isOutOfPhase = (logPout >= minLogP) and ((logPout - logPin) >= minLogP);
+        // Ignore the random hypothesis: only the in-phase/out-of-phase
+        // likelihood ratio is tested against minLogP here.
+        const bool isInPhase = (logPin - logPout) >= minLogP;
+        const bool isOutOfPhase = (logPout - logPin) >= minLogP;
+
+        if(isInPhase or isOutOfPhase) {
+
+            // We can detangle.
+            if(debug) {
+                cout << "This vertex will be detangled." << endl;
+            }
+
+            // Create truncated versions of the inEdges and outEdges.
+            vector<vertex_descriptor> inVertices;
+            for(const edge_descriptor ce: inEdges) {
+                inVertices.push_back(cloneAndTruncateAtEnd(ce));
+            }
+            vector<vertex_descriptor> outVertices;
+            for(const edge_descriptor ce: outEdges) {
+                outVertices.push_back(cloneAndTruncateAtBeginning(ce));
+            }
+
+            // Connect the truncated copies according to the chosen phasing.
+            if(isInPhase) {
+                connect(inVertices[0], outVertices[0]);
+                connect(inVertices[1], outVertices[1]);
+            } else {
+                connect(inVertices[0], outVertices[1]);
+                connect(inVertices[1], outVertices[0]);
+            }
+
+            // Now we can remove cv and all of its in-edges and out-edges.
+            clear_vertex(cv, cGraph);
+            cGraph.removeVertex(cv);
+            return true;
+
+        } else {
+
+            // Ambiguous. Don't detangle.
+            if(debug) {
+                cout << "This vertex will not be detangled." << endl;
+            }
+            return false;
+        }
+
+    } else {
+
+        // Don't use the Bayesian model.
+        // Instead, do simple counting of tangle matrix elements.
+
+        // Count the number of significant, ambiguous, and negligible elements
+        // in the tangle matrix.
+        uint64_t significantCount = 0;
+        uint64_t ambiguousCount = 0;
+        uint64_t negligibleCount = 0;
+        for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+            for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+                const uint64_t t = tangleMatrix[i0][i1];
+                if(t <= detangleToleranceLow) {
+                    ++negligibleCount;
+                } else if(t >= detangleToleranceHigh) {
+                    ++significantCount;
+                } else {
+                    ++ambiguousCount;
+                }
+            }
+        }
+
+        // If the tangle matrix contains any ambiguous elements, do nothing.
+        if(ambiguousCount > 0) {
+            return false;
+        }
+
+        // There are no ambiguous elements.
+        // If there are no negligible element, that is all elements of the tangle matrix are significant,
+        // there is nothing to do.
+        if(negligibleCount == 0) {
+            return false;
+        }
+
+        // To avoid breaking contiguity, we require each column and each row of the
+        // tangle matrix to have at least one significant element.
+        // This means that each in-edge will be "merged" with at least one out-edge,
+        // and each out-edge will be "merged" with at least one in-edge.
+        for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+            bool foundSignificant = false;
+            for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+                if(tangleMatrix[i0][i1] >= detangleToleranceHigh) {
+                    foundSignificant = true;
+                    break;
+                }
+            }
+            if(not foundSignificant) {
+                return false;
+            }
+        }
+        for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+            bool foundSignificant = false;
+            for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+                if(tangleMatrix[i0][i1] >= detangleToleranceHigh) {
+                    foundSignificant = true;
+                    break;
+                }
+            }
+            if(not foundSignificant) {
+                return false;
+            }
+        }
+
+        if(debug) {
+            cout << "This vertex will be detangled " << inEdges.size() << " by " << outEdges.size() << endl;
+        }
+
+        // Create truncated versions of the inEdges and outEdges.
+        vector<vertex_descriptor> inVertices;
+        for(const edge_descriptor ce: inEdges) {
+            inVertices.push_back(cloneAndTruncateAtEnd(ce));
+        }
+        vector<vertex_descriptor> outVertices;
+        for(const edge_descriptor ce: outEdges) {
+            outVertices.push_back(cloneAndTruncateAtBeginning(ce));
+        }
+
+        // Each significant element of the tangle matrix generates a new edge.
+        for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+            for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+                if(tangleMatrix[i0][i1] >= detangleToleranceHigh) {
+                    connect(inVertices[i0], outVertices[i1]);
+                }
+            }
+        }
+
+        // Now we can remove cv and all of its in-edges and out-edges.
+        clear_vertex(cv, cGraph);
+        cGraph.removeVertex(cv);
+        return true;
+    }
+
+
+#if 0
+    // Disabled older merging implementation, kept for reference only.
+    // Each significant element of the tangle matrix generates a new edge,
+    // obtained by "merging" an in-edge with an out-edge.
+    for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+        const edge_descriptor ce0 = inEdges[i0];
+        const BubbleChain& bubbleChain0 = cGraph[ce0];
+        const Bubble& bubble0 = bubbleChain0.lastBubble();
+        SHASTA_ASSERT(bubble0.isHaploid());
+        const Chain& chain0 = bubble0.front();
+        SHASTA_ASSERT(chain0.size() >= 2);
+        for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+            if(tangleMatrix[i0][i1] < detangleToleranceHigh) {
+                continue;
+            }
+            const edge_descriptor ce1 = outEdges[i1];
+            const BubbleChain& bubbleChain1 = cGraph[ce1];
+            const Bubble& bubble1 = bubbleChain1.firstBubble();
+            SHASTA_ASSERT(bubble1.isHaploid());
+            const Chain& chain1 = bubble1.front();
+            SHASTA_ASSERT(chain1.size() >= 2);
+
+            // NOTE(review): "graph" below looks like a typo for "cGraph".
+            // This code is disabled (#if 0) and would not compile as written.
+            edge_descriptor eNew;
+            tie(eNew, ignore) = add_edge(source(ce0, cGraph), target(ce1, graph), cGraph);
+            AssemblyGraphEdge& newEdge = cGraph[eNew];
+            newEdge.id = nextEdgeId++;
+            BubbleChain& newBubbleChain = newEdge;
+
+            if(debug) {
+                cout << "Merging " <<
+                    bubbleChainStringId(ce0) << " " <<
+                    bubbleChainStringId(ce1) << " into " <<
+                    bubbleChainStringId(eNew) << endl;
+            }
+
+            // Create the new BubbleChain. It is obtained by joining
+            // bubbleChain0 and bubbleChain1, with vertex cv
+            // removed from the end of bubbleChain0
+            // and from the beginning of bubbleChain1.
+            // Here we use the above assumption that
+            // the last bubble of bubbleChain0 and the first bubble of bubbleChain1
+            // are haploid.
+            newBubbleChain = bubbleChain0;
+
+            // Remove cv from the end.
+            Bubble& newBubbleLast = newBubbleChain.back();
+            SHASTA_ASSERT(newBubbleLast.size() == 1);
+            Chain& newChainLast = newBubbleLast.front();
+            SHASTA_ASSERT(newChainLast.back() == cGraph[cv].edgeId);
+            newChainLast.resize(newChainLast.size() - 1);
+
+            // Append chain1, except for cv.
+            SHASTA_ASSERT(chain1.front() == cGraph[cv].edgeId);
+            copy(chain1.begin() + 1, chain1.end(), back_inserter(newChainLast));
+
+            // Append the rest of bubbleChain1.
+            copy(bubbleChain1.begin() + 1, bubbleChain1.end(), back_inserter(newBubbleChain));
+        }
+
+    }
+#endif
+
+
+    // Unreachable: both branches of the if/else above return.
+    SHASTA_ASSERT(0);
+}
+
+
+
+// This version can handle the case where the last bubble of an in-edge
+// or the first bubble of an out-edge is not haploid.
+// It works like this:
+// - Compute a generalized tangle matrix taking using the next to last
+//   MarkerGraphEdgeId of each incoming chain
+//   and the second MarkerGraphEdgeId of each outgoing chain.
+// - If detangling is possible based on this generalized tangle matrix,
+//   split the last bubble of each incoming edge and the first
+//   bubble of each outgoing edge. After this operation,
+//   the last bubble of each in-edge is haploid and the first bubble
+//   of each out-edge is haploid.
+// - Call detangleVertex to do the detangling.
+bool AssemblyGraph::detangleVertexGeneral(
+    vertex_descriptor cv,
+    bool debug,
+    uint64_t detangleToleranceLow,
+    uint64_t detangleToleranceHigh,
+    bool useBayesianModel,
+    double epsilon,
+    double minLogP)
+{
+    AssemblyGraph& cGraph = *this;
+
+#if 0
+    // Disabled fast path, kept for reference only.
+    // NOTE(review): the detangleVertex call below uses an old 4-argument
+    // signature and would not compile against the current declaration.
+    // Use detangleVertex, if possible.
+    bool involvesNonHaploidBubbles = false;
+    BGL_FORALL_INEDGES(cv, ce, cGraph, AssemblyGraph) {
+        const BubbleChain& bubbleChain = cGraph[ce];
+        if(not bubbleChain.lastBubble().isHaploid()) {
+            involvesNonHaploidBubbles = true;
+        }
+    }
+    BGL_FORALL_OUTEDGES(cv, ce, cGraph, AssemblyGraph) {
+        const BubbleChain& bubbleChain = cGraph[ce];
+        if(not bubbleChain.firstBubble().isHaploid()) {
+            involvesNonHaploidBubbles = true;
+        }
+    }
+    if(not involvesNonHaploidBubbles) {
+        if(debug) {
+            cout << "No non-haploid bubbles involved, using detangleVertex." << endl;
+        }
+        return detangleVertex(cv, debug, detangleToleranceLow, detangleToleranceHigh);
+    }
+#endif
+
+    // Only attempt detangling at vertices with at least 2 in-edges and 2 out-edges.
+    if(in_degree(cv, cGraph) < 2 or out_degree(cv, cGraph) < 2) {
+        return false;
+    }
+
+    if(debug) {
+        cout << "Attempting general detangling for vertex " << cGraph[cv].edgeId << endl;
+    }
+
+    // Identifies one chain of a bubble adjacent to cv:
+    // the bubble chain edge, the chain's index in the bubble, and the
+    // MarkerGraphEdgeId used to build the generalized tangle matrix.
+    class ChainInfo {
+    public:
+        edge_descriptor ce;
+        uint64_t indexInBubble;
+        MarkerGraphEdgeId edgeId;
+    };
+
+    // Gather one ChainInfo per chain of the last bubble of each in-edge,
+    // using the next-to-last MarkerGraphEdgeId of each chain.
+    vector<ChainInfo> inChains;
+    BGL_FORALL_INEDGES(cv, ce, cGraph, AssemblyGraph) {
+        const BubbleChain& bubbleChain = cGraph[ce];
+        const Bubble& bubble = bubbleChain.lastBubble();
+        for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) {
+            const Chain& chain = bubble[indexInBubble];
+            inChains.push_back({ce, indexInBubble, chain[chain.size() - 2]});
+        }
+    }
+    // Gather one ChainInfo per chain of the first bubble of each out-edge,
+    // using the second MarkerGraphEdgeId of each chain.
+    vector<ChainInfo> outChains;
+    BGL_FORALL_OUTEDGES(cv, ce, cGraph, AssemblyGraph) {
+        const BubbleChain& bubbleChain = cGraph[ce];
+        const Bubble& bubble = bubbleChain.firstBubble();
+        for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) {
+            const Chain& chain = bubble[indexInBubble];
+            outChains.push_back({ce, indexInBubble, chain[1]});
+        }
+    }
+
+    if(debug) {
+
+        cout << "In:" << endl;
+        for(const ChainInfo& chainInfo: inChains) {
+            cout << bubbleChainStringId(chainInfo.ce) << " " <<
+                chainInfo.indexInBubble << " " <<
+                chainInfo.edgeId << endl;
+        }
+
+        cout << "Out:" << endl;
+        for(const ChainInfo& chainInfo: outChains) {
+            cout << bubbleChainStringId(chainInfo.ce) << " " <<
+                chainInfo.indexInBubble << " " <<
+                chainInfo.edgeId << endl;
+        }
+    }
+
+    // Compute a generalized tangle matrix.
+    // Unlike computeTangleMatrix, this has one row/column per chain,
+    // not per edge, because the bubbles involved may not be haploid.
+    vector<vector<uint64_t> > tangleMatrix(inChains.size(), vector<uint64_t>(outChains.size()));
+    for(uint64_t i0=0; i0<inChains.size(); i0++) {
+        const MarkerGraphEdgeId markerGraphEdgeId0 = inChains[i0].edgeId;
+
+        for(uint64_t i1=0; i1<outChains.size(); i1++) {
+            const MarkerGraphEdgeId markerGraphEdgeId1 = outChains[i1].edgeId;
+
+            MarkerGraphEdgePairInfo info;
+            SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(markerGraphEdgeId0, markerGraphEdgeId1, info));
+            tangleMatrix[i0][i1] = info.common;
+        }
+    }
+
+    if(debug) {
+        cout << "Tangle matrix:" << endl;
+        for(uint64_t i0=0; i0<inChains.size(); i0++) {
+            const ChainInfo& chainInfo0 = inChains[i0];
+            for(uint64_t i1=0; i1<outChains.size(); i1++) {
+                const ChainInfo& chainInfo1 = outChains[i1];
+
+                cout <<
+                    bubbleChainStringId(chainInfo0.ce) << " " <<
+                    chainInfo0.indexInBubble << " " <<
+                    chainInfo0.edgeId << " " <<
+                    bubbleChainStringId(chainInfo1.ce) << " " <<
+                    chainInfo1.indexInBubble << " " <<
+                    chainInfo1.edgeId << " " <<
+                    tangleMatrix[i0][i1] << endl;
+            }
+        }
+
+    }
+
+
+    // Figure out if we can detangle.
+    if(useBayesianModel and
+        (inChains.size() == 2) and
+        (outChains.size() == 2)) {
+
+        // Use the 2 by 2 Bayesian model for detangling.
+        array< array<uint64_t, 2>, 2> tangleMatrix22;
+        for(uint64_t i=0; i<2; i++) {
+            for(uint64_t j=0; j<2; j++) {
+                tangleMatrix22[i][j] = tangleMatrix[i][j];
+            }
+        }
+
+        // Compute logarithmic probability ratio of in-phase and out-of-phase
+        // against random.
+        double logPin;
+        double logPout;
+        tie(logPin, logPout) = diploidBayesianPhase(tangleMatrix22, epsilon);
+        if(debug) {
+            cout << "logPin = " << logPin << ", logPout = " << logPout << endl;
+        }
+
+        // Note: unlike detangleVertex, here the random hypothesis is not
+        // ignored: both the absolute log probability and the ratio must
+        // clear minLogP.
+        const bool isInPhase = (logPin >= minLogP) and ((logPin - logPout) >= minLogP);
+        const bool isOutOfPhase = (logPout >= minLogP) and ((logPout - logPin) >= minLogP);
+        if(not (isInPhase or isOutOfPhase)) {
+            if(debug) {
+                cout << "Ambiguous, don't detangle." << endl;
+            }
+            return false;
+        }
+
+    } else {
+
+        // Not using the Bayesian model.
+        // Count the number of significant, ambiguous, and negligible elements
+        // in the tangle matrix.
+        uint64_t significantCount = 0;
+        uint64_t ambiguousCount = 0;
+        uint64_t negligibleCount = 0;
+        for(uint64_t i0=0; i0<inChains.size(); i0++) {
+            for(uint64_t i1=0; i1<outChains.size(); i1++) {
+                const uint64_t t = tangleMatrix[i0][i1];
+                if(t <= detangleToleranceLow) {
+                    ++negligibleCount;
+                } else if(t >= detangleToleranceHigh) {
+                    ++significantCount;
+                } else {
+                    ++ambiguousCount;
+                }
+            }
+        }
+
+        // If the tangle matrix contains any ambiguous elements, do nothing.
+        if(ambiguousCount > 0) {
+            if(debug) {
+                cout << "Tangle matrix is ambiguous." << endl;
+            }
+            return false;
+        }
+        // There are no ambiguous elements.
+        // If there are no negligible element, that is all elements of the tangle matrix are significant,
+        // there is nothing to do.
+        if(negligibleCount == 0) {
+            return false;
+        }
+
+        // To avoid breaking contiguity, we require each column and each row of the
+        // tangle matrix to have at least one significant element.
+        // This means that each in-edge will be "merged" with at least one out-edge,
+        // and each out-edge will be "merged" with at least one in-edge.
+        for(uint64_t i0=0; i0<inChains.size(); i0++) {
+            bool foundSignificant = false;
+            for(uint64_t i1=0; i1<outChains.size(); i1++) {
+                if(tangleMatrix[i0][i1] >= detangleToleranceHigh) {
+                    foundSignificant = true;
+                    break;
+                }
+            }
+            if(not foundSignificant) {
+                return false;
+            }
+        }
+        for(uint64_t i1=0; i1<outChains.size(); i1++) {
+            bool foundSignificant = false;
+            for(uint64_t i0=0; i0<inChains.size(); i0++) {
+                if(tangleMatrix[i0][i1] >= detangleToleranceHigh) {
+                    foundSignificant = true;
+                    break;
+                }
+            }
+            if(not foundSignificant) {
+                return false;
+            }
+        }
+    }
+
+    if(debug) {
+        cout << "This vertex can be detangled after some splitting of bubble chains." << endl;
+    }
+
+
+    // Make sure the last bubble of all in-edges is haploid.
+    in_edge_iterator itIn, itInEnd;
+    tie(itIn, itInEnd) = in_edges(cv, cGraph);
+    while(itIn != itInEnd) {
+        const edge_descriptor ce = *itIn;
+        ++itIn; // Increment before possibly removing this edge!
+        const BubbleChain& bubbleChain = cGraph[ce];
+        if(not bubbleChain.lastBubble().isHaploid()) {
+            if(debug) {
+                cout << "In-edge " << bubbleChainStringId(ce) <<
+                    " needs to be split at the end." << endl;
+            }
+            splitBubbleChainAtEnd(ce);
+        }
+    }
+
+    // Make sure the first bubble of all out-edges is haploid.
+    out_edge_iterator itOut, itOutEnd;
+    tie(itOut, itOutEnd) = out_edges(cv, cGraph);
+    while(itOut != itOutEnd) {
+        const edge_descriptor ce = *itOut;
+        ++itOut; // Increment before possibly removing this edge!
+        const BubbleChain& bubbleChain = cGraph[ce];
+        if(not bubbleChain.firstBubble().isHaploid()) {
+            if(debug) {
+                cout << "Out-edge " << bubbleChainStringId(ce) <<
+                    " needs to be split at the beginning." << endl;
+            }
+            splitBubbleChainAtBeginning(ce);
+        }
+    }
+
+    // Now we can detangle using detangleVertex.
+    if(debug) {
+        cout << "Calling detangleVertex." << endl;
+    }
+    return detangleVertex(cv, debug, detangleToleranceLow, detangleToleranceHigh,
+        useBayesianModel, epsilon, minLogP);
+}
+
+
+
+// Split the first bubble of a bubble chain.
+// Used by detangleVertexGeneral to eliminate
+// non-haploid bubbles adjacent to a vertex to be detangled.
+// The edge ce is replaced by one haploid edge per chain of its first bubble,
+// plus (when more bubbles follow) one edge carrying the rest of the bubble
+// chain, joined at a newly created vertex. The original edge is removed.
+void AssemblyGraph::splitBubbleChainAtBeginning(edge_descriptor ce)
+{
+    AssemblyGraph& cGraph = *this;
+
+    const BubbleChain& bubbleChain = cGraph[ce];
+    const Bubble& firstBubble = bubbleChain.firstBubble();
+    SHASTA_ASSERT(not firstBubble.isHaploid());
+
+    const vertex_descriptor cv0 = source(ce, cGraph);
+    const vertex_descriptor cv1 = target(ce, cGraph);
+
+
+
+    // General case where the bubble chain has more than one bubble.
+    // Generate a new edge containing the bubble chain except for
+    // the first bubble, plus one new edge for each chain in the firstBubble.
+    if(bubbleChain.size() > 1) {
+
+        // Generate one new edge containing the bubble chain except for
+        // the first bubble.
+        AssemblyGraphEdge newEdge;
+        newEdge.id = nextEdgeId++;
+        copy(bubbleChain.begin() + 1, bubbleChain.end(), back_inserter(newEdge));
+        // The new intermediate vertex is anchored at the first
+        // MarkerGraphEdgeId of the remaining bubble chain.
+        const vertex_descriptor cv2 = createVertex(newEdge.front().front().front());
+        boost::add_edge(cv2, cv1, newEdge, cGraph);
+
+        // Generate a new edge for each chain in the firstBubble.
+        for(const Chain& chain: firstBubble) {
+            AssemblyGraphEdge newEdge;
+            newEdge.resize(1);  // The new edge has only one bubble.
+            Bubble& newBubble = newEdge.front();
+            newEdge.id = nextEdgeId++;
+            newBubble.push_back(chain);
+            boost::add_edge(cv0, cv2, newEdge, cGraph);
+        }
+    }
+
+
+    // Special case where the bubble chain has one bubble.
+    // We generate one new edge for each chain in the firstBubble.
+    else {
+
+        // Generate a new edge for each chain in the firstBubble.
+        for(const Chain& chain: firstBubble) {
+            AssemblyGraphEdge newEdge;
+            newEdge.resize(1);  // The new edge has only one bubble.
+            Bubble& newBubble = newEdge.front();
+            newEdge.id = nextEdgeId++;
+            newBubble.push_back(chain);
+            boost::add_edge(cv0, cv1, newEdge, cGraph);
+        }
+    }
+
+    // Now we can remove the original bubble chain.
+    boost::remove_edge(ce, cGraph);
+}
+
+
+
+// Split the last bubble of a bubble chain.
+// Used by detangleVertexGeneral to eliminate
+// non-haploid bubbles adjacent to a vertex to be detangled.
+// Mirror image of splitBubbleChainAtBeginning: the edge ce is replaced by
+// (when more bubbles precede) one edge carrying the bubble chain up to the
+// last bubble, plus one haploid edge per chain of the last bubble, joined
+// at a newly created vertex. The original edge is removed.
+void AssemblyGraph::splitBubbleChainAtEnd(edge_descriptor ce)
+{
+    AssemblyGraph& cGraph = *this;
+
+    const BubbleChain& bubbleChain = cGraph[ce];
+    const Bubble& lastBubble = bubbleChain.lastBubble();
+    SHASTA_ASSERT(not lastBubble.isHaploid());
+
+    const vertex_descriptor cv0 = source(ce, cGraph);
+    const vertex_descriptor cv1 = target(ce, cGraph);
+
+
+
+    // General case where the bubble chain has more than one bubble.
+    // Generate a new edge containing the bubble chain except for
+    // the last bubble, plus one new edge for each chain in the lastBubble.
+    if(bubbleChain.size() > 1) {
+
+        // Generate one new edge containing the bubble chain except for
+        // the last bubble.
+        AssemblyGraphEdge newEdge;
+        newEdge.id = nextEdgeId++;
+        copy(bubbleChain.begin(), bubbleChain.end()-1, back_inserter(newEdge));
+        // The new intermediate vertex is anchored at the last
+        // MarkerGraphEdgeId of the truncated bubble chain.
+        const vertex_descriptor cv2 = createVertex(newEdge.back().front().back());
+        boost::add_edge(cv0, cv2, newEdge, cGraph);
+
+        // Generate a new edge for each chain in the lastBubble.
+        for(const Chain& chain: lastBubble) {
+            AssemblyGraphEdge newEdge;
+            newEdge.resize(1);  // The new edge has only one bubble.
+            Bubble& newBubble = newEdge.front();
+            newEdge.id = nextEdgeId++;
+            newBubble.push_back(chain);
+            boost::add_edge(cv2, cv1, newEdge, cGraph);
+        }
+    }
+
+
+    // Special case where the bubble chain has one bubble.
+    // We generate one new edge for each chain in the lastBubble.
+    else {
+
+        // Generate a new edge for each chain in the lastBubble.
+        for(const Chain& chain: lastBubble) {
+            AssemblyGraphEdge newEdge;
+            newEdge.resize(1);  // The new edge has only one bubble.
+            Bubble& newBubble = newEdge.front();
+            newEdge.id = nextEdgeId++;
+            newBubble.push_back(chain);
+            boost::add_edge(cv0, cv1, newEdge, cGraph);
+        }
+    }
+
+    // Now we can remove the original bubble chain.
+    boost::remove_edge(ce, cGraph);
+}
+
+
+
+// Detangle all edges that satisfy the strict criteria implemented by detangleEdge.
+// Returns true if at least one edge was detangled.
+bool AssemblyGraph::detangleEdges(
+ bool debug,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh,
+ bool useBayesianModel,
+ double epsilon,
+ double minLogP)
+{
+ if(debug) {
+ cout << "Detangling edges." << endl;
+ }
+
+ AssemblyGraph& cGraph = *this;
+
+ // To safely iterate over edges while removing edges we must use edge ids
+ // as unique identifiers, because edge descriptors can be reused as edges are
+ // deleted and new edges are created.
+ std::map<uint64_t, edge_descriptor> edgeMap;
+ BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+ edgeMap.insert({cGraph[ce].id, ce});
+ }
+
+ uint64_t detangleCount = 0;
+ for(auto it=edgeMap.begin(); it!=edgeMap.end(); /* Incremented safely by detangleEdge */) {
+ if(detangleEdge(debug, edgeMap, it, detangleToleranceLow, detangleToleranceHigh,
+ useBayesianModel, epsilon, minLogP)) {
+ ++detangleCount;
+ }
+ }
+
+ if(debug) {
+ cout << "Detangled " << detangleCount << " edges." << endl;
+ }
+
+ return detangleCount > 0;
+}
+
+
+
+// Detangle all edges using the more general criteria implemented by
+// detangleEdgeGeneral, which can split non-haploid bubbles adjacent to an
+// edge before detangling. Returns true if at least one edge was detangled.
+bool AssemblyGraph::detangleEdgesGeneral(
+ bool debug,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh,
+ bool useBayesianModel,
+ double epsilon,
+ double minLogP)
+{
+ if(debug) {
+ cout << "Detangling edges." << endl;
+ }
+
+ AssemblyGraph& cGraph = *this;
+
+ // To safely iterate over edges while removing edges we must use edge ids
+ // as unique identifiers, because edge descriptors can be reused as edges are
+ // deleted and new edges are created.
+ std::map<uint64_t, edge_descriptor> edgeMap;
+ BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+ edgeMap.insert({cGraph[ce].id, ce});
+ }
+
+ uint64_t detangleCount = 0;
+ for(auto it=edgeMap.begin(); it!=edgeMap.end(); /* Incremented safely by detangleEdgeGeneral */) {
+ if(detangleEdgeGeneral(debug, edgeMap, it, detangleToleranceLow, detangleToleranceHigh,
+ useBayesianModel, epsilon, minLogP)) {
+ ++detangleCount;
+ }
+ }
+
+ if(debug) {
+ cout << "Detangled " << detangleCount << " edges." << endl;
+ }
+
+ return detangleCount > 0;
+}
+
+
+
+// Attempt to detangle a single edge ce, which must consist of a single haploid
+// bubble, by matching its in-edges with its out-edges based on the tangle matrix
+// (common oriented read counts). On success, the edge, its source/target vertices,
+// and all in/out/back edges are removed and replaced by new edges that connect
+// truncated copies of the in-edges directly to truncated copies of the out-edges.
+// The iterator it is advanced safely past every edge removed, so the caller's
+// loop over edgeMap remains valid. Returns true if the edge was detangled.
+bool AssemblyGraph::detangleEdge(
+ bool debug,
+ std::map<uint64_t, edge_descriptor>& edgeMap,
+ std::map<uint64_t, edge_descriptor>::iterator& it,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh,
+ bool useBayesianModel,
+ double epsilon,
+ double minLogP)
+{
+ AssemblyGraph& cGraph = *this;
+ const edge_descriptor ce = it->second;
+ ++it;
+
+ // Only try detangling if the edge consists of a single haploid bubble.
+ // Otherwise detangling would lose information.
+ BubbleChain& bubbleChain = cGraph[ce];
+ if(bubbleChain.size() > 1) {
+ return false;
+ }
+ if(bubbleChain.front().size() > 1) {
+ return false;
+ }
+
+ // Tangle matrix elements <= detangleToleranceLow are treated as negligible.
+ // Tangle matrix elements >= detangleToleranceHigh are treated as significant.
+ // Tangle matrix elements in between are considered ambiguous.
+ SHASTA_ASSERT(detangleToleranceHigh > detangleToleranceLow);
+
+ const vertex_descriptor cv0 = source(ce, cGraph);
+ const vertex_descriptor cv1 = target(ce, cGraph);
+
+ if(out_degree(cv0, cGraph) != 1) {
+ return false;
+ }
+ if(in_degree(cv1, cGraph) != 1) {
+ return false;
+ }
+
+ if(debug) {
+ cout << "Attempting to detangle edge " << bubbleChainStringId(ce) << endl;
+ }
+
+ // Gather the in-edges and check that the last bubble is haploid.
+ // Ignore in-edges coming from cv1 (back-edges).
+ // Note the loop variable ce intentionally shadows the outer ce.
+ vector<edge_descriptor> inEdges;
+ vector<edge_descriptor> backEdges;
+ BGL_FORALL_INEDGES(cv0, ce, cGraph, AssemblyGraph) {
+ const BubbleChain& bubbleChain = cGraph[ce];
+ if(not bubbleChain.lastBubble().isHaploid()) {
+ if(debug) {
+ cout << "Not detangling because the last bubble of in-edge " <<
+ bubbleChainStringId(ce) << " is not haploid." << endl;
+ }
+ return false;
+ }
+ if(source(ce, cGraph) != cv1) {
+ inEdges.push_back(ce);
+ } else {
+ backEdges.push_back(ce);
+ }
+ }
+
+ // Gather the out-edges and check that the first bubble is haploid.
+ // Ignore out-edges going to cv0 (back-edges).
+ vector<edge_descriptor> outEdges;
+ BGL_FORALL_OUTEDGES(cv1, ce, cGraph, AssemblyGraph) {
+ const BubbleChain& bubbleChain = cGraph[ce];
+ if(not bubbleChain.firstBubble().isHaploid()) {
+ if(debug) {
+ cout << "Not detangling because the first bubble of out-edge " <<
+ bubbleChainStringId(ce) << " is not haploid." << endl;
+ }
+ return false;
+ }
+ if(target(ce, cGraph) != cv0) {
+ outEdges.push_back(ce);
+ }
+ }
+
+ if(inEdges.size() == 0 or outEdges.size() == 0) {
+ if(debug) {
+ cout << "Not detangling due to degree (case 1)." << endl;
+ }
+ return false;
+ }
+ if(inEdges.size() < 2 and outEdges.size() < 2) {
+ if(debug) {
+ cout << "Not detangling due to degree (case 2)." << endl;
+ }
+ return false;
+ }
+ if(inEdges.size() != outEdges.size()) {
+ if(debug) {
+ cout << "Not detangling due to degree (case 3)." << endl;
+ }
+ return false;
+ }
+
+
+
+ // If a MarkerGraphEdgeId appears both in the inEdges and in the outEdges,
+ // detangling could generate a chain with two consecutive copies of the same
+ // MarkerGraphEdgeId. Don't detangle.
+ for(const edge_descriptor ce0: inEdges) {
+ const BubbleChain& bubbleChain0 = cGraph[ce0];
+ const Bubble& bubble0 = bubbleChain0.lastBubble();
+ SHASTA_ASSERT(bubble0.isHaploid());
+ const Chain& chain0 = bubble0.front();
+ SHASTA_ASSERT(chain0.size() >= 2);
+ const MarkerGraphEdgeId markerGraphEdgeId0 = chain0[chain0.size() - 2]; // Exclude last
+
+ for(const edge_descriptor ce1: outEdges) {
+ const BubbleChain& bubbleChain1 = cGraph[ce1];
+ const Bubble& bubble1 = bubbleChain1.firstBubble();
+ SHASTA_ASSERT(bubble1.isHaploid());
+ const Chain& chain1 = bubble1.front();
+ SHASTA_ASSERT(chain1.size() >= 2);
+ const MarkerGraphEdgeId markerGraphEdgeId1 = chain1[1]; // Exclude first
+
+ if(markerGraphEdgeId0 == markerGraphEdgeId1) {
+ if(debug) {
+ cout << "Not detangling due to cycle." << endl;
+ }
+ return false;
+ }
+ }
+ }
+
+
+
+ // Compute the tangle matrix.
+ vector< vector<uint64_t> > tangleMatrix;
+ computeTangleMatrix(inEdges, outEdges, tangleMatrix, false);
+
+ if(debug) {
+ cout << "Computing tangle matrix for edge " << bubbleChainStringId(ce) << endl;
+
+ cout << "In-edges: ";
+ for(const edge_descriptor ce: inEdges) {
+ cout << " " << bubbleChainStringId(ce);
+ }
+ cout << endl;
+
+ cout << "Out-edges: ";
+ for(const edge_descriptor ce: outEdges) {
+ cout << " " << bubbleChainStringId(ce);
+ }
+ cout << endl;
+
+ cout << "Tangle matrix:" << endl;
+ for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+ const edge_descriptor ce0 = inEdges[i0];
+ for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+ const edge_descriptor ce1 = outEdges[i1];
+ cout <<
+ bubbleChainStringId(ce0) << " " <<
+ bubbleChainStringId(ce1) << " " <<
+ tangleMatrix[i0][i1];
+ if(tangleMatrix[i0][i1] == 0) {
+ cout << " zero tangle matrix element";
+ }
+ cout << endl;
+ }
+ }
+ }
+
+
+
+ // Detangle based on the contents of the tangle matrix.
+ if(useBayesianModel and inEdges.size() == 2 and outEdges.size() == 2) {
+
+ // Use the 2 by 2 Bayesian model for detangling.
+ array< array<uint64_t, 2>, 2> tangleMatrix22;
+ for(uint64_t i=0; i<2; i++) {
+ for(uint64_t j=0; j<2; j++) {
+ tangleMatrix22[i][j] = tangleMatrix[i][j];
+ }
+ }
+
+ // Compute logarithmic probability ratio of in-phase and out-of-phase
+ // against random.
+ double logPin;
+ double logPout;
+ tie(logPin, logPout) = diploidBayesianPhase(tangleMatrix22, epsilon);
+ if(debug) {
+ cout << "logPin = " << logPin << ", logPout = " << logPout << endl;
+ }
+
+ // const bool isInPhase = (logPin >= minLogP) and ((logPin - logPout) >= minLogP);
+ // const bool isOutOfPhase = (logPout >= minLogP) and ((logPout - logPin) >= minLogP);
+ // Ignore the random hypothesis.
+ const bool isInPhase = (logPin - logPout) >= minLogP;
+ const bool isOutOfPhase = (logPout - logPin) >= minLogP;
+
+ if(isInPhase or isOutOfPhase) {
+
+ // We can detangle.
+
+ // Create truncated versions of the inEdges and outEdges.
+ vector<vertex_descriptor> inVertices;
+ for(const edge_descriptor ce: inEdges) {
+ inVertices.push_back(cloneAndTruncateAtEnd(ce));
+ }
+ vector<vertex_descriptor> outVertices;
+ for(const edge_descriptor ce: outEdges) {
+ outVertices.push_back(cloneAndTruncateAtBeginning(ce));
+ }
+
+ if(isInPhase) {
+ const edge_descriptor e0 = connect(inVertices[0], outVertices[0]);
+ const edge_descriptor e1 = connect(inVertices[1], outVertices[1]);
+ if(debug) {
+ cout << "In phase: created " << bubbleChainStringId(e0) << " and " <<
+ bubbleChainStringId(e1) << endl;
+ }
+ } else {
+ const edge_descriptor e0 = connect(inVertices[0], outVertices[1]);
+ const edge_descriptor e1 = connect(inVertices[1], outVertices[0]);
+ if(debug) {
+ cout << "Out of phase: created " << bubbleChainStringId(e0) << " and " <<
+ bubbleChainStringId(e1) << endl;
+ }
+ }
+
+ } else {
+
+ // Ambiguous. Don't detangle.
+ if(debug) {
+ cout << "Ambiguous. Not detangling." << endl;
+ }
+ return false;
+ }
+
+ } else {
+
+
+
+ // We are not using the Bayesian model.
+
+ // Count the number of significant, ambiguous, and negligible elements
+ // in the tangle matrix.
+ uint64_t significantCount = 0;
+ uint64_t ambiguousCount = 0;
+ uint64_t negligibleCount = 0;
+ for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+ for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+ const uint64_t t = tangleMatrix[i0][i1];
+ if(t <= detangleToleranceLow) {
+ ++negligibleCount;
+ } else if(t >= detangleToleranceHigh) {
+ ++significantCount;
+ } else {
+ ++ambiguousCount;
+ }
+ }
+ }
+
+ // If the tangle matrix contains any ambiguous elements, do nothing.
+ if(ambiguousCount > 0) {
+ return false;
+ }
+
+ // There are no ambiguous elements.
+ // If there are no negligible element, that is all elements of the tangle matrix are significant,
+ // there is nothing to do.
+ if(negligibleCount == 0) {
+ return false;
+ }
+
+ // To avoid breaking contiguity, we require each column and each row of the
+ // tangle matrix to have at least one significant element.
+ // This means that each in-edge will be "merged" with at least one out-edge,
+ // and each out-edge will be "merged" with at least one in-edge.
+ // ACTUALLY, FOR MORE ROBUSTNESS, REQUIRE EXACTLY ONE SIGNIFICANT ELEMENT PER ROW AND COLUMN.
+ for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+ uint64_t significantCount = 0;
+ for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+ if(tangleMatrix[i0][i1] >= detangleToleranceHigh) {
+ ++significantCount;
+ }
+ }
+ if(significantCount != 1) {
+ return false;
+ }
+ }
+ for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+ uint64_t significantCount = 0;
+ for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+ if(tangleMatrix[i0][i1] >= detangleToleranceHigh) {
+ ++significantCount;
+ }
+ }
+ if(significantCount != 1) {
+ return false;
+ }
+ }
+
+ #if 0
+ // If an in-edge is also an out-edge, don't detangle.
+ for(const edge_descriptor ce: inEdges) {
+ if(find(outEdges.begin(), outEdges.end(), ce) != outEdges.end()) {
+ if(debug) {
+ cout << "Not detangled because an in-edge is also an out-edge." << endl;
+ }
+ return false;
+ }
+ }
+ #endif
+
+ if(debug) {
+ cout << "This edge will be detangled " << inEdges.size() << " by " << outEdges.size() << endl;
+ }
+
+ // Create truncated versions of the inEdges and outEdges.
+ vector<vertex_descriptor> inVertices;
+ for(const edge_descriptor ce: inEdges) {
+ inVertices.push_back(cloneAndTruncateAtEnd(ce));
+ }
+ vector<vertex_descriptor> outVertices;
+ for(const edge_descriptor ce: outEdges) {
+ outVertices.push_back(cloneAndTruncateAtBeginning(ce));
+ }
+
+
+ // Each significant element of the tangle matrix generates a new edge.
+ for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+ for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+ if(tangleMatrix[i0][i1] >= detangleToleranceHigh) {
+ const edge_descriptor ceNew = connect(inVertices[i0], outVertices[i1]);
+ if(debug) {
+ cout << "Created " << bubbleChainStringId(ceNew) << endl;
+ }
+ }
+ }
+ }
+ }
+
+
+ // Now we can remove cv0, cv1, ce, and all of the in-edges and out-edges.
+ // We have to do this while safely incrementing the edge iterator to point to the
+ // next edge that was not removed.
+ // We already incremented the iterator to point past ce.
+ boost::remove_edge(ce, cGraph);
+ for(const edge_descriptor ce: inEdges) {
+ if(it != edgeMap.end() and cGraph[ce].id == it->first) {
+ ++it;
+ }
+ edgeMap.erase(cGraph[ce].id);
+ boost::remove_edge(ce, cGraph);
+ }
+ for(const edge_descriptor ce: outEdges) {
+ if(it != edgeMap.end() and cGraph[ce].id == it->first) {
+ ++it;
+ }
+ edgeMap.erase(cGraph[ce].id);
+ boost::remove_edge(ce, cGraph);
+ }
+ for(const edge_descriptor ce: backEdges) {
+ if(it != edgeMap.end() and cGraph[ce].id == it->first) {
+ ++it;
+ }
+ edgeMap.erase(cGraph[ce].id);
+ boost::remove_edge(ce, cGraph);
+ }
+ cGraph.removeVertex(cv0);
+ cGraph.removeVertex(cv1);
+
+ return true;
+}
+
+
+
+// General version of edge detangling: unlike detangleEdge, it can handle
+// non-haploid bubbles adjacent to the edge being detangled. It first checks,
+// at the individual Chain level, that the generalized tangle matrix permits
+// detangling; then splits any non-haploid adjacent bubbles
+// (splitBubbleChainAtEnd/splitBubbleChainAtBeginning) and delegates the
+// actual detangling to detangleEdge. Returns true if the edge was detangled.
+bool AssemblyGraph::detangleEdgeGeneral(
+ bool debug,
+ std::map<uint64_t, edge_descriptor>& edgeMap,
+ std::map<uint64_t, edge_descriptor>::iterator& it,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh,
+ bool useBayesianModel,
+ double epsilon,
+ double minLogP)
+{
+ // detangleEdgeGeneral does not have code to use the Bayesian model
+ // for the 2 by 2 case. See detangleEdge.
+ SHASTA_ASSERT(not useBayesianModel);
+
+ AssemblyGraph& cGraph = *this;
+ const edge_descriptor ce = it->second;
+ ++it;
+
+ // Tangle matrix elements <= detangleToleranceLow are treated as negligible.
+ // Tangle matrix elements >= detangleToleranceHigh are treated as significant.
+ // Tangle matrix elements in between are considered ambiguous.
+ SHASTA_ASSERT(detangleToleranceHigh > detangleToleranceLow);
+
+ const vertex_descriptor cv0 = source(ce, cGraph);
+ const vertex_descriptor cv1 = target(ce, cGraph);
+
+ if(out_degree(cv0, cGraph) != 1) {
+ return false;
+ }
+ if(in_degree(cv1, cGraph) != 1) {
+ return false;
+ }
+
+ if(debug) {
+ cout << "Attempting general detangling of edge " << bubbleChainStringId(ce) << endl;
+ }
+
+ // Identifies one Chain of one adjacent bubble, together with the
+ // MarkerGraphEdgeId used to compute tangle matrix elements for it.
+ class ChainInfo {
+ public:
+ edge_descriptor ce;
+ uint64_t indexInBubble;
+ MarkerGraphEdgeId edgeId;
+ };
+ // For in-edges, use the second-to-last MarkerGraphEdgeId of each chain
+ // of the last bubble.
+ vector<ChainInfo> inChains;
+ BGL_FORALL_INEDGES(cv0, ce, cGraph, AssemblyGraph) {
+ const BubbleChain& bubbleChain = cGraph[ce];
+ const Bubble& bubble = bubbleChain.lastBubble();
+ for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) {
+ const Chain& chain = bubble[indexInBubble];
+ inChains.push_back({ce, indexInBubble, chain[chain.size() - 2]});
+ }
+ }
+ // For out-edges, use the second MarkerGraphEdgeId of each chain
+ // of the first bubble.
+ vector<ChainInfo> outChains;
+ BGL_FORALL_OUTEDGES(cv1, ce, cGraph, AssemblyGraph) {
+ const BubbleChain& bubbleChain = cGraph[ce];
+ const Bubble& bubble = bubbleChain.firstBubble();
+ for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) {
+ const Chain& chain = bubble[indexInBubble];
+ outChains.push_back({ce, indexInBubble, chain[1]});
+ }
+ }
+
+ if(debug) {
+
+ cout << "In:" << endl;
+ for(const ChainInfo& chainInfo: inChains) {
+ cout << bubbleChainStringId(chainInfo.ce) << " " <<
+ chainInfo.indexInBubble << " " <<
+ chainInfo.edgeId << endl;
+ }
+
+ cout << "Out:" << endl;
+ for(const ChainInfo& chainInfo: outChains) {
+ cout << bubbleChainStringId(chainInfo.ce) << " " <<
+ chainInfo.indexInBubble << " " <<
+ chainInfo.edgeId << endl;
+ }
+ }
+
+ if(inChains.size() != outChains.size()) {
+ if(debug) {
+ cout << "Not detangling due to degree." << endl;
+ }
+ return false;
+ }
+
+
+ // Compute a generalized tangle matrix: one row per in-chain, one column
+ // per out-chain, containing the number of common oriented reads.
+ vector<vector<uint64_t> > tangleMatrix(inChains.size(), vector<uint64_t>(outChains.size()));
+ for(uint64_t i0=0; i0<inChains.size(); i0++) {
+ const MarkerGraphEdgeId markerGraphEdgeId0 = inChains[i0].edgeId;
+
+ for(uint64_t i1=0; i1<outChains.size(); i1++) {
+ const MarkerGraphEdgeId markerGraphEdgeId1 = outChains[i1].edgeId;
+
+ MarkerGraphEdgePairInfo info;
+ SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(markerGraphEdgeId0, markerGraphEdgeId1, info));
+ tangleMatrix[i0][i1] = info.common;
+ }
+ }
+
+ if(debug) {
+ cout << "Tangle matrix:" << endl;
+ for(uint64_t i0=0; i0<inChains.size(); i0++) {
+ const ChainInfo& chainInfo0 = inChains[i0];
+ for(uint64_t i1=0; i1<outChains.size(); i1++) {
+ const ChainInfo& chainInfo1 = outChains[i1];
+
+ cout <<
+ bubbleChainStringId(chainInfo0.ce) << " " <<
+ chainInfo0.indexInBubble << " " <<
+ chainInfo0.edgeId << " " <<
+ bubbleChainStringId(chainInfo1.ce) << " " <<
+ chainInfo1.indexInBubble << " " <<
+ chainInfo1.edgeId << " " <<
+ tangleMatrix[i0][i1] << endl;
+ }
+ }
+
+ }
+
+ // Count the number of significant, ambiguous, and negligible elements
+ // in the tangle matrix.
+ uint64_t significantCount = 0;
+ uint64_t ambiguousCount = 0;
+ uint64_t negligibleCount = 0;
+ for(uint64_t i0=0; i0<inChains.size(); i0++) {
+ for(uint64_t i1=0; i1<outChains.size(); i1++) {
+ const uint64_t t = tangleMatrix[i0][i1];
+ if(t <= detangleToleranceLow) {
+ ++negligibleCount;
+ } else if(t >= detangleToleranceHigh) {
+ ++significantCount;
+ } else {
+ ++ambiguousCount;
+ }
+ }
+ }
+
+ // If the tangle matrix contains any ambiguous elements, do nothing.
+ if(ambiguousCount > 0) {
+ return false;
+ }
+
+ // There are no ambiguous elements.
+ // If there are no negligible element, that is all elements of the tangle matrix are significant,
+ // there is nothing to do.
+ if(negligibleCount == 0) {
+ return false;
+ }
+
+ // To avoid breaking contiguity, we require each column and each row of the
+ // tangle matrix to have at least one significant element.
+ // This means that each in-edge will be "merged" with at least one out-edge,
+ // and each out-edge will be "merged" with at least one in-edge.
+ for(uint64_t i0=0; i0<inChains.size(); i0++) {
+ bool foundSignificant = false;
+ for(uint64_t i1=0; i1<outChains.size(); i1++) {
+ if(tangleMatrix[i0][i1] >= detangleToleranceHigh) {
+ foundSignificant = true;
+ break;
+ }
+ }
+ if(not foundSignificant) {
+ return false;
+ }
+ }
+ for(uint64_t i1=0; i1<outChains.size(); i1++) {
+ bool foundSignificant = false;
+ for(uint64_t i0=0; i0<inChains.size(); i0++) {
+ if(tangleMatrix[i0][i1] >= detangleToleranceHigh) {
+ foundSignificant = true;
+ break;
+ }
+ }
+ if(not foundSignificant) {
+ return false;
+ }
+ }
+
+ if(debug) {
+ cout << "This edge can be detangled after some splitting of bubble chains." << endl;
+ }
+
+ // Make sure the last bubble of all in-edges is haploid.
+ // Splitting removes edges, so iterate with explicit iterators,
+ // incrementing before any possible removal.
+ in_edge_iterator itIn, itInEnd;
+ tie(itIn, itInEnd) = in_edges(cv0, cGraph);
+ while(itIn != itInEnd) {
+ const edge_descriptor ce = *itIn;
+ ++itIn; // Increment before possibly removing this edge!
+ const BubbleChain& bubbleChain = cGraph[ce];
+ if(not bubbleChain.lastBubble().isHaploid()) {
+ if(debug) {
+ cout << "In-edge " << bubbleChainStringId(ce) <<
+ " needs to be split at the end." << endl;
+ }
+ splitBubbleChainAtEnd(ce);
+ }
+ }
+
+ // Make sure the first bubble of all out-edges is haploid.
+ out_edge_iterator itOut, itOutEnd;
+ tie(itOut, itOutEnd) = out_edges(cv1, cGraph);
+ while(itOut != itOutEnd) {
+ const edge_descriptor ce = *itOut;
+ ++itOut; // Increment before possibly removing this edge!
+ const BubbleChain& bubbleChain = cGraph[ce];
+ if(not bubbleChain.firstBubble().isHaploid()) {
+ if(debug) {
+ cout << "Out-edge " << bubbleChainStringId(ce) <<
+ " needs to be split at the beginning." << endl;
+ }
+ splitBubbleChainAtBeginning(ce);
+ }
+ }
+
+ // Now we can detangle using detangleEdge.
+ if(debug) {
+ cout << "Calling detangleEdge." << endl;
+ }
+ --it; // Because detangleEdge increments it again.
+ return detangleEdge(debug, edgeMap, it, detangleToleranceLow, detangleToleranceHigh,
+ useBayesianModel, epsilon, minLogP);
+}
+
+
+#if 0
+// NOTE(review): this function and detangleEdgeWithSearch below are compiled
+// out via #if 0 — experimental code kept for reference.
+bool AssemblyGraph::detangleEdgesWithSearch(
+ bool debug,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh)
+{
+ if(debug) {
+ cout << "Detangling edges with search." << endl;
+ }
+
+ AssemblyGraph& cGraph = *this;
+
+ // To safely iterate over edges while removing edges we must use edge ids
+ // as unique identifiers, because edge descriptors can be reused as edges are
+ // deleted and new edges are created.
+ std::map<uint64_t, edge_descriptor> edgeMap;
+ BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+ edgeMap.insert({cGraph[ce].id, ce});
+ }
+
+ uint64_t detangleCount = 0;;
+ for(auto it=edgeMap.begin(); it!=edgeMap.end(); /* Incremented safely by detangleEdgeWithSearch */) {
+ if(detangleEdgeWithSearch(debug, edgeMap, it, detangleToleranceLow, detangleToleranceHigh)) {
+ ++detangleCount;
+ }
+ }
+
+ if(debug) {
+ cout << "Detangled " << detangleCount << " edges." << endl;
+ }
+
+ return detangleCount > 0;
+}
+
+
+
+// NOTE(review): compiled out (inside the #if 0 opened above). As written, this
+// function always returns false after printing the detangling candidates found
+// by GlobalPathGraph::searchForDetangling; everything after the unconditional
+// "return false;" below is further disabled by a nested #if 0.
+bool AssemblyGraph::detangleEdgeWithSearch(
+ bool debug,
+ std::map<uint64_t, edge_descriptor>& edgeMap,
+ std::map<uint64_t, edge_descriptor>::iterator& it,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh)
+{
+ AssemblyGraph& cGraph = *this;
+ const edge_descriptor ce = it->second;
+ ++it;
+
+ // Only try detangling if the edge consists of a single haploid bubble.
+ // Otherwise detangling would lose information.
+ BubbleChain& bubbleChain = cGraph[ce];
+ if(bubbleChain.size() > 1) {
+ return false;
+ }
+ if(bubbleChain.front().size() > 1) {
+ return false;
+ }
+
+
+ const vertex_descriptor cv0 = source(ce, cGraph);
+ const vertex_descriptor cv1 = target(ce, cGraph);
+
+ if(out_degree(cv0, cGraph) != 1) {
+ return false;
+ }
+ if(in_degree(cv1, cGraph) != 1) {
+ return false;
+ }
+
+ if(debug) {
+ cout << "Attempting to detangle edge " << bubbleChainStringId(ce) << " with search." << endl;
+ }
+
+ // Gather the in-edges and check that the last bubble is haploid.
+ vector<edge_descriptor> inEdges;
+ vector<edge_descriptor> backEdges;
+ BGL_FORALL_INEDGES(cv0, ce, cGraph, AssemblyGraph) {
+ const BubbleChain& bubbleChain = cGraph[ce];
+ if(not bubbleChain.lastBubble().isHaploid()) {
+ if(debug) {
+ cout << "Not detangling because the last bubble of in-edge " <<
+ bubbleChainStringId(ce) << " is not haploid." << endl;
+ }
+ return false;
+ }
+ if(source(ce, cGraph) != cv1) {
+ inEdges.push_back(ce);
+ } else {
+ backEdges.push_back(ce);
+ }
+ }
+
+ // Gather the out-edges and check that the first bubble is haploid.
+ // Ignore out-edges going to cv0 (back-edges).
+ vector<edge_descriptor> outEdges;
+ BGL_FORALL_OUTEDGES(cv1, ce, cGraph, AssemblyGraph) {
+ const BubbleChain& bubbleChain = cGraph[ce];
+ if(not bubbleChain.firstBubble().isHaploid()) {
+ if(debug) {
+ cout << "Not detangling because the first bubble of out-edge " <<
+ bubbleChainStringId(ce) << " is not haploid." << endl;
+ }
+ return false;
+ }
+ if(target(ce, cGraph) != cv0) {
+ outEdges.push_back(ce);
+ }
+ }
+
+ if(inEdges.size() == 0 or outEdges.size() == 0) {
+ if(debug) {
+ cout << "Not detangling due to degree (case 1)." << endl;
+ }
+ return false;
+ }
+ if(inEdges.size() != 2 and outEdges.size() != 2) {
+ if(debug) {
+ cout << "Not detangling due to degree (case 2)." << endl;
+ }
+ return false;
+ }
+ if(inEdges.size() != outEdges.size()) {
+ if(debug) {
+ cout << "Not detangling due to degree (case 3)." << endl;
+ }
+ return false;
+ }
+
+
+ // Get the second to last MarkerGraphEdgeIds of the incoming chains.
+ array<MarkerGraphEdgeId, 2> in;
+ for(uint64_t i=0; i<2; i++) {
+ const Chain& chain = cGraph[inEdges[i]].back().front();
+ in[i] = chain.secondToLast();
+ }
+
+ // Get the second MarkerGraphEdgeIds of the outgoing chains.
+ array<MarkerGraphEdgeId, 2> out;
+ for(uint64_t i=0; i<2; i++) {
+ const Chain& chain = cGraph[outEdges[i]].front().front();
+ out[i] = chain.second();
+ }
+ if(debug) {
+ cout << "in " << bubbleChainStringId(inEdges[0]) << " " << bubbleChainStringId(inEdges[1]) << endl;
+ cout << "out " << bubbleChainStringId(outEdges[0]) << " " << bubbleChainStringId(outEdges[1]) << endl;
+ cout << "in " << in[0] << " " << in[1] << endl;
+ cout << "out " << out[0] << " " << out[1] << endl;
+ }
+
+ // Search for detangling candidates and report them, then give up
+ // (experimental: no detangling is actually performed).
+ array<array<vector<MarkerGraphEdgeId>, 2>, 2> detanglingCandidates;
+ GlobalPathGraph::searchForDetangling(
+ in, out,
+ detangleToleranceHigh, detangleToleranceLow,
+ assembler, detanglingCandidates);
+ for(uint64_t i0=0; i0<2; i0++) {
+ for(uint64_t i1=0; i1<2; i1++) {
+ const auto& hits = detanglingCandidates[i0][i1];
+ cout << "Found " << hits.size() << " hits for " << i0 << " " << i1 << ":" << endl;
+ if(not hits.empty()) {
+ copy(hits.begin(), hits.end(), ostream_iterator<MarkerGraphEdgeId>(cout, " "));
+ cout << endl;
+ }
+ }
+ }
+
+ return false;
+
+#if 0
+ // Compute the tangle matrix.
+ vector< vector<uint64_t> > tangleMatrix;
+ computeTangleMatrix(inEdges, outEdges, tangleMatrix, false);
+
+ if(debug) {
+ cout << "Computing tangle matrix for edge " << bubbleChainStringId(ce) << endl;
+
+ cout << "In-edges: ";
+ for(const edge_descriptor ce: inEdges) {
+ cout << " " << bubbleChainStringId(ce);
+ }
+ cout << endl;
+
+ cout << "Out-edges: ";
+ for(const edge_descriptor ce: outEdges) {
+ cout << " " << bubbleChainStringId(ce);
+ }
+ cout << endl;
+
+ cout << "Tangle matrix:" << endl;
+ for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+ const edge_descriptor ce0 = inEdges[i0];
+ for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+ const edge_descriptor ce1 = outEdges[i1];
+ cout <<
+ bubbleChainStringId(ce0) << " " <<
+ bubbleChainStringId(ce1) << " " <<
+ tangleMatrix[i0][i1];
+ if(tangleMatrix[i0][i1] == 0) {
+ cout << " zero tangle matrix element";
+ }
+ cout << endl;
+ }
+ }
+ }
+
+
+
+ // Detangle based on the contents of the tangle matrix.
+ if(useBayesianModel and inEdges.size() == 2 and outEdges.size() == 2) {
+
+ // Use the 2 by 2 Bayesian model for detangling.
+ array< array<uint64_t, 2>, 2> tangleMatrix22;
+ for(uint64_t i=0; i<2; i++) {
+ for(uint64_t j=0; j<2; j++) {
+ tangleMatrix22[i][j] = tangleMatrix[i][j];
+ }
+ }
+
+ // Compute logarithmic probability ratio of in-phase and out-of-phase
+ // against random.
+ double logPin;
+ double logPout;
+ tie(logPin, logPout) = diploidBayesianPhase(tangleMatrix22, epsilon);
+ if(debug) {
+ cout << "logPin = " << logPin << ", logPout = " << logPout << endl;
+ }
+
+ // const bool isInPhase = (logPin >= minLogP) and ((logPin - logPout) >= minLogP);
+ // const bool isOutOfPhase = (logPout >= minLogP) and ((logPout - logPin) >= minLogP);
+ // Ignore the random hypothesis.
+ const bool isInPhase = (logPin - logPout) >= minLogP;
+ const bool isOutOfPhase = (logPout - logPin) >= minLogP;
+
+ if(isInPhase or isOutOfPhase) {
+
+ // We can detangle.
+
+ // Create truncated versions of the inEdges and outEdges.
+ vector<vertex_descriptor> inVertices;
+ for(const edge_descriptor ce: inEdges) {
+ inVertices.push_back(cloneAndTruncateAtEnd(ce));
+ }
+ vector<vertex_descriptor> outVertices;
+ for(const edge_descriptor ce: outEdges) {
+ outVertices.push_back(cloneAndTruncateAtBeginning(ce));
+ }
+
+ if(isInPhase) {
+ const edge_descriptor e0 = connect(inVertices[0], outVertices[0]);
+ const edge_descriptor e1 = connect(inVertices[1], outVertices[1]);
+ if(debug) {
+ cout << "In phase: created " << bubbleChainStringId(e0) << " and " <<
+ bubbleChainStringId(e1) << endl;
+ }
+ } else {
+ const edge_descriptor e0 = connect(inVertices[0], outVertices[1]);
+ const edge_descriptor e1 = connect(inVertices[1], outVertices[0]);
+ if(debug) {
+ cout << "Out of phase phase: created " << bubbleChainStringId(e0) << " and " <<
+ bubbleChainStringId(e1) << endl;
+ }
+ }
+
+ } else {
+
+ // Ambiguous. Don't detangle.
+ if(debug) {
+ cout << "Ambiguous. NOt detangling." << endl;
+ }
+ return false;
+ }
+
+ } else {
+
+
+
+ // We are not using the Bayesian model.
+
+ // Count the number of significant, ambiguous, and negligible elements
+ // in the tangle matrix.
+ uint64_t significantCount = 0;
+ uint64_t ambiguousCount = 0;
+ uint64_t negligibleCount = 0;
+ for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+ for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+ const uint64_t t = tangleMatrix[i0][i1];
+ if(t <= detangleToleranceLow) {
+ ++negligibleCount;
+ } else if(t >= detangleToleranceHigh) {
+ ++significantCount;
+ } else {
+ ++ambiguousCount;
+ }
+ }
+ }
+
+ // If the tangle matrix contains any ambiguous elements, do nothing.
+ if(ambiguousCount > 0) {
+ return false;
+ }
+
+ // There are no ambiguous elements.
+ // If there are no negligible element, that is all elements of the tangle matrix are significant,
+ // there is nothing to do.
+ if(negligibleCount == 0) {
+ return false;
+ }
+
+ // To avoid breaking contiguity, we require each column and each row of the
+ // tangle matrix to have at least one significant element.
+ // This means that each in-edge will be "merged" with at least one out-edge,
+ // and each out-edge will be "merged" with at least one in-edge.
+ // ACTUALLY, FOR MORE ROBUSTNESS, REQUIRE EXACTLY ONE SIGNIFICANT ELEMENT PER ROW AND COLUMN.
+ for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+ uint64_t significantCount = 0;
+ for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+ if(tangleMatrix[i0][i1] >= detangleToleranceHigh) {
+ ++significantCount;
+ }
+ }
+ if(significantCount != 1) {
+ return false;
+ }
+ }
+ for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+ uint64_t significantCount = 0;
+ for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+ if(tangleMatrix[i0][i1] >= detangleToleranceHigh) {
+ ++significantCount;
+ }
+ }
+ if(significantCount != 1) {
+ return false;
+ }
+ }
+
+ #if 0
+ // If an in-edge is also an out-edge, don't detangle.
+ for(const edge_descriptor ce: inEdges) {
+ if(find(outEdges.begin(), outEdges.end(), ce) != outEdges.end()) {
+ if(debug) {
+ cout << "Not degangled because an in-edge is also an out-edge." << endl;
+ }
+ return false;
+ }
+ }
+ #endif
+
+ if(debug) {
+ cout << "This edge will be detangled " << inEdges.size() << " by " << outEdges.size() << endl;
+ }
+
+ // Create truncated versions of the inEdges and outEdges.
+ vector<vertex_descriptor> inVertices;
+ for(const edge_descriptor ce: inEdges) {
+ inVertices.push_back(cloneAndTruncateAtEnd(ce));
+ }
+ vector<vertex_descriptor> outVertices;
+ for(const edge_descriptor ce: outEdges) {
+ outVertices.push_back(cloneAndTruncateAtBeginning(ce));
+ }
+
+
+ // Each significant element of the tangle matrix generates a new edge.
+ for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+ for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+ if(tangleMatrix[i0][i1] >= detangleToleranceHigh) {
+ const edge_descriptor ceNew = connect(inVertices[i0], outVertices[i1]);
+ if(debug) {
+ cout << "Created " << bubbleChainStringId(ceNew) << endl;
+ }
+ }
+ }
+ }
+ }
+
+
+ // Now we can remove cv0, cv1, ce, and all of the in-edges and out-edges.
+ // We have to do this while safely incrementing the edge iterator to point to the
+ // next edge that was not removed.
+ // We already incremented the iterator to point past ce.
+ boost::remove_edge(ce, cGraph);
+ for(const edge_descriptor ce: inEdges) {
+ if(it != edgeMap.end() and cGraph[ce].id == it->first) {
+ ++it;
+ }
+ edgeMap.erase(cGraph[ce].id);
+ boost::remove_edge(ce, cGraph);
+ }
+ for(const edge_descriptor ce: outEdges) {
+ if(it != edgeMap.end() and cGraph[ce].id == it->first) {
+ ++it;
+ }
+ edgeMap.erase(cGraph[ce].id);
+ boost::remove_edge(ce, cGraph);
+ }
+ for(const edge_descriptor ce: backEdges) {
+ if(it != edgeMap.end() and cGraph[ce].id == it->first) {
+ ++it;
+ }
+ edgeMap.erase(cGraph[ce].id);
+ boost::remove_edge(ce, cGraph);
+ }
+ cGraph.removeVertex(cv0);
+ cGraph.removeVertex(cv1);
+
+ return true;
+#endif
+}
+#endif
+
+
+// Detangle short superbubbles with any number of entrances and exits.
+// Returns true if at least one superbubble was detangled.
+bool AssemblyGraph::detangleShortSuperbubbles(
+    bool debug,
+    uint64_t maxOffset1, // Used to define superbubbles
+    uint64_t detangleToleranceLow,
+    uint64_t detangleToleranceHigh,
+    bool useBayesianModel,
+    double epsilon,
+    double minLogP)
+{
+    AssemblyGraph& cGraph = *this;
+
+    // Compute the superbubbles defined by maxOffset1.
+    const Superbubbles superbubbles(cGraph, maxOffset1);
+
+    // Attempt to detangle each superbubble in turn,
+    // counting how many were actually detangled.
+    uint64_t detangledCount = 0;
+    for(uint64_t superbubbleId=0; superbubbleId<superbubbles.size(); superbubbleId++) {
+        const bool success = detangleShortSuperbubble(debug,
+            superbubbles, superbubbleId, detangleToleranceLow, detangleToleranceHigh,
+            useBayesianModel, epsilon, minLogP);
+        if(success) {
+            ++detangledCount;
+        }
+    }
+
+    return detangledCount > 0;
+}
+
+
+
+// Attempt to detangle one short superbubble, with any number of entrances
+// and exits, identified by superbubbleId within the given Superbubbles object.
+// Returns true if the superbubble was detangled; in that case all of its
+// vertices (and their incident edges) are removed from the graph and new
+// edges are created by connecting truncated in-edges to truncated out-edges.
+// Tangle matrix elements <= detangleToleranceLow are treated as negligible,
+// elements >= detangleToleranceHigh as significant, and values in between
+// as ambiguous. If useBayesianModel is true and the tangle matrix is 2 by 2,
+// the epsilon/minLogP Bayesian criterion is used instead of the thresholds.
+// Detangling requires the last bubble of each in-edge and the first bubble
+// of each out-edge to be haploid.
+bool AssemblyGraph::detangleShortSuperbubble(
+    bool debug,
+    const Superbubbles& superbubbles,
+    uint64_t superbubbleId,
+    uint64_t detangleToleranceLow,
+    uint64_t detangleToleranceHigh,
+    bool useBayesianModel,
+    double epsilon,
+    double minLogP)
+{
+    AssemblyGraph& cGraph = *this;
+    const Superbubble& superbubble = superbubbles.getSuperbubble(superbubbleId);
+
+    if(debug) {
+        cout << "Found a superbubble with " << superbubble.size() <<
+            " vertices:";
+        for(const vertex_descriptor cv: superbubble) {
+            cout << " " << cGraph[cv].edgeId;
+        }
+        cout << endl;
+    }
+
+    // Fill in the in-edges and out-edges.
+    // These cannot be computed while constructing the superbubbles
+    // as they can change when other superbubbles are detangled.
+    vector<edge_descriptor> inEdges;
+    vector<edge_descriptor> outEdges;
+    for(const vertex_descriptor cv0: superbubble) {
+        BGL_FORALL_INEDGES(cv0, ce, cGraph, AssemblyGraph) {
+            const vertex_descriptor cv1 = source(ce, cGraph);
+            if(not superbubbles.isInSuperbubble(superbubbleId, cv1)) {
+                inEdges.push_back(ce);
+            }
+        }
+        BGL_FORALL_OUTEDGES(cv0, ce, cGraph, AssemblyGraph) {
+            const vertex_descriptor cv1 = target(ce, cGraph);
+            if(not superbubbles.isInSuperbubble(superbubbleId, cv1)) {
+                outEdges.push_back(ce);
+            }
+        }
+    }
+    const uint64_t inDegree = inEdges.size();
+    const uint64_t outDegree = outEdges.size();
+
+    if(debug) {
+        cout << inDegree << " in-edges:";
+        for(const edge_descriptor ce: inEdges) {
+            cout << " " << bubbleChainStringId(ce);
+        }
+        cout << endl;
+        cout << outDegree << " out-edges:";
+        for(const edge_descriptor ce: outEdges) {
+            cout << " " << bubbleChainStringId(ce);
+        }
+        cout << endl;
+    }
+
+    // A superbubble with no entrances or no exits cannot be detangled.
+    if(inDegree == 0 or outDegree == 0) {
+        if(debug) {
+            cout << "Not detangling due to degree (case 1)." << endl;
+        }
+        return false;
+    }
+
+#if 0
+    // Skip this check. We still want to remove the superbubble if possible.
+    if(inDegree < 2 and outDegree < 2) {
+        if(debug) {
+            cout << "Not detangling due to degree (case 2)." << endl;
+        }
+        return false;
+    }
+#endif
+
+    // This requires the last bubble of each in-edge
+    // and the first bubble of each out-edge to be haploid.
+    bool canDo = true;
+    for(const edge_descriptor ce: inEdges) {
+        const BubbleChain& bubbleChain = cGraph[ce];
+        if(not bubbleChain.lastBubble().isHaploid()) {
+            if(debug) {
+                cout << "Not detangling because the last bubble of in-edge " <<
+                    bubbleChainStringId(ce) << " is not haploid." << endl;
+            }
+            canDo = false;
+            break;
+        }
+    }
+    for(const edge_descriptor ce: outEdges) {
+        const BubbleChain& bubbleChain = cGraph[ce];
+        if(not bubbleChain.firstBubble().isHaploid()) {
+            if(debug) {
+                cout << "Not detangling because the first bubble of out-edge " <<
+                    bubbleChainStringId(ce) << " is not haploid." << endl;
+            }
+            canDo = false;
+            break;
+        }
+    }
+    if(not canDo) {
+        return false;
+    }
+
+
+
+    // If a MarkerGraphEdgeId appears both in the inEdges and in the outEdges,
+    // detangling could generate a chain with two consecutive copies of the same
+    // MarkerGraphEdgeId. Don't detangle.
+    for(const edge_descriptor ce0: inEdges) {
+        const BubbleChain& bubbleChain0 = cGraph[ce0];
+        const Bubble& bubble0 = bubbleChain0.lastBubble();
+        SHASTA_ASSERT(bubble0.isHaploid());
+        const Chain& chain0 = bubble0.front();
+        SHASTA_ASSERT(chain0.size() >= 2);
+        const MarkerGraphEdgeId markerGraphEdgeId0 = chain0[chain0.size() - 2]; // Exclude last
+
+        for(const edge_descriptor ce1: outEdges) {
+            const BubbleChain& bubbleChain1 = cGraph[ce1];
+            const Bubble& bubble1 = bubbleChain1.firstBubble();
+            SHASTA_ASSERT(bubble1.isHaploid());
+            const Chain& chain1 = bubble1.front();
+            SHASTA_ASSERT(chain1.size() >= 2);
+            const MarkerGraphEdgeId markerGraphEdgeId1 = chain1[1]; // Exclude first
+
+            if(markerGraphEdgeId0 == markerGraphEdgeId1) {
+                if(debug) {
+                    cout << "Not detangling due to cycle." << endl;
+                }
+                return false;
+            }
+        }
+    }
+
+
+
+    // Compute the tangle matrix.
+    // NOTE(review): the final boolean argument differs from the "false" used
+    // in detangleBackEdge - confirm its intended meaning against the
+    // computeTangleMatrix declaration.
+    vector< vector<uint64_t> > tangleMatrix;
+    computeTangleMatrix(inEdges, outEdges, tangleMatrix, true);
+
+    if(debug) {
+        cout << "Tangle matrix:" << endl;
+        for(uint64_t i0=0; i0<inDegree; i0++) {
+            const edge_descriptor inEdge = inEdges[i0];
+            for(uint64_t i1=0; i1<outDegree; i1++) {
+                const edge_descriptor outEdge = outEdges[i1];
+
+                cout << bubbleChainStringId(inEdge) << " " <<
+                    bubbleChainStringId(outEdge) << " " << tangleMatrix[i0][i1];
+
+                cout << endl;
+            }
+        }
+    }
+
+
+
+    // Detangle based on the contents of the tangle matrix.
+    if(useBayesianModel and inEdges.size() == 2 and outEdges.size() == 2) {
+
+        // Use the 2 by 2 Bayesian model for detangling.
+        array< array<uint64_t, 2>, 2> tangleMatrix22;
+        for(uint64_t i=0; i<2; i++) {
+            for(uint64_t j=0; j<2; j++) {
+                tangleMatrix22[i][j] = tangleMatrix[i][j];
+            }
+        }
+
+        // Compute logarithmic probability ratio of in-phase and out-of-phase
+        // against random.
+        double logPin;
+        double logPout;
+        tie(logPin, logPout) = diploidBayesianPhase(tangleMatrix22, epsilon);
+        if(debug) {
+            cout << "logPin = " << logPin << ", logPout = " << logPout << endl;
+        }
+
+        // const bool isInPhase = (logPin >= minLogP) and ((logPin - logPout) >= minLogP);
+        // const bool isOutOfPhase = (logPout >= minLogP) and ((logPout - logPin) >= minLogP);
+        // Ignore the random hypothesis.
+        const bool isInPhase = (logPin - logPout) >= minLogP;
+        const bool isOutOfPhase = (logPout - logPin) >= minLogP;
+
+        if(isInPhase or isOutOfPhase) {
+
+            // We can detangle.
+
+            // Create truncated versions of the inEdges and outEdges.
+            vector<vertex_descriptor> inVertices;
+            for(const edge_descriptor ce: inEdges) {
+                inVertices.push_back(cloneAndTruncateAtEnd(ce));
+            }
+            vector<vertex_descriptor> outVertices;
+            for(const edge_descriptor ce: outEdges) {
+                outVertices.push_back(cloneAndTruncateAtBeginning(ce));
+            }
+
+            // Connect the truncated copies according to the inferred phase.
+            if(isInPhase) {
+                connect(inVertices[0], outVertices[0]);
+                connect(inVertices[1], outVertices[1]);
+            } else {
+                connect(inVertices[0], outVertices[1]);
+                connect(inVertices[1], outVertices[0]);
+            }
+
+            // Now we can remove all the vertices in the superbubble.
+            for(const vertex_descriptor cv: superbubble) {
+                clear_vertex(cv, cGraph);
+                remove_vertex(cv, cGraph);
+            }
+
+            return true;
+
+        } else {
+
+            // Ambiguous. Don't detangle.
+            if(debug) {
+                cout << "Ambiguous. Not detangling." << endl;
+            }
+            return false;
+        }
+    }
+
+
+
+    // If getting here, we are not using the Bayesian model.
+
+    // Count the number of significant, ambiguous, and negligible elements
+    // in the tangle matrix.
+    uint64_t significantCount = 0;
+    uint64_t ambiguousCount = 0;
+    uint64_t negligibleCount = 0;
+    for(uint64_t i0=0; i0<inDegree; i0++) {
+        for(uint64_t i1=0; i1<outDegree; i1++) {
+            const uint64_t t = tangleMatrix[i0][i1];
+            if(t <= detangleToleranceLow) {
+                ++negligibleCount;
+            } else if(t >= detangleToleranceHigh) {
+                ++significantCount;
+            } else {
+                ++ambiguousCount;
+            }
+        }
+    }
+
+    // If the tangle matrix contains any ambiguous elements, do nothing.
+    if(ambiguousCount > 0) {
+        if(debug) {
+            cout << "Not detangled because the tangle matrix contains ambiguous elements." << endl;
+        }
+        return false;
+    }
+
+#if 0
+    // (Skip this check - we still want to get rid of the superbubble in that case too!)
+    // There are no ambiguous elements.
+    // If there are no negligible element, that is all elements of the tangle matrix are significant,
+    // there is nothing to do.
+    if(negligibleCount == 0) {
+        if(debug) {
+            cout << "Not detangled because the tangle matrix contains no negligible elements." << endl;
+        }
+        return false;
+    }
+#endif
+
+    // To avoid breaking contiguity, we require each column and each row of the
+    // tangle matrix to have at least one significant element.
+    // This means that each in-edge will be "merged" with at least one out-edge,
+    // and each out-edge will be "merged" with at least one in-edge.
+    bool ok = true;
+    for(uint64_t i0=0; i0<inDegree; i0++) {
+        bool foundSignificant = false;
+        for(uint64_t i1=0; i1<outDegree; i1++) {
+            if(tangleMatrix[i0][i1] >= detangleToleranceHigh) {
+                foundSignificant = true;
+                break;
+            }
+        }
+        if(not foundSignificant) {
+            ok = false;
+            break;
+        }
+    }
+    for(uint64_t i1=0; i1<outDegree; i1++) {
+        bool foundSignificant = false;
+        for(uint64_t i0=0; i0<inDegree; i0++) {
+            if(tangleMatrix[i0][i1] >= detangleToleranceHigh) {
+                foundSignificant = true;
+                break;
+            }
+        }
+        if(not foundSignificant) {
+            ok = false;
+            break;
+        }
+    }
+    if(not ok) {
+        if(debug) {
+            cout << "Not detangled to avoid breaking contiguity." << endl;
+        }
+        return false;
+    }
+
+    if(debug) {
+        cout << "This superbubble will be detangled." << endl;
+    }
+
+    // Create truncated versions of the inEdges and outEdges.
+    vector<vertex_descriptor> inVertices;
+    for(const edge_descriptor ce: inEdges) {
+        inVertices.push_back(cloneAndTruncateAtEnd(ce));
+    }
+    vector<vertex_descriptor> outVertices;
+    for(const edge_descriptor ce: outEdges) {
+        outVertices.push_back(cloneAndTruncateAtBeginning(ce));
+    }
+
+
+
+    // Each significant element of the tangle matrix generates a new edge.
+    for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+        for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+            if(tangleMatrix[i0][i1] >= detangleToleranceHigh) {
+                connect(inVertices[i0], outVertices[i1]);
+            }
+        }
+    }
+    if(debug) {
+        cout << "After creating new edges, nextEdgeId is " << nextEdgeId << endl;
+    }
+
+
+#if 0
+    // Each significant element of the tangle matrix generates a new edge,
+    // obtained by "merging" an in-edge with an out-edge.
+    for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+        const edge_descriptor ce0 = inEdges[i0];
+        const BubbleChain& bubbleChain0 = cGraph[ce0];
+        const Bubble& bubble0 = bubbleChain0.lastBubble();
+        SHASTA_ASSERT(bubble0.isHaploid());
+        const Chain& chain0 = bubble0.front();
+        SHASTA_ASSERT(chain0.size() >= 2);
+        for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+            if(tangleMatrix[i0][i1] < detangleToleranceHigh) {
+                continue;
+            }
+            const edge_descriptor ce1 = outEdges[i1];
+            const BubbleChain& bubbleChain1 = cGraph[ce1];
+            const Bubble& bubble1 = bubbleChain1.firstBubble();
+            SHASTA_ASSERT(bubble1.isHaploid());
+            const Chain& chain1 = bubble1.front();
+            SHASTA_ASSERT(chain1.size() >= 2);
+
+            edge_descriptor eNew;
+            tie(eNew, ignore) = add_edge(source(ce0, cGraph), target(ce1, cGraph), cGraph);
+            AssemblyGraphEdge& newEdge = cGraph[eNew];
+            newEdge.id = nextEdgeId++;
+            BubbleChain& newBubbleChain = newEdge;
+
+            if(debug) {
+                cout << "Merging " <<
+                    bubbleChainStringId(ce0) << " " <<
+                    bubbleChainStringId(ce1) << " into " <<
+                    bubbleChainStringId(eNew) << endl;
+            }
+
+            // Create the new BubbleChain. It is obtained by joining
+            // bubbleChain0 and bubbleChain1, with vertex cv
+            // removed from the end of bubbleChain0
+            // and from the beginning of bubbleChain1.
+            // Here we use the above assumption that
+            // the last bubble of bubbleChain0 and the first bubble of bubbleChain1
+            // are haploid.
+            newBubbleChain = bubbleChain0;
+
+            // Remove the last marker graph edge, which is in the superbubble.
+            Bubble& newBubbleLast = newBubbleChain.back();
+            SHASTA_ASSERT(newBubbleLast.size() == 1);
+            Chain& newChainLast = newBubbleLast.front();
+            newChainLast.resize(newChainLast.size() - 1);
+
+            // Append chain1, except for the first marker graph edge, which is in the superbubble.
+            copy(chain1.begin() + 1, chain1.end(), back_inserter(newChainLast));
+
+            // Append the rest of bubbleChain1.
+            copy(bubbleChain1.begin() + 1, bubbleChain1.end(), back_inserter(newBubbleChain));
+        }
+
+    }
+#endif
+
+    // Now we can remove all the vertices in the superbubble.
+    for(const vertex_descriptor cv: superbubble) {
+        clear_vertex(cv, cGraph);
+        remove_vertex(cv, cGraph);
+    }
+
+    return true;
+}
+
+
+
+// Special treatment to detangle back edges that were too long
+// to be handled by detangleEdges.
+// Returns true if at least one back edge was detangled.
+bool AssemblyGraph::detangleBackEdges(
+    uint64_t detangleToleranceLow,
+    uint64_t detangleToleranceHigh)
+{
+    cout << "Detangling back edges." << endl;
+    AssemblyGraph& cGraph = *this;
+
+    // To safely iterate over edges while removing edges we must use edge ids
+    // as unique identifiers, because edge descriptors can be reused as edges are
+    // deleted and new edges are created.
+    std::map<uint64_t, edge_descriptor> edgeMap;
+    BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+        edgeMap.insert({cGraph[ce].id, ce});
+    }
+
+    // Attempt to detangle each edge. The iterator is advanced by
+    // detangleBackEdge, which may also erase entries from the edgeMap.
+    uint64_t detangleCount = 0;
+    for(auto it=edgeMap.begin(); it!=edgeMap.end(); /* Incremented safely by detangleBackEdge */) {
+        if(detangleBackEdge(edgeMap, it, detangleToleranceLow, detangleToleranceHigh)) {
+            ++detangleCount;
+        }
+    }
+    cout << "Detangled " << detangleCount << " back edges." << endl;
+
+    return detangleCount > 0;
+}
+
+
+
+// Special treatment to detangle back edges that were too long
+// to be handled by detangleEdge.
+// NOTE(review): as currently written this function only gathers the in-edges,
+// out-edges, and tangle matrix and, when debug is enabled, prints them; it
+// always returns false and never modifies the graph. The actual detangling
+// of the back edge appears to be unimplemented.
+bool AssemblyGraph::detangleBackEdge(
+    std::map<uint64_t, edge_descriptor>& edgeMap,
+    std::map<uint64_t, edge_descriptor>::iterator& it,
+    uint64_t detangleToleranceLow,
+    uint64_t detangleToleranceHigh)
+{
+    AssemblyGraph& cGraph = *this;
+    const edge_descriptor ce = it->second;
+    // Advance the caller's iterator now, before any possible edge removal.
+    ++it;
+    // edgeMap.erase(cGraph[ce].id);
+
+    const bool debug = false;
+
+    // Tangle matrix elements <= detangleToleranceLow are treated as negligible.
+    // Tangle matrix elements >= detangleToleranceHigh are treated as significant.
+    // Tangle matrix elements in between are considered ambiguous.
+    SHASTA_ASSERT(detangleToleranceHigh > detangleToleranceLow);
+
+    const vertex_descriptor cv0 = source(ce, cGraph);
+    const vertex_descriptor cv1 = target(ce, cGraph);
+
+    // Check the degrees.
+    if(out_degree(cv0, cGraph) != 1) {
+        return false;
+    }
+    if(in_degree(cv1, cGraph) != 1) {
+        return false;
+    }
+
+    // Look for a back edge: an edge from the target cv1 back to the source cv0.
+    vector<edge_descriptor> backEdges;
+    BGL_FORALL_OUTEDGES(cv1, ce, cGraph, AssemblyGraph) {
+        if(target(ce, cGraph) == cv0) {
+            backEdges.push_back(ce);
+        }
+    }
+    if(backEdges.empty()) {
+        return false;
+    }
+
+    // Only attempt to handle the case with a single back-edge.
+    if(backEdges.size() != 1) {
+        return false;
+    }
+    const edge_descriptor ceBack = backEdges.front();
+
+    if(debug) {
+        cout << "Attempting to detangle edge " << bubbleChainStringId(ce) <<
+            " with back-edge " << bubbleChainStringId(ceBack) << endl;
+    }
+
+    // The back-edge is both an in-edge and an out-edge.
+    // Store it at the first position of both inEdges and outEdges.
+
+    // Gather the in-edges. Each in-edge must end with a haploid bubble.
+    vector<edge_descriptor> inEdges(1, ceBack);
+    BGL_FORALL_INEDGES(cv0, ce, cGraph, AssemblyGraph) {
+        if(ce == ceBack) {
+            continue;
+        }
+        const BubbleChain& bubbleChain = cGraph[ce];
+        if(not bubbleChain.lastBubble().isHaploid()) {
+            if(debug) {
+                cout << "Not detangling because the last bubble of in-edge " <<
+                    bubbleChainStringId(ce) << " is not haploid." << endl;
+            }
+            return false;
+        }
+        inEdges.push_back(ce);
+    }
+
+    // Gather the out-edges. Each out-edge must begin with a haploid bubble.
+    vector<edge_descriptor> outEdges(1, ceBack);
+    BGL_FORALL_OUTEDGES(cv1, ce, cGraph, AssemblyGraph) {
+        if(ce == ceBack) {
+            continue;
+        }
+        const BubbleChain& bubbleChain = cGraph[ce];
+        if(not bubbleChain.firstBubble().isHaploid()) {
+            if(debug) {
+                cout << "Not detangling because the first bubble of out-edge " <<
+                    bubbleChainStringId(ce) << " is not haploid." << endl;
+            }
+            return false;
+        }
+        outEdges.push_back(ce);
+    }
+
+
+    if(debug) {
+
+        // Position 0 of the inEdges and outEdges stores the back-edge.
+
+        cout << "In-edges: ";
+        for(uint64_t i=1; i<inEdges.size(); i++) {
+            const edge_descriptor ce = inEdges[i];
+            cout << " " << bubbleChainStringId(ce);
+        }
+        cout << endl;
+
+        cout << "Out-edges: ";
+        for(uint64_t i=1; i<outEdges.size(); i++) {
+            const edge_descriptor ce = outEdges[i];
+            cout << " " << bubbleChainStringId(ce);
+        }
+        cout << endl;
+    }
+    // Compute the tangle matrix.
+    vector< vector<uint64_t> > tangleMatrix;
+    computeTangleMatrix(inEdges, outEdges, tangleMatrix, false);
+
+    if(debug) {
+        cout << "Tangle matrix:" << endl;
+        for(uint64_t i0=0; i0<inEdges.size(); i0++) {
+            const edge_descriptor ce0 = inEdges[i0];
+            for(uint64_t i1=0; i1<outEdges.size(); i1++) {
+                const edge_descriptor ce1 = outEdges[i1];
+                cout <<
+                    bubbleChainStringId(ce0) << " " <<
+                    bubbleChainStringId(ce1) << " " <<
+                    tangleMatrix[i0][i1];
+                cout << endl;
+            }
+        }
+    }
+
+    // No detangling is performed - see the NOTE in the header comment.
+    return false;
+}
+
+
+
+// Phase the bubble chains of all edges using the PhasingGraph approach.
+void AssemblyGraph::phaseBubbleChainsUsingPhasingGraph(
+    bool debug,
+    uint64_t n, // Maximum number of Chain MarkerGraphEdgeIds to use when computing tangle matrices.
+    uint64_t lowThreshold,
+    uint64_t highThreshold,
+    bool useBayesianModel,
+    double epsilon,
+    double minLogP,
+    uint64_t longBubbleThreshold)
+{
+    AssemblyGraph& cGraph = *this;
+
+    if(debug) {
+        cout << "phaseBubbleChainsUsingPhasingGraph begins." << endl;
+    }
+
+    // Take a snapshot of the edge descriptors before processing begins.
+    vector<edge_descriptor> edgesToPhase;
+    BGL_FORALL_EDGES(e, cGraph, AssemblyGraph) {
+        edgesToPhase.push_back(e);
+    }
+
+    // Phase the bubble chain of each edge.
+    for(const edge_descriptor e: edgesToPhase) {
+        phaseBubbleChainUsingPhasingGraph(e, n, lowThreshold, highThreshold, useBayesianModel, epsilon, minLogP, longBubbleThreshold, debug);
+    }
+
+    if(debug) {
+        cout << "phaseBubbleChainsUsingPhasingGraph ends." << endl;
+    }
+}
+
+
+
+// Phase the bubble chains of all edges using the PhasingTable approach.
+// Debug output is produced when debugOutputFileNamePrefix is non-empty.
+void AssemblyGraph::phaseBubbleChainsUsingPhasingTable(
+    const string& debugOutputFileNamePrefix,
+    double phaseErrorThreshold,
+    double bubbleErrorThreshold,
+    uint64_t longBubbleThreshold)
+{
+    AssemblyGraph& cGraph = *this;
+
+    // A non-empty prefix requests debug output.
+    const bool debug = not debugOutputFileNamePrefix.empty();
+    if(debug) {
+        cout << "phaseBubbleChainsUsingPhasingTable begins." << endl;
+    }
+    performanceLog << timestamp << "AssemblyGraph::phaseBubbleChainsUsingPhasingTable begins." << endl;
+
+    // When debugging, create the directory that will receive
+    // the debug output files.
+    string directoryName;
+    if(debug) {
+        directoryName = debugOutputFileNamePrefix + "-PhasingTables";
+        std::filesystem::create_directory(directoryName);
+    }
+
+    // Take a snapshot of the edge descriptors before processing begins.
+    vector<edge_descriptor> edgesToPhase;
+    BGL_FORALL_EDGES(e, cGraph, AssemblyGraph) {
+        edgesToPhase.push_back(e);
+    }
+
+    // Phase the bubble chain of each edge.
+    for(const edge_descriptor e: edgesToPhase) {
+        string edgeFileNamePrefix;
+        if(debug) {
+            edgeFileNamePrefix = directoryName + "/" + bubbleChainStringId(e);
+        }
+        phaseBubbleChainUsingPhasingTable(
+            edgeFileNamePrefix,
+            e, phaseErrorThreshold, bubbleErrorThreshold, longBubbleThreshold);
+    }
+
+    if(debug) {
+        cout << "phaseBubbleChainsUsingPhasingTable ends." << endl;
+    }
+    performanceLog << timestamp << "AssemblyGraph::phaseBubbleChainsUsingPhasingTable ends." << endl;
+
+}
+
+
+
+// Phase the bubble chain of edge ce using a PhasingGraph.
+// One PhasingGraph vertex is created per diploid bubble in the chain.
+// Edges between pairs of diploid bubbles carry a relative phase (+1/-1)
+// inferred from their tangle matrix, either via the lowThreshold/highThreshold
+// counting criterion or, if useBayesianModel is true, via the epsilon/minLogP
+// Bayesian criterion. The resulting PhasedComponents are then used to
+// replace this edge's BubbleChain via phaseBubbleChainUsingPhasedComponents.
+void AssemblyGraph::phaseBubbleChainUsingPhasingGraph(
+    edge_descriptor ce,
+    uint64_t n, // Maximum number of Chain MarkerGraphEdgeIds to use when computing tangle matrices.
+    uint64_t lowThreshold,
+    uint64_t highThreshold,
+    bool useBayesianModel,
+    double epsilon,
+    double minLogP,
+    uint64_t longBubbleThreshold,
+    bool debug)
+{
+    AssemblyGraph& cGraph = *this;
+    BubbleChain& bubbleChain = cGraph[ce];
+
+    // debug = debug and (cGraph[ce].id == 500048);
+
+    if(debug) {
+        cout << "Phasing " << bubbleChainStringId(ce) << endl;
+    }
+
+    const bool detailedDebug = debug; // (cGraph[ce].id == 49557);
+
+    // If this bubble chain has a single bubble, there is nothing to do.
+    if(bubbleChain.size() == 1) {
+        if(debug) {
+            cout << "Not phased because it has only one bubble." << endl;
+        }
+        return;
+    }
+
+    // Table to contain the Phasing graph vertex corresponding to each diploid bubble.
+    // Indexed by the bubble position in the bubble chains, and contains
+    // PhasingGraph::null_vertex() for non-diploid bubbles.
+    vector<PhasingGraph::vertex_descriptor> vertexTable(bubbleChain.size(), PhasingGraph::null_vertex());
+
+    // Create the PhasingGraph and its vertices, one for
+    // each diploid bubble in the bubble chain.
+    PhasingGraph phasingGraph;
+    for(uint64_t i=0; i<bubbleChain.size(); i++) {
+        if(bubbleChain[i].isDiploid()) {
+            vertexTable[i] = add_vertex({i, 0}, phasingGraph);
+        }
+    }
+
+    // Write a histogram of the bubbles in this bubble chain by ploidy.
+    if(debug) {
+        cout << "Phasing a bubble chain with " << bubbleChain.size() << " bubbles." << endl;
+        vector<uint64_t> histogram;
+        for(const Bubble& bubble: bubbleChain) {
+            const uint64_t ploidy = bubble.size();
+            if(histogram.size() <= ploidy) {
+                histogram.resize(ploidy + 1);
+            }
+            ++histogram[ploidy];
+        }
+        for(uint64_t ploidy=1; ploidy<histogram.size(); ploidy++) {
+            const uint64_t frequency = histogram[ploidy];
+            if(frequency) {
+                cout << frequency << " bubbles of ploidy " << ploidy << endl;
+            }
+        }
+    }
+
+#if 0
+    // If this bubble chain has less than two diploid bubbles, there is nothing to do.
+    uint64_t diploidBubblesCount = 0;
+    for(const Bubble& bubble: bubbleChain) {
+        if(bubble.size() == 2) {
+            ++diploidBubblesCount;
+        }
+    }
+    if(diploidBubblesCount < 2) {
+        if(debug) {
+            cout << "Not phased because it has less than 2 diploid bubbles." << endl;
+        }
+        return;
+    }
+#endif
+
+    // Add edges of the phasing graph.
+    for(uint64_t i0=0; i0<bubbleChain.size()-1; i0++) {
+        const PhasingGraph::vertex_descriptor pv0 = vertexTable[i0];
+        if(pv0 == PhasingGraph::null_vertex()) {
+            continue;
+        }
+
+        // Gather the next-to-last two marker graph edges for the two chains
+        // of this bubble.
+        const Bubble& bubble0 = bubbleChain[i0];
+        SHASTA_ASSERT(bubble0.size() == 2);
+        const Chain& chain00 = bubble0[0];
+        const Chain& chain01 = bubble0[1];
+        const array<MarkerGraphEdgeId, 2> edges0 =
+            {chain00[chain00.size()-2], chain01[chain01.size()-2]};
+
+        for(uint64_t i1=i0+1; i1<bubbleChain.size(); i1++) {
+            const PhasingGraph::vertex_descriptor pv1 = vertexTable[i1];
+            if(pv1 == PhasingGraph::null_vertex()) {
+                continue;
+            }
+
+            // Gather the second marker graph edge of each of the two chains
+            // of this bubble (the edge immediately after the bubble source).
+            const Bubble& bubble1 = bubbleChain[i1];
+            SHASTA_ASSERT(bubble1.size() == 2);
+            const Chain& chain10 = bubble1[0];
+            const Chain& chain11 = bubble1[1];
+            const array<MarkerGraphEdgeId, 2> edges1 =
+                {chain10[1], chain11[1]};
+
+            // Compute the tangle matrix.
+            // For n == 1 use only the single representative edge of each chain;
+            // otherwise use up to n MarkerGraphEdgeIds of each chain.
+            TangleMatrix tangleMatrix;
+            if(n == 1) {
+                for(uint64_t j0=0; j0<2; j0++) {
+                    for(uint64_t j1=0; j1<2; j1++) {
+                        MarkerGraphEdgePairInfo info;
+                        SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(
+                            edges0[j0], edges1[j1], info));
+                        tangleMatrix[j0][j1] = info.common;
+                    }
+                }
+            } else {
+                computeTangleMatrix(
+                    {&chain00, &chain01},
+                    {&chain10, &chain11},
+                    n, tangleMatrix);
+            }
+
+            // Analyze the tangle matrix.
+            int64_t phase;
+            uint64_t minConcordant;
+            uint64_t maxDiscordant;
+            uint64_t total;
+            double logPInPhase;
+            double logPOutOfPhase;
+            tangleMatrix.analyze(
+                lowThreshold,
+                highThreshold,
+                phase,
+                minConcordant,
+                maxDiscordant,
+                total,
+                epsilon,
+                logPInPhase,
+                logPOutOfPhase);
+
+            // If no common reads, stop the loop on i1.
+            if(total == 0) {
+                break;
+            }
+
+            if(detailedDebug) {
+                cout << "Tangle matrix " << i0 << " " << i1 << ": " <<
+                    tangleMatrix[0][0] << " " <<
+                    tangleMatrix[0][1] << " " <<
+                    tangleMatrix[1][0] << " " <<
+                    tangleMatrix[1][1] << endl;
+                cout << "minConcordant " << minConcordant << endl;
+                cout << "maxDiscordant " << maxDiscordant << endl;
+                cout << "log[p(in-phase)/p(random)] = " << logPInPhase <<
+                    " dB, log[p(out-of-phase)/p(random)] = " << logPOutOfPhase << " dB." << endl;
+            }
+
+            // If using the Bayesian model, redefine the phase based on logPInPhase and logPOutOfPhase.
+            if(useBayesianModel) {
+                if((logPInPhase > minLogP) and (logPInPhase - logPOutOfPhase) > minLogP) {
+                    phase = +1;
+                } else if((logPOutOfPhase > minLogP) and (logPOutOfPhase - logPInPhase) > minLogP) {
+                    phase = -1;
+                } else {
+                    phase = 0;
+                }
+            }
+
+            // If not ambiguous, add an edge to the PhasingGraph.
+            if(phase != 0) {
+                boost::add_edge(pv0, pv1, {phase, minConcordant, maxDiscordant, logPInPhase, logPOutOfPhase}, phasingGraph);
+
+                if(detailedDebug) {
+                    cout << " Added phasing graph edge " <<
+                        phasingGraph[pv0].positionInBubbleChain << " " <<
+                        phasingGraph[pv1].positionInBubbleChain << " with minConcordant " <<
+                        minConcordant << ", maxDiscordant " << maxDiscordant << endl;
+                }
+            } else {
+                if(detailedDebug) {
+                    cout << " No phasing graph edge for " <<
+                        phasingGraph[pv0].positionInBubbleChain << " " <<
+                        phasingGraph[pv1].positionInBubbleChain << endl;
+                }
+            }
+
+        }
+    }
+
+    if(debug) {
+        const uint64_t vertexCount = num_vertices(phasingGraph);
+        const uint64_t edgeCount = num_edges(phasingGraph);
+        const double connectivity = 2. * double(edgeCount) / double(vertexCount);
+        cout << "The phasing graph has " << vertexCount <<
+            " vertices and " << edgeCount << " edges."
+            " Average connectivity " << connectivity << endl;
+    }
+
+    phasingGraph.phase1(false, useBayesianModel);
+
+
+
+    // Use the PhasedComponents in the PhasingGraph to create
+    // a new BubbleChain that will replace the existing one.
+    phaseBubbleChainUsingPhasedComponents(debug, ce, phasingGraph.phasedComponents, longBubbleThreshold);
+}
+
+
+
+// Use PhasedComponents to create a new BubbleChain that will replace the existing one.
+// The new chain alternates runs of unphased bubbles (copied verbatim when
+// haploid or when their maxOffset is at least longBubbleThreshold, otherwise
+// collapsed to a haploid bubble containing only the source and target
+// MarkerGraphEdgeIds) with one diploid bubble per PhasedComponent, built by
+// concatenating the phased chains. The id of edge e is left unchanged.
+void AssemblyGraph::phaseBubbleChainUsingPhasedComponents(
+    bool debug,
+    edge_descriptor e,
+    const vector<shared_ptr<PhasedComponent> >& phasedComponents,
+    uint64_t longBubbleThreshold)
+{
+    AssemblyGraph& cGraph = *this;
+    BubbleChain& bubbleChain = cGraph[e];
+
+    BubbleChain newBubbleChain;
+    if(debug) {
+        cout << "Creating the new bubble chain for " << bubbleChainStringId(e) << endl;
+    }
+
+    // Loop over the phased components.
+    for(uint64_t i=0; /* Check later */; i++) {
+
+        // Bubbles in-between phased components, or before the first phased component,
+        // or after the last phased component.
+        {
+            const uint64_t beginPositionInBubbleChain =
+                (i == 0) ? 0 : phasedComponents[i-1]->maxPositionInBubbleChain + 1;
+            const uint64_t endPositionInBubbleChain =
+                (i == phasedComponents.size()) ?
+                bubbleChain.size() :
+                phasedComponents[i]->minPositionInBubbleChain;
+
+            if(debug) {
+                cout << "Adding unphased bubbles at positions [" <<
+                    beginPositionInBubbleChain << "," << endPositionInBubbleChain << ")" << endl;
+            }
+
+            // Use a distinct loop variable name here: the original code
+            // shadowed the outer loop variable i.
+            for(uint64_t position=beginPositionInBubbleChain; position<endPositionInBubbleChain; position++) {
+                const Bubble& bubble = bubbleChain[position];
+
+                // This unphased bubble will be copied verbatim to the new chain if it is
+                // haploid or if it is long.
+                bool copyVerbatim = bubble.isHaploid();
+                if(not copyVerbatim) {
+                    uint64_t averageOffset;
+                    uint64_t minOffset;
+                    uint64_t maxOffset;
+#if 0
+                    if(bubbleOffsetNoException(bubble, averageOffset, minOffset, maxOffset)) {
+                        copyVerbatim = maxOffset >= longBubbleThreshold;
+                    } else {
+                        copyVerbatim = false;
+                    }
+#else
+                    bubbleOffset(bubble, averageOffset, minOffset, maxOffset);
+                    copyVerbatim = maxOffset >= longBubbleThreshold;
+#endif
+                }
+
+                if(copyVerbatim) {
+                    newBubbleChain.push_back(bubble);
+                } else {
+                    // Just add a simple haploid bubble with only the source
+                    // and target MarkerGraphEdgeIds.
+                    Bubble newBubble;
+                    newBubble.resize(1); // Make it haploid
+                    Chain& newChain = newBubble.front(); // Its only chain.
+                    newChain.push_back(bubble.front().front()); // Source MarkerGraphEdgeId
+                    newChain.push_back(bubble.front().back()); // Target MarkerGraphEdgeId
+                    newBubbleChain.push_back(newBubble);
+                }
+            }
+        }
+
+
+
+        // If we are past the last phased component, we are done.
+        if(i == phasedComponents.size()) {
+            break;
+        }
+
+        // Add a diploid bubble for the i-th phased component.
+        const PhasedComponent& phasedComponent = *phasedComponents[i];
+        const uint64_t minPositionInBubbleChain = phasedComponent.minPositionInBubbleChain;
+        const uint64_t maxPositionInBubbleChain = phasedComponent.maxPositionInBubbleChain;
+        if(debug) {
+            cout << "Adding phased bubbles at positions " <<
+                minPositionInBubbleChain << "-" << maxPositionInBubbleChain << endl;
+        }
+        newBubbleChain.emplace_back();
+        Bubble& newBubble = newBubbleChain.back();
+        newBubble.resize(2); // Make it diploid.
+        Chain& newChain0 = newBubble[0]; // The first haplotype after phasing.
+        Chain& newChain1 = newBubble[1]; // The second haplotype after phasing.
+
+        // Add the source MarkerGraphEdgeId.
+        newChain0.push_back(bubbleChain[minPositionInBubbleChain].front().front());
+        newChain1.push_back(bubbleChain[minPositionInBubbleChain].front().front());
+
+        // Add the internal MarkerGraphEdgeIds of all phased diploid bubbles in this PhasedComponent.
+        for(const auto& p: phasedComponent) {
+            const uint64_t positionInBubbleChain = p.first;
+            const int64_t phase = p.second;
+            SHASTA_ASSERT(phase==1 or phase==-1);
+            const Bubble& bubble = bubbleChain[positionInBubbleChain];
+            SHASTA_ASSERT(bubble.isDiploid());
+            // Phase +1 keeps the chains as they are; phase -1 swaps them.
+            const Chain& chain0 = (phase==1) ? bubble[0] : bubble[1];
+            const Chain& chain1 = (phase==1) ? bubble[1] : bubble[0];
+            copy(chain0.begin()+1, chain0.end()-1, back_inserter(newChain0));
+            copy(chain1.begin()+1, chain1.end()-1, back_inserter(newChain1));
+        }
+
+        // Add the target MarkerGraphEdgeId.
+        newChain0.push_back(bubbleChain[maxPositionInBubbleChain].front().back());
+        newChain1.push_back(bubbleChain[maxPositionInBubbleChain].front().back());
+    }
+
+    // Replace the old BubbleChain with the new one, leaving the id of the edge unchanged.
+    newBubbleChain.compress();
+    bubbleChain = newBubbleChain;
+}
+
+
+
+void AssemblyGraph::phaseBubbleChainUsingPhasingTable(
+ const string& debugOutputFileNamePrefix,
+ edge_descriptor e,
+ double phaseErrorThreshold,
+ double bubbleErrorThreshold,
+ uint64_t longBubbleThreshold)
+{
+ AssemblyGraph& cGraph = *this;
+ BubbleChain& bubbleChain = cGraph[e];
+
+ const bool debug = not debugOutputFileNamePrefix.empty();
+
+ cleanupBubbleChainUsingPhasingTable(
+ debug ? (debugOutputFileNamePrefix + "-PreCleanup") : "",
+ e,
+ phaseErrorThreshold,
+ bubbleErrorThreshold,
+ longBubbleThreshold);
+
+
+#if 0
+ // If this bubble chain has a single bubble, there is nothing to do.
+ // NOT TRUE, WE STILL MAY HAVE TO REMOVE SOME BUBBLES.
+ if(bubbleChain.size() == 1) {
+ if(debug) {
+ cout << "Skipped because it has only one bubble." << endl;
+ }
+ return;
+ }
+#endif
+
+ // Create the phasing table for this bubble chain.
+ PhasingTable phasingTable(bubbleChain, assembler.markerGraph, phaseErrorThreshold);
+
+ if(phasingTable.empty()) {
+ if(debug) {
+ cout << "Not phasing because the phasing table is empty." << endl;
+ }
+ return;
+ }
+#if 0
+ // WE STILL MAY HAVE TO REMOVE SOME BUBBLES.
+ if(phasingTable.bubbleCount() < 2) {
+ if(debug) {
+ cout << "Not phasing because the phasing table has less than 2 bubbles." << endl;
+ }
+ return;
+ }
+#endif
+
+ if(debug) {
+ const uint64_t totalCount = phasingTable.entryCount();
+ const uint64_t ambiguousCount = phasingTable.ambiguousEntryCount();
+ const uint64_t unambiguousCount = totalCount - ambiguousCount;
+ const uint64_t bubbleCount = phasingTable.bubbleCount();
+ const uint64_t orientedReadCount = phasingTable.orientedReadCount();
+ const double coverage = double(unambiguousCount) / double(bubbleCount);
+
+ cout << "Phasing table summary for " << bubbleChainStringId(e) << ":" << endl;
+ cout << bubbleCount << " diploid bubbles." << endl;
+ cout << orientedReadCount << " oriented reads." << endl;
+ cout << unambiguousCount << " unambiguous entries." << endl;
+ cout << ambiguousCount << " ambiguous entries." << endl;
+ cout << "Average coverage " << std::round(coverage) << endl;
+ cout << "Average number of diploid bubbles seen by each oriented read " <<
+ std::round(double(unambiguousCount)/double(orientedReadCount)) << endl;
+ }
+
+ // Phasing of the phasing table.
+ phasingTable.greedyPhasing();
+ if(debug) {
+ uint64_t consistentCount;
+ uint64_t inconsistentCount;
+ tie(consistentCount, inconsistentCount) = phasingTable.countConsistentEntries();
+
+ cout << "After greedy phasing, the phasing table has " << consistentCount <<
+ " consistent entries and " << inconsistentCount <<
+ " inconsistent entries (" << consistentCount + inconsistentCount <<
+ " total)." << endl;
+
+ phasingTable.writePng(debugOutputFileNamePrefix + "-Consistency.png",
+ PhasingTable::ColoringMethod::byConsistency);
+ phasingTable.writeCsv(debugOutputFileNamePrefix);
+ phasingTable.writePng(debugOutputFileNamePrefix + "-RelativePhase.png",
+ PhasingTable::ColoringMethod::byRelativePhase);
+ phasingTable.writePng(debugOutputFileNamePrefix + "-DiscreteRelativePhase.png",
+ PhasingTable::ColoringMethod::byDiscreteRelativePhase);
+ }
+
+ // Create the PhasedComponents.
+ phasingTable.constructPhasedComponents(debug);
+
+
+#if 1
+ // Split each PhasedComponent at locations where this is necessary.
+ // Check pairs of adjacent consecutive bubbles in the same phased component.
+ vector< shared_ptr<PhasedComponent> > splitComponents;
+ for(const auto& phasedComponentPointer: phasingTable.phasedComponents) {
+ const PhasedComponent& phasedComponent = *phasedComponentPointer;
+ if(phasedComponent.size() < 2) {
+ break;
+ }
+ if(debug) {
+ cout << "Checking for splitting a PhasedComponent of size " << phasedComponent.size() << endl;
+ }
+ vector<uint64_t> splitComponentsBegin(1, 0);
+ for(uint64_t i=1; i<phasedComponent.size(); i++) {
+ const auto& p0 = phasedComponent[i-1];
+ const auto& p1 = phasedComponent[i];
+ const uint64_t positionInBubbleChain0 = p0.first;
+ const uint64_t positionInBubbleChain1 = p1.first;
+ const int64_t phase0 = p0.second;
+ const int64_t phase1 = p1.second;
+
+ const Bubble& bubble0 = bubbleChain[positionInBubbleChain0];
+ const Bubble& bubble1 = bubbleChain[positionInBubbleChain1];
+ SHASTA_ASSERT(bubble0.isDiploid());
+ SHASTA_ASSERT(bubble1.isDiploid());
+
+ const Chain& chain00 = bubble0[0];
+ const Chain& chain01 = bubble0[1];
+ const Chain& chain10 = (phase0 == phase1) ? bubble1[0] : bubble1[1];
+ const Chain& chain11 = (phase0 == phase1) ? bubble1[1] : bubble1[0];
+
+ MarkerGraphEdgeId e00 = chain00.secondToLast();
+ MarkerGraphEdgeId e01 = chain01.secondToLast();
+ MarkerGraphEdgeId e10 = chain10.second();
+ MarkerGraphEdgeId e11 = chain11.second();
+
+ MarkerGraphEdgePairInfo info;
+ SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(e00, e10, info));
+ const uint64_t common0 = info.common;
+ SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(e01, e11, info));
+ const uint64_t common1 = info.common;
+
+ if(debug) {
+ cout << "Bubble pair: " <<
+ positionInBubbleChain0 << " " <<
+ positionInBubbleChain1 <<
+ ": side 0 " << e00 << " " << e10 << " " << common0 << " " <<
+ ", side 1 " << e01 << " " << e11 << " " << common1 << endl;
+ if(common0 == 0 or common1 == 0) {
+ cout << "No common oriented reads." << endl;
+ }
+ }
+
+ if(common0 == 0 or common1 == 0) {
+ splitComponentsBegin.push_back(i);
+ }
+ }
+ splitComponentsBegin.push_back(phasedComponent.size());
+
+
+ // Split this phased component, if necessary.
+ if(splitComponentsBegin.size() == 2) {
+ // No splitting necessary.
+ splitComponents.push_back(phasedComponentPointer);
+ if(debug) {
+ cout << "No splitting was necessary." << endl;
+ }
+ } else {
+ // Split at the split points.
+ for(uint64_t i=0; i<splitComponentsBegin.size()-1; i++) {
+ const uint64_t begin = splitComponentsBegin[i];
+ const uint64_t end = splitComponentsBegin[i+1];
+ shared_ptr<PhasedComponent> splitComponentPointer = make_shared<PhasedComponent>();
+ copy(phasedComponent.begin() + begin, phasedComponent.begin() + end,
+ back_inserter(*splitComponentPointer));
+ splitComponentPointer->computePositionRange();
+ splitComponents.push_back(splitComponentPointer);
+ if(debug) {
+ cout << "Created a split component at " << begin << " to " << end-1 << " (inclusive)." << endl;
+ }
+ }
+ }
+ }
+ phasingTable.phasedComponents.swap(splitComponents);
+#endif
+
+
+
+ // Remove PhasedComponents consisting of only one short bubble.
+ {
+ vector< shared_ptr<PhasedComponent> > newPhasedComponents;
+ for(const auto& phasedComponent: phasingTable.phasedComponents) {
+ bool keep = true;
+ if(phasedComponent->size() == 1) {
+ const uint64_t positionInBubbleChain = phasedComponent->front().first;
+ const Bubble& bubble = bubbleChain[positionInBubbleChain];
+
+ uint64_t averageOffset;
+ uint64_t minOffset;
+ uint64_t maxOffset;
+ bubbleOffset(bubble, averageOffset, minOffset, maxOffset);
+ if(maxOffset < longBubbleThreshold) {
+ keep = false;
+ }
+ }
+ if(keep) {
+ newPhasedComponents.push_back(phasedComponent);
+ }
+ }
+ phasingTable.phasedComponents.swap(newPhasedComponents);
+ }
+
+
+
+ // Use the phased components to phase the BubbleChain.
+ phaseBubbleChainUsingPhasedComponents(
+ debug,
+ e,
+ phasingTable.phasedComponents,
+ longBubbleThreshold);
+
+}
+
+
+
+// Clean up a bubble chain by removing bubbles that are likely artifacts,
+// as judged by a PhasingTable built for the chain:
+// - Diploid bubbles whose error rate in the PhasingTable exceeds
+//   bubbleErrorThreshold.
+// - Bubbles with ploidy greater than 2.
+// A bubble is kept anyway if its maximum offset is at least
+// longBubbleThreshold. Each removed bubble is replaced by a haploid bubble
+// containing only its source and target MarkerGraphEdgeIds.
+// Debug output is produced only if debugOutputFileNamePrefix is non-empty.
+void AssemblyGraph::cleanupBubbleChainUsingPhasingTable(
+ const string& debugOutputFileNamePrefix,
+ edge_descriptor e,
+ double phaseErrorThreshold,
+ double bubbleErrorThreshold,
+ uint64_t longBubbleThreshold)
+{
+
+ AssemblyGraph& cGraph = *this;
+ BubbleChain& bubbleChain = cGraph[e];
+
+ // An empty prefix doubles as the "no debug output" flag.
+ const bool debug = not debugOutputFileNamePrefix.empty();
+ if(debug) {
+ cout << "Before bubble clean up, bubble chain " <<
+ bubbleChainStringId(e) << " has " << cGraph[e].size() << " bubbles." << endl;
+ }
+
+ // If this bubble chain has a single bubble, there is nothing to do.
+ if(bubbleChain.size() == 1) {
+ if(debug) {
+ cout << "Skipped because it has only one bubble." << endl;
+ }
+ return;
+ }
+
+ // Create the phasing table for this bubble chain.
+ PhasingTable phasingTable(bubbleChain, assembler.markerGraph, phaseErrorThreshold);
+
+ // Nothing to clean up if the phasing table has no entries
+ // or fewer than two diploid bubbles.
+ if(phasingTable.empty()) {
+ return;
+ }
+ if(phasingTable.bubbleCount() < 2) {
+ return;
+ }
+
+ // Write a summary of the phasing table, for debugging only.
+ if(debug) {
+ const uint64_t totalCount = phasingTable.entryCount();
+ const uint64_t ambiguousCount = phasingTable.ambiguousEntryCount();
+ const uint64_t unambiguousCount = totalCount - ambiguousCount;
+ const uint64_t bubbleCount = phasingTable.bubbleCount();
+ const uint64_t orientedReadCount = phasingTable.orientedReadCount();
+ const double coverage = double(unambiguousCount) / double(bubbleCount);
+
+ cout << "Phasing table summary (for bubble cleanup) " << bubbleChainStringId(e) << ":" << endl;
+ cout << bubbleCount << " diploid bubbles." << endl;
+ cout << orientedReadCount << " oriented reads." << endl;
+ cout << unambiguousCount << " unambiguous entries." << endl;
+ cout << ambiguousCount << " ambiguous entries." << endl;
+ cout << "Average coverage " << std::round(coverage) << endl;
+ cout << "Average number of diploid bubbles seen by each oriented read " <<
+ std::round(double(unambiguousCount)/double(orientedReadCount)) << endl;
+ }
+
+ // Phasing of the phasing table.
+ // This is needed so bubbleErrorRate below is meaningful.
+ phasingTable.greedyPhasing();
+ if(debug) {
+ uint64_t consistentCount;
+ uint64_t inconsistentCount;
+ tie(consistentCount, inconsistentCount) = phasingTable.countConsistentEntries();
+
+ cout << "After greedy phasing, the phasing table (for bubble cleanup) has " << consistentCount <<
+ " consistent entries and " << inconsistentCount <<
+ " inconsistent entries (" << consistentCount + inconsistentCount <<
+ " total)." << endl;
+
+ phasingTable.writePng(debugOutputFileNamePrefix + "-Consistency.png",
+ PhasingTable::ColoringMethod::byConsistency);
+ phasingTable.writeCsv(debugOutputFileNamePrefix);
+ phasingTable.writePng(debugOutputFileNamePrefix + "-RelativePhase.png",
+ PhasingTable::ColoringMethod::byRelativePhase);
+ phasingTable.writePng(debugOutputFileNamePrefix + "-DiscreteRelativePhase.png",
+ PhasingTable::ColoringMethod::byDiscreteRelativePhase);
+ }
+
+
+ // Use the PhasingTable to create a new BubbleChain that will replace the existing one.
+ // In the new bubble chain, we remove:
+ // - All diploid bubbles that have a high error rate in the PhasingTable,
+ // unless they are longer than longBubbleThreshold.
+ // - All bubbles with ploidy greater than 2,
+ // unless they are longer than longBubbleThreshold.
+ // Each bubble that is removed is replaced by a haploid bubble consisting
+ // of only the terminal MarkerGraphEdgeIds.
+ BubbleChain newBubbleChain;
+ for(uint64_t positionInBubbleChain = 0; positionInBubbleChain < bubbleChain.size();
+ positionInBubbleChain++) {
+ const Bubble& bubble = bubbleChain[positionInBubbleChain];
+
+ // Decide whether this Bubble will be copied verbatim to the new bubble chain.
+ bool copyVerbatim = false;
+ if(bubble.isHaploid()) {
+ // Haploid bubbles are always kept.
+ copyVerbatim = true;
+ if(debug) {
+ cout << "Bubble at position in bubble chain " << positionInBubbleChain <<
+ " is haploid and will be kept." << endl;
+ }
+ } else if(bubble.isDiploid()) {
+ // Diploid bubbles are kept only if their error rate is low enough.
+ const double bubbleErrorRate = phasingTable.bubbleErrorRate(positionInBubbleChain);
+ if(debug) {
+ cout << "Bubble at phasing table index " << phasingTable.bubblesMap[positionInBubbleChain] <<
+ " position in bubble chain " << positionInBubbleChain <<
+ " has error rate " << bubbleErrorRate;
+ if(bubbleErrorRate <= bubbleErrorThreshold) {
+ cout << " and will be kept." << endl;
+ } else {
+ cout << " and will be removed." << endl;
+ }
+ }
+ if(bubbleErrorRate <= bubbleErrorThreshold) {
+ copyVerbatim = true;
+ }
+ } else {
+ // Ploidy greater than 2: removed, unless long (checked below).
+ if(debug) {
+ cout << "Bubble at position in bubble chain " << positionInBubbleChain <<
+ " has ploidy " << bubble.size() << " and will be removed." << endl;
+ }
+ }
+ // Long bubbles are kept regardless of the above decisions.
+ if(not copyVerbatim) {
+ uint64_t averageOffset;
+ uint64_t minOffset;
+ uint64_t maxOffset;
+ bubbleOffset(bubble, averageOffset, minOffset, maxOffset);
+ copyVerbatim = maxOffset >= longBubbleThreshold;
+ }
+
+ if(copyVerbatim) {
+ newBubbleChain.push_back(bubble);
+ if(debug) {
+ cout << "Bubble at position in bubble chain " << positionInBubbleChain <<
+ " was copied to the new bubble chain." << endl;
+ }
+ } else {
+ // Just add a simple haploid bubble with only the source
+ // and target MarkerGraphEdgeIds.
+ Bubble newBubble;
+ newBubble.resize(1); // Make it haploid
+ Chain& newChain = newBubble.front(); // Its only chain.
+ newChain.push_back(bubble.front().front()); // Source MarkerGraphEdgeId
+ newChain.push_back(bubble.front().back()); // Target MarkerGraphEdgeId
+ newBubbleChain.push_back(newBubble);
+ if(debug) {
+ cout << "Bubble at position in bubble chain " << positionInBubbleChain <<
+ " was replaced by a simple haploid bubble in the new bubble chain: " <<
+ bubble.front().front() << " " << bubble.front().back() << endl;
+ }
+ }
+ }
+ bubbleChain = newBubbleChain;
+
+ if(debug) {
+ cout << "After bubble clean up, bubble chain " <<
+ bubbleChainStringId(e) << " has " << newBubbleChain.size() <<
+ " bubbles of which " <<
+ newBubbleChain.diploidBubbleCount() << " diploid." << endl;
+ const string csvFileName = debugOutputFileNamePrefix + "-ChainsDetails-PostBubbleCleanup.csv";
+ ofstream csv(csvFileName);
+ cout << "For chain details after bubble cleanup, see " << csvFileName << endl;
+ writeChainDetailsCsv(csv, e, true);
+ }
+
+ // The bubble chain was already replaced above; here we only compress it
+ // in place, leaving the id of the edge unchanged.
+ bubbleChain.compress();
+ if(debug) {
+ cout << "After bubble clean up and compression, bubble chain " <<
+ bubbleChainStringId(e) << " has " << newBubbleChain.size() <<
+ " bubbles of which " <<
+ newBubbleChain.diploidBubbleCount() << " diploid." << endl;
+ const string csvFileName = debugOutputFileNamePrefix +
+ "-ChainsDetails-PostBubbleCleanupSAndCompress.csv";
+ ofstream csv(csvFileName);
+ cout << "For chain details after bubble cleanup and compress, see " << csvFileName << endl;
+ writeChainDetailsCsv(csv, e, true);
+ }
+}
+
+
+
+// Compute the tangle matrix between two incoming chains
+// and two outgoing chains, taking into account up to
+// n MarkergraphEdgeIds for each Chain.
+// On return, tangleMatrix[i0][i1] is the number of OrientedReadIds that
+// appear both near the end of inChains[i0] and near the beginning of
+// outChains[i1], after excluding OrientedReadIds that appear in both
+// inChains or in both outChains.
+void AssemblyGraph::computeTangleMatrix(
+ const array<const Chain*, 2> inChains,
+ const array<const Chain*, 2> outChains,
+ uint64_t n,
+ TangleMatrix& tangleMatrix) const
+{
+ // Gather the OrientedReadIds near the end of the inChains.
+ array<vector<OrientedReadId>, 2> allOrientedReadIdsIn;
+ for(uint64_t i=0; i<2; i++) {
+ gatherOrientedReadIdsAtEnd(*inChains[i], n, allOrientedReadIdsIn[i]);
+
+ }
+
+ // Gather the OrientedReadIds near the beginning of the outChains.
+ array<vector<OrientedReadId>, 2> allOrientedReadIdsOut;
+ for(uint64_t i=0; i<2; i++) {
+ gatherOrientedReadIdsAtBeginning(*outChains[i], n, allOrientedReadIdsOut[i]);
+ }
+
+ // Discard OrientedReadIds that appear in both inChains.
+ // NOTE(review): set_difference/set_intersection require sorted input;
+ // the gather functions end with deduplicate, which presumably leaves
+ // the vectors sorted - confirm.
+ array<vector<OrientedReadId>, 2> orientedReadIdsIn;
+ for(uint64_t i=0; i<2; i++) {
+ std::set_difference(
+ allOrientedReadIdsIn[i] .begin(), allOrientedReadIdsIn[i] .end(),
+ allOrientedReadIdsIn[1-i].begin(), allOrientedReadIdsIn[1-i].end(),
+ back_inserter(orientedReadIdsIn[i]));
+ }
+
+ // Discard OrientedReadIds that appear in both outChains.
+ array<vector<OrientedReadId>, 2> orientedReadIdsOut;
+ for(uint64_t i=0; i<2; i++) {
+ std::set_difference(
+ allOrientedReadIdsOut[i] .begin(), allOrientedReadIdsOut[i] .end(),
+ allOrientedReadIdsOut[1-i].begin(), allOrientedReadIdsOut[1-i].end(),
+ back_inserter(orientedReadIdsOut[i]));
+ }
+
+ // Now we can compute the tangle matrix.
+ // Each entry counts the reads common to one in-chain and one out-chain.
+ vector<OrientedReadId> commonOrientedReads;
+ for(uint64_t i0=0; i0<2; i0++) {
+ for(uint64_t i1=0; i1<2; i1++) {
+ commonOrientedReads.clear();
+ set_intersection(
+ orientedReadIdsIn[i0] .begin(), orientedReadIdsIn[i0] .end(),
+ orientedReadIdsOut[i1].begin(), orientedReadIdsOut[i1].end(),
+ back_inserter(commonOrientedReads));
+ tangleMatrix[i0][i1] = commonOrientedReads.size();
+ }
+ }
+}
+
+
+
+// Gather OrientedReadIds from up to n MarkergraphEdgeIds
+// near the end of a chain.
+// The last MarkerGraphEdgeId of the chain is excluded.
+// The result is passed through deduplicate before returning.
+// NOTE(review): uint64_t arithmetic assumes chain.size() >= 2 and n >= 1;
+// otherwise "last" or "first" wraps around and the asserts below fire.
+void AssemblyGraph::gatherOrientedReadIdsAtEnd(
+ const Chain& chain,
+ uint64_t n,
+ vector<OrientedReadId>& orientedReadIds) const
+{
+
+ const uint64_t last = chain.size() - 2; // Exclude last MarkergraphEdgeId.
+ const uint64_t first = (last > (n-1)) ? last + 1 - n : 0; // Use up to n.
+
+ SHASTA_ASSERT(first < chain.size());
+ SHASTA_ASSERT(last < chain.size());
+
+ orientedReadIds.clear();
+ for(uint64_t i=first; i<=last; i++) {
+ const MarkerGraphEdgeId markerGraphEdgeId = chain[i];
+ // All marker intervals on this marker graph edge contribute
+ // their oriented read.
+ const auto& markerIntervals =
+ assembler.markerGraph.edgeMarkerIntervals[markerGraphEdgeId];
+ for(const MarkerInterval& markerInterval: markerIntervals) {
+ orientedReadIds.push_back(markerInterval.orientedReadId);
+ }
+ }
+ deduplicate(orientedReadIds);
+}
+
+
+
+// Gather OrientedReadIds from up to n MarkergraphEdgeIds
+// near the beginning of a chain.
+// The first MarkerGraphEdgeId of the chain is excluded.
+// The result is passed through deduplicate before returning.
+// NOTE(review): assumes chain.size() >= 2; for chain.size() == 1 the
+// assert on "first" below fires.
+void AssemblyGraph::gatherOrientedReadIdsAtBeginning(
+ const Chain& chain,
+ uint64_t n,
+ vector<OrientedReadId>& orientedReadIds) const
+{
+
+ const uint64_t first = 1; // Exclude first MarkergraphEdgeId.
+ const uint64_t last = (chain.size() > (n+1)) ? n : chain.size() - 1;
+
+ SHASTA_ASSERT(first < chain.size());
+ SHASTA_ASSERT(last < chain.size());
+
+ orientedReadIds.clear();
+ for(uint64_t i=first; i<=last; i++) {
+ const MarkerGraphEdgeId markerGraphEdgeId = chain[i];
+ // All marker intervals on this marker graph edge contribute
+ // their oriented read.
+ const auto& markerIntervals =
+ assembler.markerGraph.edgeMarkerIntervals[markerGraphEdgeId];
+ for(const MarkerInterval& markerInterval: markerIntervals) {
+ orientedReadIds.push_back(markerInterval.orientedReadId);
+ }
+ }
+ deduplicate(orientedReadIds);
+}
+
+
+
+// To phase the PhasingGraph, we create an optimal spanning tree
+// using edges in order of decreasing "significance"
+// (increasing maxDiscordant, then decreasing minConcordant),
+// then propagate phases along the spanning tree with a BFS in each
+// connected component. The result is stored in phasedComponents,
+// sorted by increasing position in the bubble chain.
+void AssemblyGraph::PhasingGraph::phase(bool debug)
+{
+ PhasingGraph& phasingGraph = *this;
+
+ // Gather edges by maxDiscordant and minConcordant.
+ // edgeTable[maxDiscordant][minConcordant] contains the
+ // edges with those values of maxDiscordant and minConcordant.
+ // This allows the code later to process edges in order
+ // of increasing maxDiscordant and decreasing minConcordant.
+ vector< vector< vector<edge_descriptor> > > edgeTable;
+ BGL_FORALL_EDGES(pe, phasingGraph, PhasingGraph) {
+ const PhasingGraphEdge& edge = phasingGraph[pe];
+ const uint64_t maxDiscordant = edge.maxDiscordant;
+ const uint64_t minConcordant = edge.minConcordant;
+ if(edgeTable.size() <= maxDiscordant) {
+ edgeTable.resize(maxDiscordant + 1);
+ }
+ vector< vector<edge_descriptor> >& v = edgeTable[maxDiscordant];
+ if(v.size() <= minConcordant) {
+ v.resize(minConcordant + 1);
+ }
+ v[minConcordant].push_back(pe);
+ }
+
+ // Map vertices to integers.
+ // Needed by boost::disjoint_sets, which works on integer indexes.
+ std::map<vertex_descriptor, uint64_t> vertexIndexMap;
+ uint64_t vertexIndex = 0;
+ BGL_FORALL_VERTICES(pv, phasingGraph, PhasingGraph) {
+ vertexIndexMap.insert({pv, vertexIndex++});
+ }
+ const uint64_t vertexCount = vertexIndexMap.size();
+
+
+
+ // Compute optimal spanning tree and connected components.
+ // Kruskal-style: edges are considered in significance order, and an
+ // edge joining two different components becomes a spanning tree edge.
+ vector<uint64_t> rank(vertexCount);
+ vector<uint64_t> parent(vertexCount);
+ boost::disjoint_sets<uint64_t*, uint64_t*> disjointSets(&rank[0], &parent[0]);
+ for(uint64_t i=0; i<vertexCount; i++) {
+ disjointSets.make_set(i);
+ }
+ uint64_t spanningTreeEdgeCount = 0;
+ for(uint64_t maxDiscordant=0; maxDiscordant<edgeTable.size(); maxDiscordant++) {
+ const vector< vector<edge_descriptor> >& v = edgeTable[maxDiscordant];
+ for(int64_t minConcordant=v.size()-1; minConcordant>=0; minConcordant--) {
+ const vector<edge_descriptor>& vv = v[minConcordant];
+ if(false) {
+ cout << "Processing " << vv.size() << " phasing graph edges with maxDiscordant=" <<
+ maxDiscordant << ", minConcordant=" << minConcordant << endl;
+ }
+ for(const edge_descriptor e: vv) {
+ PhasingGraphEdge& edge = phasingGraph[e];
+ const vertex_descriptor pv0 = source(e, phasingGraph);
+ const vertex_descriptor pv1 = target(e, phasingGraph);
+ const uint64_t vertexIndex0 = vertexIndexMap[pv0];
+ const uint64_t vertexIndex1 = vertexIndexMap[pv1];
+ const uint64_t componentId0 = disjointSets.find_set(vertexIndex0);
+ const uint64_t componentId1 = disjointSets.find_set(vertexIndex1);
+ if(componentId0 != componentId1) {
+ disjointSets.union_set(vertexIndex0, vertexIndex1);
+ edge.isSpanningTreeEdge = true;
+ ++spanningTreeEdgeCount;
+ }
+ }
+ if(false) {
+ cout << "Found " << spanningTreeEdgeCount << " spanning tree edges so far." << endl;
+ }
+ }
+ }
+
+ // Gather the vertices in each connected component.
+ // The component id is the disjoint-sets representative index.
+ vector< vector<vertex_descriptor> > components(vertexCount);
+ BGL_FORALL_VERTICES(pv, phasingGraph, PhasingGraph) {
+ const uint64_t componentId = disjointSets.find_set(vertexIndexMap[pv]);
+ components[componentId].push_back(pv);
+ }
+
+ // Write a histogram of component sizes.
+ if(debug) {
+ vector<uint64_t> histogram;
+ for(const vector<vertex_descriptor>& component: components) {
+ const uint64_t componentSize = component.size();
+ if(histogram.size() <= componentSize) {
+ histogram.resize(componentSize + 1, 0);
+ }
+ ++histogram[componentSize];
+ }
+
+ cout << "Histogram of component sizes:" << endl;
+ cout << "Size,Frequency,Vertices" << endl;
+ for(uint64_t componentSize=1; componentSize<histogram.size(); componentSize++) {
+ const uint64_t frequency = histogram[componentSize];
+ if(frequency) {
+ cout << componentSize << "," << frequency << "," << componentSize*frequency << endl;
+ }
+ }
+ }
+
+ // Gather the non-trivial component and sort them by decreasing size.
+ vector< pair<uint64_t, uint64_t> > componentTable; // (componentId, componentSize)
+ for(uint64_t componentId=0; componentId<vertexCount; componentId++) {
+ const vector<vertex_descriptor>& component = components[componentId];
+ if(component.size() > 1) {
+ componentTable.push_back({componentId, component.size()});
+ }
+ }
+ sort(componentTable.begin(), componentTable.end(),
+ OrderPairsBySecondOnlyGreater<uint64_t, uint64_t>());
+
+
+
+ // Process the non-trivial components in order of decreasing size.
+ // Larger components get priority when later components overlap them.
+ phasedComponents.clear();
+ for(const pair<uint64_t, uint64_t>& p: componentTable) {
+ const uint64_t componentId = p.first;
+ const vector<vertex_descriptor>& component = components[componentId];
+ SHASTA_ASSERT(component.size() == p.second);
+ if(debug) {
+ cout << "Processing a phasing component with " << component.size() <<
+ " vertices." << endl;
+ }
+
+ // Use a BFS on the spanning tree to phase the vertices in this component.
+ // It does not matter which vertex we start from.
+ const vertex_descriptor vFirst = component.front();
+ phasingGraph[vFirst].phase = +1;
+ std::queue<vertex_descriptor> q;
+ q.push(vFirst);
+ while(not q.empty()) {
+ const vertex_descriptor v0 = q.front();
+ q.pop();
+ BGL_FORALL_OUTEDGES(v0, e, phasingGraph, PhasingGraph) {
+ PhasingGraphEdge& edge = phasingGraph[e];
+ if(not edge.isSpanningTreeEdge) {
+ continue;
+ }
+ const PhasingGraphVertex& vertex0 = phasingGraph[v0];
+ const vertex_descriptor v1 = target(e, phasingGraph);
+ PhasingGraphVertex& vertex1 = phasingGraph[v1];
+ // phase == 0 means not yet phased; propagate the phase of v0,
+ // flipped if this edge is out of phase.
+ if(vertex1.phase == 0) {
+ vertex1.phase = vertex0.phase;
+ if(edge.phase == -1) {
+ vertex1.phase = - vertex1.phase;
+ }
+ q.push(v1);
+ }
+ }
+ }
+
+ // Count inconsistent edges in this component.
+ if(debug) {
+ uint64_t inconsistentCount = 0;
+ uint64_t totalCount = 0;
+ for(const vertex_descriptor v: component) {
+ BGL_FORALL_OUTEDGES(v, e, phasingGraph, PhasingGraph) {
+ totalCount++;
+ if(not isConsistent(e)) {
+ ++inconsistentCount;
+ }
+ }
+ }
+ // This counts edges twice.
+ inconsistentCount /= 2;
+ totalCount /= 2;
+ cout << inconsistentCount << " inconsistent edges in this component out of " <<
+ totalCount << " total." << endl;
+ }
+
+
+ // Create the PhasedComponent corresponding to this component.
+ // Don't include any vertices that overlap previous PhasedComponent.
+ // NOTE(review): the inner loop variable phasedComponent (a shared_ptr)
+ // shadows the outer reference of the same name.
+ shared_ptr<PhasedComponent> phasedComponentPointer = make_shared<PhasedComponent>();
+ PhasedComponent& phasedComponent = *phasedComponentPointer;
+ for(const vertex_descriptor pv: component) {
+ const PhasingGraphVertex& vertex = phasingGraph[pv];
+ const uint64_t positionInBubbleChain = vertex.positionInBubbleChain;
+ bool overlapsPrevious = false;
+ for(const auto& phasedComponent: phasedComponents) {
+ if(
+ positionInBubbleChain >= phasedComponent->minPositionInBubbleChain and
+ positionInBubbleChain <= phasedComponent->maxPositionInBubbleChain) {
+ overlapsPrevious = true;
+ break;
+ }
+ }
+ if(not overlapsPrevious) {
+ phasedComponent.push_back({vertex.positionInBubbleChain, vertex.phase});
+ }
+ }
+ // Discard components that end up with fewer than two bubbles.
+ if(phasedComponent.size() < 2) {
+ if(debug) {
+ cout << "This component will be discarded due to overlap with previous components." << endl;
+ }
+ continue;
+ }
+ phasedComponent.sort();
+ if(debug) {
+ cout << "Phasing range for this component " << phasedComponent.minPositionInBubbleChain <<
+ " " << phasedComponent.maxPositionInBubbleChain << endl;
+ }
+ phasedComponents.push_back(phasedComponentPointer);
+ }
+
+ // Sort the phased components in order of increasing position.
+ class SortHelper {
+ public:
+ bool operator()(
+ const shared_ptr<PhasedComponent>& p0,
+ const shared_ptr<PhasedComponent>& p1
+ ) const
+ {
+ return p0->minPositionInBubbleChain < p1->minPositionInBubbleChain;
+ }
+ };
+ sort(phasedComponents.begin(), phasedComponents.end(), SortHelper());
+
+ if(debug) {
+ cout << "Kept " << phasedComponents.size() << " phased components:" << endl;
+ for(const auto& phasedComponent: phasedComponents) {
+ cout << phasedComponent->size() << " diploid bubbles at positions " <<
+ phasedComponent->minPositionInBubbleChain << "..." <<
+ phasedComponent->maxPositionInBubbleChain << " in bubble chain." << endl;
+
+ }
+ phasingGraph.writeGraphviz("PhasingGraph.dot");
+ }
+}
+
+
+
+// Sort edges in order of decreasing significance:
+// - If using the Bayesian model, logP.
+// - Otherwise, minConcordant/maxDiscordant
+//   (increasing maxDiscordant, then decreasing minConcordant).
+// The previous contents of sortedEdges are discarded.
+void AssemblyGraph::PhasingGraph::sortEdges(
+ vector<edge_descriptor>& sortedEdges,
+ bool useBayesianModel) const
+{
+ const PhasingGraph& phasingGraph = *this;
+
+ if(useBayesianModel) {
+
+ // Gather edges and their logP.
+ vector< pair<edge_descriptor, double> > edgeTable;
+ BGL_FORALL_EDGES(pe, phasingGraph, PhasingGraph) {
+ const PhasingGraphEdge& edge = phasingGraph[pe];
+ edgeTable.push_back({pe, edge.logP()});
+ }
+
+ // Sort by decreasing logP.
+ sort(edgeTable.begin(), edgeTable.end(),
+ OrderPairsBySecondOnlyGreater<edge_descriptor, double>());
+ sortedEdges.clear();
+ for(const auto& p: edgeTable) {
+ sortedEdges.push_back(p.first);
+ }
+
+ } else {
+
+ // Gather edges by maxDiscordant and minConcordant.
+ // edgeTable[maxDiscordant][minConcordant] contains the
+ // edges with those values of maxDiscordant and minConcordant.
+ vector< vector< vector<edge_descriptor> > > edgeTable;
+ BGL_FORALL_EDGES(pe, phasingGraph, PhasingGraph) {
+ const PhasingGraphEdge& edge = phasingGraph[pe];
+ const uint64_t maxDiscordant = edge.maxDiscordant;
+ const uint64_t minConcordant = edge.minConcordant;
+ if(edgeTable.size() <= maxDiscordant) {
+ edgeTable.resize(maxDiscordant + 1);
+ }
+ vector< vector<edge_descriptor> >& v = edgeTable[maxDiscordant];
+ if(v.size() <= minConcordant) {
+ v.resize(minConcordant + 1);
+ }
+ v[minConcordant].push_back(pe);
+ }
+
+ // The sorted edges are in order of increasing maxDiscordant
+ // and decreasing minConcordant.
+ // Note the inner loop index must be signed to terminate.
+ sortedEdges.clear();
+ for(uint64_t maxDiscordant=0; maxDiscordant<edgeTable.size(); maxDiscordant++) {
+ const vector< vector<edge_descriptor> >& v = edgeTable[maxDiscordant];
+ for(int64_t minConcordant=v.size()-1; minConcordant>=0; minConcordant--) {
+ const vector<edge_descriptor>& vv = v[minConcordant];
+ for(const edge_descriptor e: vv) {
+ sortedEdges.push_back(e);
+ }
+ }
+ }
+
+ }
+}
+
+
+
+// To phase the PhasingGraph, we create an optimal spanning tree
+// using edges in order of decreasing "significance".
+// We do this iteratively. At each iteration we process the largest
+// connected component of the surviving PhasingGraph.
+void AssemblyGraph::PhasingGraph::phase1(bool debug, bool useBayesianModel)
+{
+ PhasingGraph& phasingGraph = *this;
+ phasedComponents.clear();
+
+ if(debug) {
+ cout << "Beginning phasing for a PhasingGraph with " << num_vertices(phasingGraph) <<
+ " vertices." << endl;
+ }
+
+ // Main iteration loop.
+ while(true) {
+
+ // Clear the isSpanningTreeEdge flag of all edges.
+ BGL_FORALL_EDGES(pe, phasingGraph, PhasingGraph) {
+ phasingGraph[pe].isSpanningTreeEdge = false;
+ }
+
+ // Sort edges in order of decreasing significance:
+ // - If using the Bayesian model, logP.
+ // - Otherwise, minConcordant/maxDiscordant.
+ vector<edge_descriptor> sortedEdges;
+ sortEdges(sortedEdges, useBayesianModel);
+
+ // Map vertices to integers.
+ // This is needed for the computation of the spanning tree and
+ // connected components.
+ std::map<vertex_descriptor, uint64_t> vertexIndexMap;
+ uint64_t vertexIndex = 0;
+ BGL_FORALL_VERTICES(pv, phasingGraph, PhasingGraph) {
+ vertexIndexMap.insert({pv, vertexIndex++});
+ }
+ const uint64_t vertexCount = vertexIndexMap.size();
+
+ if(debug) {
+ cout << "Beginning a new phasing iteration. The phasing graph has " <<
+ vertexCount << " vertices left." << endl;
+ }
+
+
+
+ // Compute optimal spanning tree and connected components.
+ vector<uint64_t> rank(vertexCount);
+ vector<uint64_t> parent(vertexCount);
+ boost::disjoint_sets<uint64_t*, uint64_t*> disjointSets(&rank[0], &parent[0]);
+ for(uint64_t i=0; i<vertexCount; i++) {
+ disjointSets.make_set(i);
+ }
+ uint64_t spanningTreeEdgeCount = 0;
+
+ for(const edge_descriptor e: sortedEdges) {
+ PhasingGraphEdge& edge = phasingGraph[e];
+ const vertex_descriptor pv0 = source(e, phasingGraph);
+ const vertex_descriptor pv1 = target(e, phasingGraph);
+ const uint64_t vertexIndex0 = vertexIndexMap[pv0];
+ const uint64_t vertexIndex1 = vertexIndexMap[pv1];
+ const uint64_t componentId0 = disjointSets.find_set(vertexIndex0);
+ const uint64_t componentId1 = disjointSets.find_set(vertexIndex1);
+ if(componentId0 != componentId1) {
+ disjointSets.union_set(vertexIndex0, vertexIndex1);
+ edge.isSpanningTreeEdge = true;
+ ++spanningTreeEdgeCount;
+ }
+ }
+
+ // Gather the vertices in each connected component.
+ vector< vector<vertex_descriptor> > components(vertexCount);
+ BGL_FORALL_VERTICES(pv, phasingGraph, PhasingGraph) {
+ const uint64_t componentId = disjointSets.find_set(vertexIndexMap[pv]);
+ components[componentId].push_back(pv);
+ }
+
+ // Find the largest connected component.
+ uint64_t largestComponentId = invalid<uint64_t>;
+ uint64_t largestComponentSize = 0;
+ for(uint64_t componentId=0; componentId<vertexCount; componentId++) {
+ const uint64_t componentSize = components[componentId].size();
+ if(componentSize > largestComponentSize) {
+ largestComponentSize = componentSize;
+ largestComponentId = componentId;
+ }
+ }
+
+ // If the largest component has less than two vertices, we are done.
+ if(largestComponentSize < 2) {
+ if(debug) {
+ cout << "Phasing terminates because only trivial connected components were found." << endl;
+ }
+ break;
+ }
+
+ // Access the largest connected component, which we will be working on
+ // for the rest of this iteration.
+ const vector<vertex_descriptor>& component = components[largestComponentId];
+ SHASTA_ASSERT(component.size() == largestComponentSize);
+ if(debug) {
+ cout << "The largest component of the current PhasingGraph has " <<
+ largestComponentSize << " vertices." << endl;
+ }
+
+ // Use a BFS on the spanning tree to phase the vertices in this component.
+ // It does not matter which vertex we start from.
+ const vertex_descriptor vFirst = component.front();
+ phasingGraph[vFirst].phase = +1;
+ std::queue<vertex_descriptor> q;
+ q.push(vFirst);
+ while(not q.empty()) {
+ const vertex_descriptor v0 = q.front();
+ q.pop();
+ BGL_FORALL_OUTEDGES(v0, e, phasingGraph, PhasingGraph) {
+ PhasingGraphEdge& edge = phasingGraph[e];
+ if(not edge.isSpanningTreeEdge) {
+ continue;
+ }
+ const PhasingGraphVertex& vertex0 = phasingGraph[v0];
+ const vertex_descriptor v1 = target(e, phasingGraph);
+ PhasingGraphVertex& vertex1 = phasingGraph[v1];
+ if(vertex1.phase == 0) {
+ vertex1.phase = vertex0.phase;
+ if(edge.phase == -1) {
+ vertex1.phase = - vertex1.phase;
+ }
+ q.push(v1);
+ }
+ }
+ }
+
+ // Count inconsistent edges in this component.
+ if(debug) {
+ uint64_t inconsistentCount = 0;
+ uint64_t totalCount = 0;
+ for(const vertex_descriptor v: component) {
+ BGL_FORALL_OUTEDGES(v, e, phasingGraph, PhasingGraph) {
+ totalCount++;
+ if(not isConsistent(e)) {
+ ++inconsistentCount;
+ }
+ }
+ }
+ // This counts edges twice.
+ inconsistentCount /= 2;
+ totalCount /= 2;
+ cout << inconsistentCount << " inconsistent edges in this component out of " <<
+ totalCount << " total." << endl;
+ }
+
+ // All vertices in this component have been phased.
+ // However, when creating the PhasedComponent, we have to make sure that adjacent
+ // phased vertices have common reads.
+ // To guarantee this, we find a longest path in this component, in order of increasing
+ // positionInBubbleChain. Only vertices in this longest path are then included in the
+ // PhasedComponent.
+
+ // To find this longest path, we use an algorithm similar to the one in longestPath.cpp,
+ // using the topological ordering induced by positionInBubbleChain.
+
+ // Table of the vertices in order of increasing positionInBubbleChain.
+ vector< pair<vertex_descriptor, uint64_t> > vertexTable;
+ for(const vertex_descriptor v: component) {
+ vertexTable.push_back({v, phasingGraph[v].positionInBubbleChain});
+ }
+ sort(vertexTable.begin(), vertexTable.end(), OrderPairsBySecondOnly<vertex_descriptor, uint64_t>());
+
+ // The length of the longest path ending at each vertex.
+ std::map<vertex_descriptor, uint64_t> lengthMap;
+ for(const vertex_descriptor v: component) {
+ lengthMap.insert(make_pair(v, 0));
+ }
+
+ // Process the vertices in order of increasing positionInBubbleChain.
+ for(const auto& p: vertexTable) {
+ const vertex_descriptor v0 = p.first;
+ const uint64_t positionInBubbleChain0 = phasingGraph[v0].positionInBubbleChain;
+
+ uint64_t maximumLength = 0;
+ BGL_FORALL_OUTEDGES_T(v0, e, phasingGraph, PhasingGraph) {
+ const vertex_descriptor v1 = target(e, phasingGraph);
+ const uint64_t positionInBubbleChain1 = phasingGraph[v1].positionInBubbleChain;
+
+ if(positionInBubbleChain1 < positionInBubbleChain0) {
+ maximumLength = max(maximumLength, lengthMap[v1]);
+ }
+ }
+ lengthMap[v0] = maximumLength + 1;
+ }
+
+ // Find the vertex with the longest length.
+ // This will be the end of the longest path.
+ vertex_descriptor v = PhasingGraph::null_vertex();
+ uint64_t maximumLength = 0;
+ for(const auto& p: lengthMap) {
+ if(p.second > maximumLength) {
+ v = p.first;
+ maximumLength = p.second;
+ }
+ }
+
+ // Constuct the path, moving backward from here.
+ vector<vertex_descriptor> longestPath;
+ longestPath.push_back(v);
+ while(true) {
+ vertex_descriptor vPrevious = PhasingGraph::null_vertex();
+ uint64_t maximumLength = 0;
+ BGL_FORALL_OUTEDGES(v, e, phasingGraph, PhasingGraph) {
+ const vertex_descriptor v0 = target(e, phasingGraph);
+ if(phasingGraph[v0].positionInBubbleChain < phasingGraph[v].positionInBubbleChain) {
+ const uint64_t length = lengthMap[v0];
+ if(length > maximumLength) {
+ vPrevious = v0;
+ maximumLength = length;
+ }
+ }
+ }
+ if(vPrevious == PhasingGraph::null_vertex()) {
+ break;
+ }
+ v = vPrevious;
+ longestPath.push_back(v);
+
+ }
+ std::reverse(longestPath.begin(), longestPath.end());
+
+ if(debug) {
+ cout << "The longest path contains " << longestPath.size() << " vertices." << endl;
+ }
+
+
+
+ // If the longest path is non-trivial, use it to create a new PhasedComponent.
+ if(longestPath.size() > 1) {
+ if(debug) {
+ cout << "Creating a new PhasedComponent." << endl;
+ }
+ shared_ptr<PhasedComponent> phasedComponentPointer = make_shared<PhasedComponent>();
+ phasedComponents.push_back(phasedComponentPointer);
+ PhasedComponent& phasedComponent = *phasedComponentPointer;
+
+ for(const vertex_descriptor v: longestPath) {
+ const PhasingGraphVertex& vertex = phasingGraph[v];
+ phasedComponent.push_back({vertex.positionInBubbleChain, vertex.phase});
+ }
+ phasedComponent.minPositionInBubbleChain = phasingGraph[longestPath.front()].positionInBubbleChain;
+ phasedComponent.maxPositionInBubbleChain = phasingGraph[longestPath.back()].positionInBubbleChain;
+ if(debug) {
+ cout << "Phasing range for this component " << phasedComponent.minPositionInBubbleChain <<
+ " " << phasedComponent.maxPositionInBubbleChain << endl;
+ }
+
+ // Now remove from the PhasingGraph all vertices of this component
+ // plus any vertices with a positionInBubbleChain
+ // that overlaps this phased component.
+ vector<vertex_descriptor> verticesToBeRemoved = component;
+ BGL_FORALL_VERTICES(v, phasingGraph, PhasingGraph) {
+ const uint64_t positionInBubbleChain = phasingGraph[v].positionInBubbleChain;
+ if( positionInBubbleChain >= phasedComponent.minPositionInBubbleChain and
+ positionInBubbleChain <= phasedComponent.maxPositionInBubbleChain) {
+ verticesToBeRemoved.push_back(v);
+ }
+ }
+ deduplicate(verticesToBeRemoved);
+ for(const vertex_descriptor v: verticesToBeRemoved) {
+ clear_vertex(v, phasingGraph);
+ remove_vertex(v, phasingGraph);
+ }
+ } else {
+
+ // Now remove from the PhasingGraph all vertices of this component.
+ for(const vertex_descriptor v: component) {
+ clear_vertex(v, phasingGraph);
+ remove_vertex(v, phasingGraph);
+ }
+ }
+ }
+
+
+
+ // Sort the phased components in order of increasing position.
+ class SortHelper {
+ public:
+ bool operator()(
+ const shared_ptr<PhasedComponent>& p0,
+ const shared_ptr<PhasedComponent>& p1
+ ) const
+ {
+ return p0->minPositionInBubbleChain < p1->minPositionInBubbleChain;
+ }
+ };
+ sort(phasedComponents.begin(), phasedComponents.end(), SortHelper());
+
+ if(debug) {
+ cout << phasedComponents.size() << " phased components:" << endl;
+ for(const auto& phasedComponent: phasedComponents) {
+ cout << phasedComponent->size() << " diploid bubbles at positions " <<
+ phasedComponent->minPositionInBubbleChain << "..." <<
+ phasedComponent->maxPositionInBubbleChain << " in bubble chain." << endl;
+
+ }
+ // phasingGraph.writeGraphviz("PhasingGraph.dot");
+ }
+}
+
+
+
+// Return true if the phases assigned to the two vertices of this edge
+// agree with the phase stored on the edge itself:
+// an edge with phase +1 wants equal vertex phases,
+// an edge with phase -1 wants opposite vertex phases.
+bool AssemblyGraph::PhasingGraph::isConsistent(edge_descriptor e) const
+{
+ const PhasingGraph& graph = *this;
+ const int64_t phase0 = graph[source(e, graph)].phase;
+ const int64_t phase1 = graph[target(e, graph)].phase;
+ const int64_t edgePhase = graph[e].phase;
+
+ SHASTA_ASSERT(phase0==+1 or phase0==-1);
+ SHASTA_ASSERT(phase1==+1 or phase1==-1);
+ SHASTA_ASSERT(edgePhase==+1 or edgePhase==-1);
+
+ // All three values are +-1, so the edge is consistent exactly when
+ // the product of the two vertex phases equals the edge phase.
+ return phase0 * phase1 == edgePhase;
+}
+
+
+
+// Write the PhasingGraph in Graphviz format (an undirected "graph").
+// Each vertex is labeled by its positionInBubbleChain.
+// Spanning tree edges are colored green; inconsistent edges
+// (as defined by isConsistent) are colored red.
+void AssemblyGraph::PhasingGraph::writeGraphviz(const string& fileName) const
+{
+ const PhasingGraph& phasingGraph = *this;
+
+ ofstream dot(fileName);
+ dot << "graph PhasingGraph {\n";
+
+ // One Graphviz edge per PhasingGraph edge. Vertices appear implicitly.
+ BGL_FORALL_EDGES(e, phasingGraph, PhasingGraph) {
+ const vertex_descriptor v0 = source(e, phasingGraph);
+ const vertex_descriptor v1 = target(e, phasingGraph);
+ dot <<
+ phasingGraph[v0].positionInBubbleChain << "--" <<
+ phasingGraph[v1].positionInBubbleChain;
+ // Color code: green for spanning tree edges, red for inconsistent edges.
+ if(phasingGraph[e].isSpanningTreeEdge) {
+ dot << " [color=green]";
+ } else if(not isConsistent(e)) {
+ dot << " [color=red]";
+ }
+ dot << ";\n";
+ }
+
+ dot << "}\n";
+}
+
+
+
+// Classify this 2x2 tangle matrix as unambiguously in phase (phase=+1),
+// unambiguously out of phase (phase=-1), or ambiguous (phase=0).
+// Each matrix element is first classified as low (<=lowThreshold),
+// high (>=highThreshold), or ambiguous (in between); the phase call
+// requires both concordant elements high and both discordant elements low.
+// Also returns, via logPin/logPout, the Bayesian log-likelihood ratios
+// computed by diploidBayesianPhase, in decibels.
+void AssemblyGraph::TangleMatrix::analyze(
+ uint64_t lowThreshold,
+ uint64_t highThreshold,
+ int64_t& phase,
+ uint64_t& minConcordant,
+ uint64_t& maxDiscordant,
+ uint64_t& total,
+ double epsilon,
+ double& logPin, // log[P(in-phase)/P(random)] in decibels
+ double& logPout // log[P(out-of-phase)/P(random)] in decibels
+ ) const
+{
+ const TangleMatrix& m = *this;
+
+ // Classify matrix elements:
+ // 0 = low (<=lowThreshold)
+ // 1 = ambiguous (>lowThreshold, <highThreshold)
+ // 2 = high (>=highThreshold)
+ array< array<uint64_t, 2>, 2> c;
+ total = 0;
+ for(uint64_t i=0; i<2; i++) {
+ for(uint64_t j=0; j<2; j++) {
+ const uint64_t matrixElement = m[i][j];
+ total += matrixElement;
+ uint64_t& classification = c[i][j];
+ if(matrixElement <= lowThreshold) {
+ classification = 0;
+ } else if(matrixElement >= highThreshold) {
+ classification = 2;
+ } else {
+ classification = 1;
+ }
+ }
+ }
+
+ // Check if this tangle matrix is unambiguously in phase.
+ if(c[0][0]==2 and c[1][1]==2 and c[0][1]==0 and c[1][0]==0) {
+ phase = +1;
+ minConcordant = min(m[0][0], m[1][1]);
+ maxDiscordant = max(m[0][1], m[1][0]);
+ }
+
+ // Check if this tangle matrix is unambiguously out of phase.
+ else if(c[0][1]==2 and c[1][0]==2 and c[0][0]==0 and c[1][1]==0) {
+ phase = -1;
+ minConcordant = min(m[0][1], m[1][0]);
+ // Bug fix: the discordant elements in the out-of-phase case are
+ // m[0][0] and m[1][1]; the previous code computed max(m[0][0], m[0][0]),
+ // which compared an element with itself and ignored m[1][1].
+ maxDiscordant = max(m[0][0], m[1][1]);
+ }
+
+ // Otherwise, it is ambiguous.
+ else {
+ phase = 0;
+ minConcordant = 0;
+ maxDiscordant = 0;
+ }
+
+ tie(logPin, logPout) = diploidBayesianPhase(m, epsilon);
+}
+
+
+
+// Collapse consecutive haploid bubbles of a BubbleChain into a single
+// haploid bubble, concatenating their chains.
+// Returns true if any compression was done, false otherwise.
+bool BubbleChain::compress()
+{
+ BubbleChain& bubbleChain = *this;
+ BubbleChain newBubbleChain;
+
+ // If this bubble chain consists of a single bubble, there is nothing to compress.
+ if(size() == 1) {
+ return false;
+ }
+
+ // Look for pairs of consecutive haploid bubbles.
+ // If none found, return false without modifying the bubble chain.
+ bool found = false;
+ for(uint64_t i1=1; i1<size(); i1++) {
+ const uint64_t i0 = i1 - 1;
+ const Bubble& bubble0 = bubbleChain[i0];
+ const Bubble& bubble1 = bubbleChain[i1];
+ if(bubble0.isHaploid() and bubble1.isHaploid()) {
+ found = true;
+ break;
+ }
+ }
+ if(not found) {
+ return false;
+ }
+
+
+
+ // Build the compressed bubble chain, merging runs of consecutive haploid bubbles.
+ for(uint64_t i=0; i<size(); i++) {
+ const Bubble& bubble = bubbleChain[i];
+
+ if(bubble.isHaploid()) {
+
+ // This bubble is haploid.
+ // If the last bubble of the new bubble chain is also haploid,
+ // merge this bubble into it by concatenating their chains.
+ // Otherwise append this bubble to the new bubble chain.
+ if(not newBubbleChain.empty() and newBubbleChain.back().isHaploid()) {
+ const Chain& chain = bubble.front();
+ Chain& newChain = newBubbleChain.back().front();
+ // Skip the first MarkerGraphEdgeId: it equals the last one already
+ // in newChain, because consecutive chains share their boundary edge.
+ copy(chain.begin()+1, chain.end(), back_inserter(newChain));
+ } else {
+ newBubbleChain.push_back(bubble);
+ }
+ } else {
+
+ // This bubble is not haploid. Just append it to the new bubble chain.
+ newBubbleChain.push_back(bubble);
+ }
+
+ }
+
+ // Replace this bubble chain with the compressed one.
+ bubbleChain = newBubbleChain;
+
+ return true;
+}
+
+
+
+// Sequentially assemble the sequence of a single Chain:
+// run one local assembly step per pair of consecutive marker graph edges,
+// then combine the step sequences with the marker graph edge sequences.
+void AssemblyGraph::assembleChain(
+ Chain& chain,
+ uint64_t chainTerminalCommonThreshold)
+{
+ // One assembly step for each pair of consecutive marker graph edges.
+ const uint64_t stepCount = chain.size() - 1;
+ chain.stepSequences.resize(stepCount);
+
+ // Run all the assembly steps.
+ for(uint64_t stepIndex=0; stepIndex!=stepCount; stepIndex++) {
+ runAssemblyStep(chain, stepIndex, chainTerminalCommonThreshold);
+ }
+
+ // Stitch the step sequences together with the edge sequences.
+ combineStepSequences(chain);
+ chain.wasAssembled = true;
+}
+
+
+
+// Multithreaded version of sequence assembly.
+// This only assembles the chains that have the shouldBeAssembled flag set.
+// Phase 1 (serial): gather one AssemblyStep per pair of consecutive
+// marker graph edges of every flagged Chain.
+// Phase 2 (parallel): run the steps via runThreads.
+// Phase 3 (serial): combine the step sequences of each flagged Chain.
+void AssemblyGraph::assembleChainsMultithreaded(
+ uint64_t chainTerminalCommonThreshold,
+ uint64_t threadCount)
+{
+ AssemblyGraph& assemblyGraph = *this;
+
+ // Store the argument so the threads can see it.
+ assembleChainsMultithreadedData.chainTerminalCommonThreshold = chainTerminalCommonThreshold;
+
+ // Gather AssemblySteps for all the Chains.
+ auto& assemblySteps = assembleChainsMultithreadedData.assemblySteps;
+ assemblySteps.clear();
+
+ // Loop over BubbleChains.
+ // assemblyStep is reused as a scratch value: its fields are filled in
+ // incrementally by the nested loops, then copied into assemblySteps.
+ AssemblyStep assemblyStep;
+ BGL_FORALL_EDGES(e, assemblyGraph, AssemblyGraph) {
+ assemblyStep.e = e;
+ BubbleChain& bubbleChain = assemblyGraph[e];
+
+ // Loop over Bubbles in this BubbleChain.
+ for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) {
+ assemblyStep.positionInBubbleChain = positionInBubbleChain;
+ Bubble& bubble = bubbleChain[positionInBubbleChain];
+
+ // Loop over Chains in this Bubble.
+ for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) {
+ assemblyStep.indexInBubble = indexInBubble;
+ Chain& chain = bubble[indexInBubble];
+ SHASTA_ASSERT(chain.size() >= 2);
+
+ // If this Chain is not marked to be assembled, skip it.
+ if(not chain.shouldBeAssembled) {
+ continue;
+ }
+
+ // Prepare the vectors where the threads will store
+ // the internal sequence assembled for each AssemblyStep.
+ // Each of these vectors will be modified by only one thread.
+ chain.stepSequences.resize(chain.size() - 1);
+
+ // Loop over pairs of consecutive vertices in this Chain.
+ for(uint64_t positionInChain=0; positionInChain<chain.size()-1; positionInChain++) {
+ assemblyStep.positionInChain = positionInChain;
+
+ // Compute the offset. It is stored in the step and used
+ // below to sort the steps for load balancing.
+ const MarkerGraphEdgeId edgeIdA = chain[positionInChain];
+ const MarkerGraphEdgeId edgeIdB = chain[positionInChain + 1];
+ MarkerGraphEdgePairInfo info;
+ SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(
+ edgeIdA, edgeIdB, info));
+ assemblyStep.offsetInBases = info.offsetInBases;
+
+ // Store this assembly step.
+ assemblySteps.push_back(assemblyStep);
+ }
+ }
+ }
+ }
+
+ // For better load balancing, sort them by decreasing offset.
+ sort(assemblySteps.begin(), assemblySteps.end());
+
+
+
+ // Assemble the steps in parallel.
+ setupLoadBalancing(assemblySteps.size(), 1);
+ performanceLog << timestamp << "Sequence assembly begins." << endl;
+ runThreads(&AssemblyGraph::assembleChainsMultithreadedTheadFunction, threadCount);
+ performanceLog << timestamp << "Sequence assembly ends." << endl;
+
+
+
+ // Now that all the AssemblySteps have been computed, the stepSequences
+ // of each Chain have been filled in.
+ // Combine those with the marker graph edge sequences to obtain the
+ // complete sequence of each chain.
+ // This can be parallelized.
+ // NOTE(review): the assemblyStep writes in this second pass are never
+ // read - presumably leftover from a copy of the gathering loop; confirm.
+ BGL_FORALL_EDGES(e, assemblyGraph, AssemblyGraph) {
+ assemblyStep.e = e;
+ BubbleChain& bubbleChain = assemblyGraph[e];
+
+ // Loop over Bubbles in this BubbleChain.
+ for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) {
+ assemblyStep.positionInBubbleChain = positionInBubbleChain;
+ Bubble& bubble = bubbleChain[positionInBubbleChain];
+
+ // Loop over Chains in this Bubble.
+ for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) {
+ assemblyStep.indexInBubble = indexInBubble;
+ Chain& chain = bubble[indexInBubble];
+ if(chain.shouldBeAssembled) {
+ combineStepSequences(chain);
+ chain.wasAssembled = true;
+ }
+ }
+ }
+ }
+}
+
+
+
+// This sets the shouldBeAssembled flag for all chains, then
+// calls assembleChainsMultithreaded.
+void AssemblyGraph::assembleAllChainsMultithreaded(
+ uint64_t chainTerminalCommonThreshold,
+ uint64_t threadCount)
+{
+ AssemblyGraph& assemblyGraph = *this;
+
+ // Mark every Chain of every Bubble of every BubbleChain for assembly.
+ BGL_FORALL_EDGES(e, assemblyGraph, AssemblyGraph) {
+ BubbleChain& bubbleChain = assemblyGraph[e];
+ for(Bubble& bubble: bubbleChain) {
+ for(Chain& chain: bubble) {
+ chain.shouldBeAssembled = true;
+ }
+ }
+ }
+
+ // Now assemble all of them.
+ assembleChainsMultithreaded(chainTerminalCommonThreshold, threadCount);
+}
+
+
+
+// This clears the shouldBeAssembled flag from all Chains.
+void AssemblyGraph::clearAllShouldBeAssembledFlags()
+{
+ AssemblyGraph& assemblyGraph = *this;
+
+ // Clear the flag on every Chain of every Bubble of every BubbleChain.
+ BGL_FORALL_EDGES(e, assemblyGraph, AssemblyGraph) {
+ BubbleChain& bubbleChain = assemblyGraph[e];
+ for(Bubble& bubble: bubbleChain) {
+ for(Chain& chain: bubble) {
+ chain.shouldBeAssembled = false;
+ }
+ }
+ }
+}
+
+
+
+// Combine the stepSequences of a Chain with the marker graph edge sequences
+// to obtain the complete sequence of the chain.
+// The chain sequence alternates: edge 0, step 0, edge 1, step 1, ..., last edge.
+void AssemblyGraph::combineStepSequences(Chain& chain)
+{
+ const uint64_t lastPositionInChain = chain.size() - 1;
+ chain.sequence.clear();
+
+ for(uint64_t positionInChain=0; positionInChain<=lastPositionInChain; positionInChain++) {
+
+ // Append the sequence of the marker graph primary edge at this position.
+ const auto edgeSequence = assembler.markerGraph.edgeSequence[chain[positionInChain]];
+ copy(edgeSequence.begin(), edgeSequence.end(), back_inserter(chain.sequence));
+
+ // Except after the last edge, append the sequence assembled for the
+ // step between this primary edge and the next one in the chain.
+ if(positionInChain != lastPositionInChain) {
+ const vector<Base>& stepSequence = chain.stepSequences[positionInChain].sequence;
+ copy(stepSequence.begin(), stepSequence.end(), back_inserter(chain.sequence));
+ }
+ }
+}
+
+
+
+// This writes the details of sequence assembly for all Chains in the AssemblyGraph.
+// One csv row is written for each marker graph primary edge (type "E")
+// and for each assembly step between consecutive edges (type "S").
+// All Chains must already have been assembled.
+void AssemblyGraph::writeAssemblyDetails() const
+{
+ const AssemblyGraph& assemblyGraph = *this;
+
+ // Open the csv file and write the header.
+ ofstream csv("AssemblyDetails-" + to_string(componentId) + ".csv");
+ csv << "Chain,Component,Bubble chain,Position in bubble chain,Index in bubble,"
+ "Position in chain,Type,Marker graph edge id,"
+ "Assembly status,Length,Sequence begin,Sequence end,Coverage,Common\n";
+
+ // Loop over all bubble chains.
+ BGL_FORALL_EDGES(e, assemblyGraph, AssemblyGraph) {
+ const BubbleChain& bubbleChain = assemblyGraph[e];
+
+ // Loop over Bubbles in this BubbleChain.
+ for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) {
+ const Bubble& bubble = bubbleChain[positionInBubbleChain];
+
+ // Loop over Chains in this Bubble.
+ for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) {
+ const Chain& chain = bubble[indexInBubble];
+ SHASTA_ASSERT(chain.wasAssembled);
+ SHASTA_ASSERT(chain.stepSequences.size() == chain.size() - 1);
+ const string chainString = chainStringId(e, positionInBubbleChain, indexInBubble);
+
+ // Loop over positions in this Chain, tracking the running
+ // position in the assembled chain sequence.
+ uint64_t positionInSequence = 0;
+ for(uint64_t positionInChain=0; /* Check later */ ; positionInChain++) {
+
+ // Write one line to csv with information about the sequence
+ // contributed by this marker graph primary edge.
+ {
+ const MarkerGraphEdgeId edgeId = chain[positionInChain];
+ const uint64_t coverage = assembler.markerGraph.edgeMarkerIntervals[edgeId].size();
+ const uint64_t edgeSequenceLength = assembler.markerGraph.edgeSequence[edgeId].size();
+ const uint64_t beginInSequence = positionInSequence;
+ const uint64_t endInSequence = positionInSequence + edgeSequenceLength;
+ csv << chainString << ",";
+ csv << componentId << ",";
+ csv << assemblyGraph[e].id << ",";
+ csv << positionInBubbleChain << ",";
+ csv << indexInBubble << ",";
+ csv << positionInChain << ",";
+ csv << "E,";
+ csv << edgeId << ",,";
+ csv << edgeSequenceLength << ",";
+ csv << beginInSequence << ",";
+ csv << endInSequence << ",";
+ csv << coverage << ",";
+ csv << ",";
+ csv << "\n";
+ positionInSequence = endInSequence;
+ }
+
+
+ // If this was the last primary edge for the chain, we are done.
+ // The running position must account for the entire chain sequence.
+ if(positionInChain == chain.size() - 1) {
+ SHASTA_ASSERT(positionInSequence == chain.sequence.size());
+ break;
+ }
+
+ // Write one line to csv with information about the sequence
+ // contributed by the assembly step between this marker graph primary edge
+ // and the next in the chain.
+ {
+ const MarkerGraphEdgeId edgeId = chain[positionInChain];
+ const MarkerGraphEdgeId nextEdgeId = chain[positionInChain + 1];
+ const uint64_t commonCount = assembler.countCommonOrientedReadsUnsafe(
+ edgeId, nextEdgeId);
+ const auto& stepSequence = chain.stepSequences[positionInChain];
+ const uint64_t stepSequenceLength = stepSequence.sequence.size();
+ const bool success = stepSequence.success;
+ const uint64_t beginInSequence = positionInSequence;
+ const uint64_t endInSequence = positionInSequence + stepSequenceLength;
+ csv << chainString << ",";
+ csv << componentId << ",";
+ csv << assemblyGraph[e].id << ",";
+ csv << positionInBubbleChain << ",";
+ csv << indexInBubble << ",";
+ csv << ",";
+ csv << "S,";
+ csv << ",";
+ csv << (success ? "Success," : "Failure,");
+ csv << stepSequenceLength << ",";
+ csv << beginInSequence << ",";
+ csv << endInSequence << ",";
+ csv << ",";
+ csv << commonCount << ",";
+ csv << "\n";
+ positionInSequence = endInSequence;
+ }
+
+ }
+ }
+ }
+ }
+}
+
+
+
+// Thread function for multithreaded sequence assembly.
+// Runs the AssemblySteps assigned to this thread, one batch at a time.
+// (Name kept as-is, typo included, because it is referenced by runThreads.)
+void AssemblyGraph::assembleChainsMultithreadedTheadFunction(uint64_t threadId)
+{
+ const uint64_t chainTerminalCommonThreshold = assembleChainsMultithreadedData.chainTerminalCommonThreshold;
+
+ // Process batches of assembly steps until none are left.
+ uint64_t begin, end;
+ while(getNextBatch(begin, end)) {
+ for(uint64_t i=begin; i<end; i++) {
+ runAssemblyStep(chainTerminalCommonThreshold, assembleChainsMultithreadedData.assemblySteps[i]);
+ }
+ }
+}
+
+
+
+// Run a single AssemblyStep: locate the Chain it refers to,
+// then delegate to the overload that operates on the Chain directly.
+void AssemblyGraph::runAssemblyStep(
+ uint64_t chainTerminalCommonThreshold,
+ const AssemblyStep& assemblyStep)
+{
+ AssemblyGraph& assemblyGraph = *this;
+
+ // Locate the Chain: BubbleChain -> Bubble -> Chain.
+ BubbleChain& bubbleChain = assemblyGraph[assemblyStep.e];
+ Bubble& bubble = bubbleChain[assemblyStep.positionInBubbleChain];
+ Chain& chain = bubble[assemblyStep.indexInBubble];
+ SHASTA_ASSERT(chain.size() >= 2);
+
+ // Run the step on this Chain.
+ runAssemblyStep(chain, assemblyStep.positionInChain, chainTerminalCommonThreshold);
+}
+
+
+
+// Run one local assembly step of a Chain: assemble the sequence between
+// the marker graph primary edges at positionInChain and positionInChain+1,
+// storing the result in chain.stepSequences[positionInChain].
+// On failure the step sequence is left empty with success=false,
+// and the exception is rethrown.
+void AssemblyGraph::runAssemblyStep(
+ Chain& chain,
+ uint64_t positionInChain,
+ uint64_t chainTerminalCommonThreshold)
+{
+
+ // Find the MarkerGraphEdgeIds for this local assembly.
+ const MarkerGraphEdgeId edgeIdA = chain[positionInChain];
+ const MarkerGraphEdgeId edgeIdB = chain[positionInChain + 1];
+
+ // Suppress html output from LocalAssembly.
+ ostream html(0);
+
+
+
+ // Figure out if we should use the oriented reads on edgeIdA and edgeIdB.
+ bool useA = true;
+ bool useB = true;
+ // For chains of length 2, we leave useA and useB set to true.
+ // For the usual case of longer chains, there is more checking.
+ if(chain.size() != 2) {
+
+ // If we are at the beginning or end of the chain, we need to check
+ // the number of common oriented reads.
+ // NOTE(review): the analyzeMarkerGraphEdgePair call has the side effect
+ // of filling in info, and it happens inside SHASTA_ASSERT - this assumes
+ // SHASTA_ASSERT always evaluates its argument (never compiled out); confirm.
+ MarkerGraphEdgePairInfo info;
+ if((positionInChain == 0) or (positionInChain == chain.size() - 2)) {
+ SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(edgeIdA, edgeIdB, info));
+ }
+
+ // If this is the first step of the Chain, we want to set useA to false
+ // to avoid using reads that don't belong. But we only do it
+ // if this leaves us with enough reads to assemble.
+ if(positionInChain == 0) {
+ if(info.common >= chainTerminalCommonThreshold) {
+ useA = false;
+ }
+ }
+
+ // If this is the last step of the Chain, we want to set useB to false
+ // to avoid using reads that don't belong. But we only do it
+ // if this leaves us with enough reads to assemble.
+ else if(positionInChain == chain.size() - 2) {
+ if(info.common >= chainTerminalCommonThreshold) {
+ useB = false;
+ }
+ }
+ }
+
+
+
+ // Do the local assembly between these two MarkerGraphEdgeIds.
+ auto& stepSequence = chain.stepSequences[positionInChain];
+ try {
+ LocalAssembly localAssembly(assembler, edgeIdA, edgeIdB, 0, html, options.localAssemblyOptions, useA, useB);
+ localAssembly.getSecondarySequence(stepSequence.sequence);
+ stepSequence.success = true;
+ } catch (...) {
+ // The local assembly failed.
+ // The sequence is empty and the success flag is false.
+ // Log under the mutex so output from concurrent threads is not interleaved.
+ stepSequence.sequence.clear();
+ stepSequence.success = false;
+ std::lock_guard<std::mutex> lock(mutex);
+ cout << "Error occurred in local assembly between marker graph edges " <<
+ edgeIdA << " and " << edgeIdB << endl;
+ throw;
+ }
+}
+
+
+
+// Make a copy of an edge, truncating it at its end by removing the last MarkerGraphEdgeId.
+// Return the target vertex of the newly created edge.
+// The last bubble of the bubble chain of the given edge must be haploid.
+// If the bubble chain consists of just a single haploid bubble with a chain of length 2,
+// no new edge is created, and this simply returns the source vertex of the given edge.
+AssemblyGraph::vertex_descriptor
+ AssemblyGraph::cloneAndTruncateAtEnd(edge_descriptor ce)
+{
+ AssemblyGraph& cGraph = *this;
+ const AssemblyGraphEdge& edge = cGraph[ce];
+ const vertex_descriptor cv0 = source(ce, cGraph);
+ const BubbleChain& bubbleChain = cGraph[ce];
+
+ // Sanity checks.
+ SHASTA_ASSERT(not bubbleChain.empty());
+ SHASTA_ASSERT(bubbleChain.lastBubble().isHaploid());
+
+
+
+ // Case where the bubble chain consists of a single bubble, which must be haploid,
+ // that is, consist of a single chain.
+ if(bubbleChain.size() == 1) {
+ const Bubble& bubble = bubbleChain.lastBubble();
+ SHASTA_ASSERT(bubble.isHaploid());
+ const Chain& chain = bubble.front();
+ SHASTA_ASSERT(chain.size() > 1);
+
+ // If the Chain has length 2, we can't truncate it.
+ // So we don't create a new edge, and instead just return cv0.
+ // Detangling code will connect there, as prescribed by the tangle matrix.
+ if(chain.size() == 2) {
+ return cv0;
+ }
+
+ // Create the new edge, without adding it to the graph for now.
+ AssemblyGraphEdge newEdge = edge;
+ newEdge.id = nextEdgeId++;
+ BubbleChain& newBubbleChain = newEdge;
+ SHASTA_ASSERT(newBubbleChain.size() == 1);
+ Bubble& newBubble = newBubbleChain.lastBubble();
+ SHASTA_ASSERT(newBubble.isHaploid());
+ Chain& newChain = newBubble.front();
+ // newChain is a copy of chain, so checking chain here is equivalent.
+ SHASTA_ASSERT(chain.size() > 2);
+ newChain.pop_back(); // Remove the last MarkerGraphEdgeId.
+
+ // Add it to the graph.
+ // It will be dangling at its end.
+ // Detangling code will later connect it as prescribed by the tangle matrix.
+ const vertex_descriptor cv2 = createVertex(newBubbleChain.lastMarkerGraphEdgeId());
+ add_edge(cv0, cv2, newEdge, cGraph);
+ return cv2;
+ }
+
+
+
+ // Case where the bubble chain consists of more than one bubble.
+ else {
+ const Bubble& lastBubble = bubbleChain.lastBubble();
+ SHASTA_ASSERT(lastBubble.isHaploid());
+ const Chain& lastChain = lastBubble.front();
+ SHASTA_ASSERT(lastChain.size() > 1);
+
+ // Create the new edge, without adding it to the graph for now.
+ AssemblyGraphEdge newEdge = edge;
+ newEdge.id = nextEdgeId++;
+ BubbleChain& newBubbleChain = newEdge;
+ SHASTA_ASSERT(newBubbleChain.size() > 1);
+ Bubble& newLastBubble = newBubbleChain.lastBubble();
+ SHASTA_ASSERT(newLastBubble.isHaploid());
+ Chain& newLastChain = newLastBubble.front();
+
+ // If the last chain has length 2, just remove the last bubble from newBubbleChain.
+ // Otherwise, remove the last MarkerGraphEdgeId from the lastChain.
+ if(newLastChain.size() == 2) {
+ newBubbleChain.pop_back();
+ } else {
+ newLastChain.pop_back();
+ }
+
+ // Add it to the graph.
+ // It will be dangling at its end.
+ // Detangling code will later connect it as prescribed by the tangle matrix.
+ const vertex_descriptor cv2 = createVertex(newBubbleChain.lastMarkerGraphEdgeId());
+ add_edge(cv0, cv2, newEdge, cGraph);
+ return cv2;
+ }
+
+}
+
+
+
+
+
+// Make a copy of an edge, truncating it at its beginning by removing the first MarkerGraphEdgeId.
+// Return the source vertex of the newly created edge.
+// The first bubble of the bubble chain of the given edge must be haploid.
+// If the bubble chain consists of just a single haploid bubble with a chain of length 2,
+// no new edge is created, and this simply returns the target vertex of the given edge.
+AssemblyGraph::vertex_descriptor
+ AssemblyGraph::cloneAndTruncateAtBeginning(edge_descriptor ce)
+{
+ AssemblyGraph& cGraph = *this;
+ const AssemblyGraphEdge& edge = cGraph[ce];
+ const vertex_descriptor cv1 = target(ce, cGraph);
+ const BubbleChain& bubbleChain = cGraph[ce];
+
+ // Sanity checks.
+ SHASTA_ASSERT(not bubbleChain.empty());
+ SHASTA_ASSERT(bubbleChain.firstBubble().isHaploid());
+
+
+
+ // Case where the bubble chain consists of a single bubble, which must be haploid,
+ // that is, consist of a single chain.
+ if(bubbleChain.size() == 1) {
+ const Bubble& bubble = bubbleChain.firstBubble();
+ SHASTA_ASSERT(bubble.isHaploid());
+ const Chain& chain = bubble.front();
+ SHASTA_ASSERT(chain.size() > 1);
+
+ // If the Chain has length 2, we can't truncate it.
+ // So we don't create a new edge, and instead just return cv1.
+ // Detangling code will connect there, as prescribed by the tangle matrix.
+ if(chain.size() == 2) {
+ return cv1;
+ }
+
+ // Create the new edge, without adding it to the graph for now.
+ AssemblyGraphEdge newEdge = edge;
+ newEdge.id = nextEdgeId++;
+ BubbleChain& newBubbleChain = newEdge;
+ SHASTA_ASSERT(newBubbleChain.size() == 1);
+ Bubble& newBubble = newBubbleChain.firstBubble();
+ SHASTA_ASSERT(newBubble.isHaploid());
+ Chain& newChain = newBubble.front();
+ // newChain is a copy of chain, so checking chain here is equivalent.
+ SHASTA_ASSERT(chain.size() > 2);
+ newChain.erase(newChain.begin()); // Remove the first MarkerGraphEdgeId.
+
+ // Add it to the graph.
+ // It will be dangling at its beginning.
+ // Detangling code will later connect it as prescribed by the tangle matrix.
+ const vertex_descriptor cv2 = createVertex(newBubbleChain.firstMarkerGraphEdgeId());
+ add_edge(cv2, cv1, newEdge, cGraph);
+ return cv2;
+ }
+
+
+
+ // Case where the bubble chain consists of more than one bubble.
+ else {
+ const Bubble& firstBubble = bubbleChain.firstBubble();
+ SHASTA_ASSERT(firstBubble.isHaploid());
+ const Chain& firstChain = firstBubble.front();
+ SHASTA_ASSERT(firstChain.size() > 1);
+
+ // Create the new edge, without adding it to the graph for now.
+ AssemblyGraphEdge newEdge = edge;
+ newEdge.id = nextEdgeId++;
+ BubbleChain& newBubbleChain = newEdge;
+ SHASTA_ASSERT(newBubbleChain.size() > 1);
+ Bubble& newFirstBubble = newBubbleChain.firstBubble();
+ SHASTA_ASSERT(newFirstBubble.isHaploid());
+ Chain& newFirstChain = newFirstBubble.front();
+
+ // If the first chain has length 2, just remove the first bubble from newBubbleChain.
+ // Otherwise, remove the first MarkerGraphEdgeId from the firstChain.
+ if(newFirstChain.size() == 2) {
+ newBubbleChain.erase(newBubbleChain.begin());
+ } else {
+ newFirstChain.erase(newFirstChain.begin());
+ }
+
+ // Add it to the graph.
+ // It will be dangling at its beginning.
+ // Detangling code will later connect it as prescribed by the tangle matrix.
+ const vertex_descriptor cv2 = createVertex(newBubbleChain.firstMarkerGraphEdgeId());
+ add_edge(cv2, cv1, newEdge, cGraph);
+ return cv2;
+ }
+
+}
+
+
+// Create a new edge connecting cv0 and cv1.
+// The new edge consists of a trivial BubbleChain: a single haploid Bubble
+// whose only Chain contains just the two MarkerGraphEdgeIds
+// corresponding to cv0 and cv1.
+AssemblyGraph::edge_descriptor AssemblyGraph::connect(vertex_descriptor cv0, vertex_descriptor cv1)
+{
+ AssemblyGraph& cGraph = *this;
+
+ // Add the edge and assign it a new id.
+ edge_descriptor ceNew;
+ tie(ceNew, ignore) = add_edge(cv0, cv1, cGraph);
+ AssemblyGraphEdge& newEdge = cGraph[ceNew];
+ newEdge.id = nextEdgeId++;
+
+ // Build the trivial BubbleChain: one haploid Bubble with one Chain
+ // of length 2 joining the marker graph edges of the two vertices.
+ BubbleChain& newBubbleChain = newEdge;
+ newBubbleChain.resize(1);
+ Bubble& bubble = newBubbleChain.front();
+ bubble.resize(1);
+ Chain& chain = bubble.front();
+ chain.push_back(cGraph[cv0].edgeId);
+ chain.push_back(cGraph[cv1].edgeId);
+
+ return ceNew;
+}
+
+
+
+// Serialize the AssemblyGraph to a file as a Boost binary archive.
+// Counterpart of load().
+void AssemblyGraph::save(const string& fileName) const
+{
+ ofstream file(fileName);
+ boost::archive::binary_oarchive archive(file);
+ archive << *this;
+}
+
+
+
+// Deserialize the AssemblyGraph from a file written by save().
+void AssemblyGraph::load(const string& fileName)
+{
+ ifstream file(fileName);
+ boost::archive::binary_iarchive archive(file);
+ archive >> *this;
+}
+
+
+
+// Optimize chains before assembly, to remove assembly steps with
+// less than minCommon reads.
+// Calls optimizeChain on every Chain of every Bubble of every BubbleChain.
+void AssemblyGraph::optimizeChains(
+ bool debug,
+ uint64_t minCommon,
+ uint64_t k)
+{
+ AssemblyGraph& cGraph = *this;
+
+ BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+ BubbleChain& bubbleChain = cGraph[ce];
+
+ // Explicit indices are kept here because chainStringId needs them.
+ for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) {
+ Bubble& bubble = bubbleChain[positionInBubbleChain];
+ const uint64_t ploidy = bubble.size();
+
+ for(uint64_t indexInBubble=0; indexInBubble<ploidy; indexInBubble++) {
+ Chain& chain = bubble[indexInBubble];
+ SHASTA_ASSERT(chain.size() >= 2);
+
+ if(debug) {
+ cout << "Optimizing chain " << chainStringId(ce, positionInBubbleChain, indexInBubble) << endl;
+ }
+ optimizeChain(debug, chain, minCommon, k);
+ }
+ }
+ }
+
+}
+
+
+
+// Optimize a chain before assembly, to remove assembly steps with
+// less that minCommon reads.
+void AssemblyGraph::optimizeChain(
+ bool debug,
+ Chain& chain,
+ uint64_t minCommon,
+ uint64_t k)
+{
+ if(debug) {
+ cout << "Optimizing a chain of length " << chain.size() << endl;
+ }
+ SHASTA_ASSERT(chain.size() >= 2);
+
+
+ // A directed graph describing the initial and final chains.
+ // Each vertex stores a MarkerGraphEdgeId.
+ // Each edge stores the number of common oriented reads.
+ class ChainGraphVertex {
+ public:
+ MarkerGraphEdgeId edgeId;
+ uint64_t immediateDominator = invalid<uint64_t>;
+ };
+ class ChainGraphEdge {
+ public:
+ uint64_t commonCount;
+ bool keep = false;
+ };
+ using ChainGraphBaseClass = boost::adjacency_list<
+ boost::listS,
+ boost::vecS,
+ boost::bidirectionalS,
+ ChainGraphVertex,
+ ChainGraphEdge>;
+ class ChainGraph : public ChainGraphBaseClass {
+ public:
+ };
+ ChainGraph chainGraph;
+
+ class PathInspector {
+ public:
+ PathInspector(ChainGraph& chainGraph, bool debug) : chainGraph(chainGraph), debug(debug) {}
+ ChainGraph& chainGraph;
+ bool debug;
+ using Path = vector<ChainGraph::edge_descriptor>;
+ Path bestPath;
+ uint64_t bestPathMinCommonCount = 0;
+ void operator()(const Path& path)
+ {
+ // Compute the minimum number of common oriented reads over edges of this path.
+ uint64_t minCommonCount = invalid<uint64_t>;
+ for(const ChainGraph::edge_descriptor e: path) {
+ minCommonCount = min(minCommonCount, chainGraph[e].commonCount);
+ }
+
+ if(debug) {
+ cout << "Path with minCommonCount " << minCommonCount << ":";
+ for(const ChainGraph::edge_descriptor e: path) {
+ cout << " " << source(e, chainGraph);
+ }
+ cout << " " << target(path.back(), chainGraph) << "\n";
+ }
+
+ // A Path is better if it has a higher minCommonCount or
+ // it has the same minCommonCount and is longer.
+ //
+ if( (minCommonCount > bestPathMinCommonCount) or
+ (minCommonCount == bestPathMinCommonCount and path.size() > bestPath.size())) {
+ bestPath = path;
+ bestPathMinCommonCount = minCommonCount;
+ }
+ }
+
+ };
+
+ // Construct the initial ChainGraph.
+
+ // Add the vertices.
+ // We are using vecS as the second template argument for ChainGraph,
+ // so positions in the chain are also vertex descriptors in the ChainGraph.
+ for(const MarkerGraphEdgeId edgeId: chain) {
+ add_vertex({edgeId}, chainGraph);
+ }
+
+ // Add the edges that correspond to the initial Chain.
+ for(uint64_t i1=1; i1<chain.size(); i1++) {
+ const uint64_t i0 = i1 - 1;
+ const MarkerGraphEdgeId edgeId0 = chainGraph[i0].edgeId;
+ const MarkerGraphEdgeId edgeId1 = chainGraph[i1].edgeId;
+ MarkerGraphEdgePairInfo info;
+ SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(edgeId0, edgeId1, info));
+ add_edge(i0, i1, {info.common}, chainGraph);
+ }
+
+
+
+ // Add edges that skip around any edges with less than minCommon common oriented reads.
+ uint64_t totalAddedEdgesCount = 0;
+ uint64_t totalRemovedEdgesCount = 0;
+ for(uint64_t i1=1; i1<chain.size(); i1++) {
+ const uint64_t i0 = i1 - 1;
+ ChainGraph::edge_descriptor e;
+ bool edgeWasFound = false;
+ tie(e, edgeWasFound) = edge(i0, i1, chainGraph);
+ SHASTA_ASSERT(edgeWasFound);
+
+ // If this edge has enough common reads, don't do anything.
+ if(chainGraph[e].commonCount >= minCommon) {
+ continue;
+ }
+
+ if(debug) {
+ cout << i0 << "->" << i1 << " " << chainGraph[i0].edgeId << "->" << chainGraph[i1].edgeId <<
+ " has " << chainGraph[e].commonCount << " common oriented reads, adding edges to skip it." << endl;
+ }
+
+ // Loop over pairs of predecessors of v0 and successors of v1.
+ uint64_t addedEdgesCount = 0;
+ const uint64_t j0First = (k < i0) ? (i0 - k) : 0;
+ const uint64_t j0Last = i0;
+ const uint64_t j1First = i1;
+ const uint64_t j1Last = min(i1 + k, chain.size() - 1);
+ for(uint64_t j0=j0First; j0<=j0Last; j0++) {
+ for(uint64_t j1=j1First; j1<=j1Last; j1++) {
+ if(j0==i0 and j1 == i1) {
+ // We already have the edge between v0 and v1.
+ continue;
+ }
+ MarkerGraphEdgePairInfo info;
+ SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(chainGraph[j0].edgeId, chainGraph[j1].edgeId, info));
+
+ // If the number of common reads is better than for e, add this edge.
+ if(info.common > chainGraph[e].commonCount) {
+ add_edge(j0, j1, {info.common}, chainGraph);
+ ++addedEdgesCount;
+ if(debug) {
+ cout << " Added " << j0 << "->" << j1 << " " << chainGraph[j0].edgeId << "->" << chainGraph[j1].edgeId <<
+ " with " << info.common << " common oriented reads." << endl;
+ }
+ } else {
+ if(debug) {
+ cout << "Found " << j0 << "->" << j1 << " " << chainGraph[j0].edgeId << "->" << chainGraph[j1].edgeId <<
+ " with " << info.common << " common oriented reads." << endl;
+
+ }
+ }
+ }
+ }
+ totalAddedEdgesCount += addedEdgesCount;
+
+ // If we added any edges skipping e, we can remove e.
+ if(addedEdgesCount > 0) {
+ if(debug) {
+ cout << "Removed " << i0 << "->" << i1 << " " << chainGraph[i0].edgeId << "->" << chainGraph[i1].edgeId <<
+ " with " << chainGraph[e].commonCount << " common oriented reads." << endl;
+ }
+ // DON'T REMOVE THE EDGE. THIS IS NECESSARY TO MAKE SURE WE
+ // STILL HAVE A PATH FROM THE ENTRANCE TO THE EXIT.
+ // boost::remove_edge(e, chainGraph);
+ // ++totalRemovedEdgesCount;
+ } else {
+ if(debug) {
+ cout << "Did not find any suitable replacement edges." << endl;
+ }
+ }
+ }
+
+
+ // If we did not add or remove any edges, leave this Chain alone.
+ if(totalAddedEdgesCount == 0) {
+ SHASTA_ASSERT(totalRemovedEdgesCount == 0);
+ if(debug) {
+ cout << "No edges were added or removed, so this Chain will be left unchanged." << endl;
+ }
+ return;
+ }
+
+ if(debug) {
+ cout << "This chain will be optimized." << endl;
+ }
+
+
+
+ // To find the optimized chain, we want to do path enumeration on the ChainGraph,
+ // looking for paths that only use edges with large numbers of common oriented reads.
+ // Specifically, we use as the new chain the path that maximizes the minimum
+ // number of common oriented reads encountered on edges along the path.
+ // For efficiency of the path enumeration, we compute a dominator tree
+ // for the ChainGraph, with entrance at the beginning of the chain.
+ // The unique path on that tree from the entrance to the exit
+ // divides the graph in segments, and we can do path enumeration on one segment at a time.
+ shasta::lengauer_tarjan_dominator_tree(chainGraph, 0,
+ boost::get(&ChainGraphVertex::immediateDominator, chainGraph));
+
+ // The unique path on the dominator tree from the entrance to the exit.
+ vector<ChainGraph::vertex_descriptor> dominatorTreePath;
+ ChainGraph::vertex_descriptor v = chain.size() - 1;
+ while(true) {
+ dominatorTreePath.push_back(v);
+ if(v == 0) {
+ break;
+ }
+ v = chainGraph[v].immediateDominator;
+ if(v == invalid<uint64_t>) {
+ cout << "Assertion failure at " << v << endl;
+ }
+ SHASTA_ASSERT(v != invalid<uint64_t>);
+ }
+ if(debug) {
+ cout << "Dominator tree path length " << dominatorTreePath.size() << endl;
+ }
+ reverse(dominatorTreePath.begin(), dominatorTreePath.end());
+
+ if(false) {
+ cout << "Dominator tree path:" << endl;
+ for(uint64_t i=0; i<dominatorTreePath.size(); i++) {
+ const uint64_t v = dominatorTreePath[i];
+ cout << i << "," << v << "," << chainGraph[v].edgeId << "\n";
+ }
+ }
+
+
+
+ // The dominator tree path divides the graph in segments,
+ // and we can do path enumeration on one segment at a time.
+ // For each segment we find the best path and mark the edges on that
+ // best path as to be kept in the final chain.
+ for(uint64_t i1=1; i1<dominatorTreePath.size(); i1++) {
+ const uint64_t i0 = i1 - 1;
+ const ChainGraph::vertex_descriptor v0 = dominatorTreePath[i0];
+ const ChainGraph::vertex_descriptor v1 = dominatorTreePath[i1];
+
+ // Fast handling of the most common case.
+ if(v1 == v0+1 and out_degree(v0, chainGraph)==1 and in_degree(v1, chainGraph)==1) {
+ ChainGraph::edge_descriptor e;
+ bool edgeWasFound = true;
+ tie(e, edgeWasFound) = edge(v0, v1, chainGraph);
+ if(edgeWasFound) {
+ chainGraph[e].keep = true;
+ continue;
+ }
+ }
+
+ // If getting here, we have to do path enumeration.
+ if(debug) {
+ cout << "Starting path enumeration between " << v0 << " " << v1 << endl;
+ }
+
+ // Enumerate paths starting at v0 and ending at v1.
+ PathInspector pathInspector(chainGraph, debug);
+ enumeratePathsBetween(chainGraph, v0, v1, pathInspector);
+
+ if(debug) {
+ if(debug) {
+ cout << "The best path has minCommonCount " << pathInspector.bestPathMinCommonCount << ":";
+ for(const ChainGraph::edge_descriptor e: pathInspector.bestPath) {
+ cout << " " << source(e, chainGraph);
+ }
+ cout << " " << target(pathInspector.bestPath.back(), chainGraph) << "\n";
+ }
+ }
+
+ // Mark as to be kept all edges on the best path.
+ for(const ChainGraph::edge_descriptor e: pathInspector.bestPath) {
+ chainGraph[e].keep = true;
+ }
+ }
+
+
+ // Remove all edges not marked to be kept.
+ vector<ChainGraph::edge_descriptor> edgesToBeRemoved;
+ BGL_FORALL_EDGES(e, chainGraph, ChainGraph) {
+ if(not chainGraph[e].keep) {
+ edgesToBeRemoved.push_back(e);
+ }
+ }
+ for(const ChainGraph::edge_descriptor e: edgesToBeRemoved) {
+ boost::remove_edge(e, chainGraph);
+ }
+
+ // The remaining edges should form a path in the ChainGraph
+ // which defines the optimized Chain.
+ SHASTA_ASSERT(in_degree(0, chainGraph) == 0);
+ SHASTA_ASSERT(out_degree(0, chainGraph) == 1);
+ SHASTA_ASSERT(in_degree(chain.size()-1, chainGraph) == 1);
+ SHASTA_ASSERT(out_degree(chain.size()-1, chainGraph) == 0);
+ for(uint64_t i=1; i<chain.size()-1; i++) {
+ const uint64_t inDegree = in_degree(i, chainGraph);
+ const uint64_t outDegree = out_degree(i, chainGraph);
+ SHASTA_ASSERT(
+ (inDegree==1 and outDegree==1) or // In the new chain.
+ (inDegree==0 and outDegree==0) // Now isolated.
+ );
+ }
+
+    // Find the path from the entrance to the exit in the updated ChainGraph.
+ vector<uint64_t> newPath;
+ v = 0;
+ while(true) {
+ newPath.push_back(v);
+ if(v == chain.size()-1) {
+ break;
+ }
+
+ // Move forward.
+ SHASTA_ASSERT(out_degree(v, chainGraph) == 1);
+ ChainGraph::out_edge_iterator it;
+ tie(it, ignore) = out_edges(v, chainGraph);
+ const ChainGraph::edge_descriptor e = *it;
+ v = target(e, chainGraph);
+ }
+
+ // Sanity check that the path is moving forward.
+ for(uint64_t i=1; i<newPath.size(); i++) {
+ SHASTA_ASSERT(newPath[i] > newPath[i-1]);
+ }
+
+ // Construct the new Chain.
+ chain.clear();
+ chain.sequence.clear();
+ for(const uint64_t v: newPath) {
+ chain.push_back(chainGraph[v].edgeId);
+ }
+
+}
+
+
+
+bool AssemblyGraph::removeSelfComplementaryEdges()
+{
+ AssemblyGraph& cGraph = *this;
+
+ vector<edge_descriptor> edgesToBeRemoved;
+ BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+ const vertex_descriptor v0 = source(ce, cGraph);
+ const vertex_descriptor v1 = target(ce, cGraph);
+ const MarkerGraphEdgeId edgeId0 = cGraph[v0].edgeId;
+ const MarkerGraphEdgeId edgeId1 = cGraph[v1].edgeId;
+
+ if(assembler.markerGraph.reverseComplementEdge[edgeId0] == edgeId1) {
+ SHASTA_ASSERT(assembler.markerGraph.reverseComplementEdge[edgeId1] == edgeId0);
+ edgesToBeRemoved.push_back(ce);
+ }
+ }
+
+ for(const edge_descriptor ce: edgesToBeRemoved) {
+ boost::remove_edge(ce, cGraph);
+ }
+
+ return not edgesToBeRemoved.empty();
+}
+
+
+
+// Split terminal haploid bubbles out of bubble chains, to facilitate detangling.
+void AssemblyGraph::splitTerminalHaploidBubbles()
+{
+ AssemblyGraph& cGraph = *this;
+
+ vector<edge_descriptor> allEdges;
+ BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+ allEdges.push_back(ce);
+ }
+
+ for(const edge_descriptor e: allEdges) {
+ splitTerminalHaploidBubbles(e);
+ }
+}
+
+
+
+void AssemblyGraph::splitTerminalHaploidBubbles(edge_descriptor ce)
+{
+ AssemblyGraph& cGraph = *this;
+ BubbleChain& bubbleChain = cGraph[ce];
+
+ // Skip trivial bubble chains consisting of a single bubble.
+ if(bubbleChain.size() < 2) {
+ return;
+ }
+
+ // Access the first and last bubble in the bubble chain.
+ // We already checked that the bubble chain has at least two bubbles,
+ // so these two are distinct.
+ const Bubble& firstBubble = bubbleChain.front();
+ const Bubble& lastBubble = bubbleChain.back();
+
+ // Skip bubble chains consisting of two haploid bubbles.
+ // After compress() is called, there should be none of these.
+ if(bubbleChain.size() == 2 and firstBubble.isHaploid() and lastBubble.isHaploid()) {
+ return;
+ }
+
+ // Figure out if we need to split the first or last bubble, or both.
+ bool splitFirstBubble = false;
+ bool splitLastBubble = false;
+ if(firstBubble.isHaploid()) {
+ splitFirstBubble = true;
+ }
+ if(lastBubble.isHaploid()) {
+ splitLastBubble = true;
+ }
+ if(splitFirstBubble and splitLastBubble) {
+ SHASTA_ASSERT(bubbleChain.size() > 2);
+ }
+
+ // If there is nothing to do, we are done.
+ if(not (splitFirstBubble or splitLastBubble)) {
+ return;
+ }
+
+ // The source and target vertices of the edge we are splitting.
+ const vertex_descriptor cv0 = source(ce, cGraph);
+ const vertex_descriptor cv1 = target(ce, cGraph);
+ vertex_descriptor cv2 = null_vertex();
+ vertex_descriptor cv3 = null_vertex();
+
+
+
+ // Create a new edge with just the first bubble, if necessary.
+ if(splitFirstBubble) {
+
+ // Get the target vertex for the new edge.
+ const Chain& firstChain = firstBubble.front();
+ const MarkerGraphEdgeId markerGraphEdgeId2 = firstChain.back();
+ cv2 = createVertex(markerGraphEdgeId2);
+
+ // Add the new edge.
+ edge_descriptor eNew;
+ tie(eNew, ignore) = add_edge(cv0, cv2, cGraph);
+ AssemblyGraphEdge& newEdge = cGraph[eNew];
+ newEdge.id = nextEdgeId++;
+
+ // Copy the first bubble to the new edge.
+ newEdge.push_back(firstBubble);
+
+ }
+
+
+
+ // Create a new edge with just the last bubble, if necessary.
+ if(splitLastBubble) {
+
+ // Get the source vertex for the new edge.
+ const Chain& lastChain = lastBubble.front();
+ const MarkerGraphEdgeId markerGraphEdgeId3 = lastChain.front();
+ cv3 = createVertex(markerGraphEdgeId3);
+
+ // Add the new edge.
+ edge_descriptor eNew;
+ tie(eNew, ignore) = add_edge(cv3, cv1, cGraph);
+ AssemblyGraphEdge& newEdge = cGraph[eNew];
+ newEdge.id = nextEdgeId++;
+
+ // Copy the last bubble to the new edge.
+ newEdge.push_back(lastBubble);
+
+ }
+
+
+
+ // Create a new edge for the rest of the bubble chain.
+ edge_descriptor eNew;
+ tie(eNew, ignore) = add_edge(
+ splitFirstBubble ? cv2 : cv0,
+ splitLastBubble ? cv3 : cv1,
+ cGraph);
+ AssemblyGraphEdge& newEdge = cGraph[eNew];
+ newEdge.id = nextEdgeId++;
+
+ // Copy the rest of the bubble chain to the new edge.
+ auto it0 = bubbleChain.begin();
+ auto it1 = bubbleChain.end();
+ if(splitFirstBubble) {
+ ++it0;
+ }
+ if(splitLastBubble) {
+ --it1;
+ }
+ copy(it0, it1, back_inserter(newEdge));
+
+
+ // Now we can remove the old BubbleChain we just split.
+ boost::remove_edge(ce, cGraph);
+
+}
+
+
+
+// Bubble cleanup (all bubbles), with the purpose of eliminating most bubbles caused by errors.
+uint64_t AssemblyGraph::cleanupBubbles(
+ bool debug,
+ uint64_t maxOffset,
+ uint64_t chainTerminalCommonThreshold,
+ uint64_t threadCount)
+{
+ AssemblyGraph& graph = *this;
+ performanceLog << timestamp << "AssemblyGraph::cleanupBubbles begins." << endl;
+
+
+
+ // First, assemble sequence for all the chains of diploid bubbles with a small offset.
+ clearAllShouldBeAssembledFlags();
+ BGL_FORALL_EDGES(e, graph, AssemblyGraph) {
+ BubbleChain& bubbleChain = graph[e];
+ for(Bubble& bubble: bubbleChain) {
+
+ // If this bubble is not diploid, skip it.
+ if(bubble.size() != 2) {
+ continue;
+ }
+
+ // The bubble is diploid. Compute its maxOffset.
+ uint64_t averageOffset;
+ uint64_t minOffset;
+ uint64_t bubbleMaxOffset;
+ const uint64_t offsetWasComputed = bubbleOffsetNoException(
+ bubble, averageOffset, minOffset, bubbleMaxOffset);
+
+ // If the offset is large or could not be computed, we don't need to
+ // assemble this bubble.
+ if((not offsetWasComputed) or bubbleMaxOffset>maxOffset) {
+ continue;
+ }
+
+ // We need to assemble the Chains of this bubble.
+ for(Chain& chain: bubble) {
+ chain.shouldBeAssembled = true;
+ }
+ }
+ }
+ assembleChainsMultithreaded(chainTerminalCommonThreshold, threadCount);
+ performanceLog << timestamp << "Sequence assembly for AssemblyGraph::cleanupBubbles ends." << endl;
+
+
+
+ uint64_t removedCount = 0;
+ BGL_FORALL_EDGES(ce, graph, AssemblyGraph) {
+ removedCount += cleanupBubbles(debug, ce, maxOffset, chainTerminalCommonThreshold);
+ }
+
+ performanceLog << timestamp << "AssemblyGraph::cleanupBubbles ends." << endl;
+ return removedCount;
+}
+
+
+
+// Bubble cleanup for a bubble chain, with the purpose of eliminating most bubbles caused by errors.
+uint64_t AssemblyGraph::cleanupBubbles(bool debug, edge_descriptor ce,
+ uint64_t maxOffset, uint64_t chainTerminalCommonThreshold)
+{
+ AssemblyGraph& cGraph = *this;
+ BubbleChain& bubbleChain = cGraph[ce];
+ BubbleChain newBubbleChain;
+
+ uint64_t removedCount = 0;
+ for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) {
+ Bubble& bubble = bubbleChain[positionInBubbleChain];
+
+ if(debug) {
+ cout << "cleanupBubbles working on Bubble " << bubbleStringId(ce, positionInBubbleChain) <<
+ " ploidy " << bubble.size() << endl;
+ cout << "Entrance " << bubble.front().front() << ", exit " << bubble.front().back() << endl;
+ }
+
+ bool keepBubble = false;
+
+ if(bubble.isHaploid()) {
+
+ // The bubble is haploid. Keep it.
+ keepBubble = true;
+
+ if(debug) {
+ cout << "Keeping this bubble because it is haploid." << endl;
+ }
+
+ } else {
+
+ // The bubble is not haploid. Compute its maxOffset.
+ uint64_t averageOffset;
+ uint64_t minOffset;
+ uint64_t bubbleMaxOffset;
+ const bool offsetWasComputed = bubbleOffsetNoException(bubble, averageOffset, minOffset, bubbleMaxOffset);
+
+ if((not offsetWasComputed) or bubbleMaxOffset>maxOffset) {
+
+ // The bubble is not haploid but the offset is large. Keep it.
+ keepBubble = true;
+
+ if(debug) {
+ cout << "Keeping this bubble because it is not haploid but its offset is large." << endl;
+ }
+
+ } else {
+
+ // The bubble is not haploid and has a small offset.
+
+ if(bubble.size() > 2) {
+
+ // The bubble has a small offset and ploidy greater than 2. Remove it.
+ keepBubble = false;
+
+ if(debug) {
+ cout << "Removing this bubble because it has a small offset and ploidy greater than 2." << endl;
+ }
+
+ } else {
+
+ // The bubble has a small offset and ploidy 2.
+ // Check that we assembled the sequence of its two sides.
+ for(Chain& chain: bubble) {
+ SHASTA_ASSERT(chain.wasAssembled);
+ }
+
+ if(debug) {
+ for(uint64_t indexInBubble=0; indexInBubble<2; indexInBubble++) {
+ const auto& sequence = bubble[indexInBubble].sequence;
+ cout << ">" << chainStringId(ce, positionInBubbleChain, indexInBubble) <<
+ " " << sequence.size() << "\n";
+ copy(sequence.begin(), sequence.end(), ostream_iterator<shasta::Base>(cout));
+ cout << "\n";
+ }
+ }
+ if(bubble[0].sequence == bubble[1].sequence) {
+ keepBubble = false;
+ if(debug) {
+ cout << "The two sides have identical sequence." << endl;
+ }
+ } else {
+
+ // Figure out if they differ by a copy number of short periodicity.
+ const uint64_t period = isCopyNumberDifference(bubble[0].sequence, bubble[1].sequence, 4);
+ if(debug) {
+ cout << "Period " << period << "\n";
+ }
+ keepBubble = (period == 0);
+ }
+ }
+ }
+
+
+ }
+
+ if(keepBubble) {
+ newBubbleChain.push_back(bubble);
+ if(debug) {
+ cout << "Kept this bubble." << endl;
+ }
+ } else {
+ // Remove the bubble and replace it with a haploid bubble
+ // consisting of only the terminal MarkerGraphEdgeIds.
+ Chain newChain;
+ newChain.push_back(bubble.front().front());
+ newChain.push_back(bubble.front().back());
+ Bubble newBubble;
+ newBubble.push_back(newChain);
+ newBubbleChain.push_back(newBubble);
+ ++removedCount;
+ if(debug) {
+ cout << "Removed this bubble." << endl;
+ }
+ }
+ }
+
+ bubbleChain.swap(newBubbleChain);
+ return removedCount;
+}
+
+
+
+// This finds squares of the form:
+// A->B
+// A->B'
+// B->A'
+// B'->A'
+// where a prime sign indicates reverse complementing.
+// It then removes one of two pairs of self-complementary edges:
+// A->B and B'->A'
+// or
+// A->B' and B->A'
+// The pair to be removed is selected in such a way that its removal
+// does not introduce any dead ends.
+// The code uses the following names:
+// A0 = A
+// A1 = A'
+// B0 = B
+// B1 = B'
+void AssemblyGraph::removeSelfComplementarySquares()
+{
+ AssemblyGraph& cGraph = *this;
+ const bool debug = true;
+
+ vector< pair<edge_descriptor, vertex_descriptor> > outEdgesA0;
+
+
+ // Do this iteratively.
+ while(true) {
+
+
+ // Loop over all possible choices for A0.
+ bool done = false;
+ BGL_FORALL_VERTICES(A0, cGraph, AssemblyGraph) {
+
+ // Gather the children of A.
+ outEdgesA0.clear();
+ BGL_FORALL_OUTEDGES(A0, ce, cGraph, AssemblyGraph) {
+ outEdgesA0.push_back({ce, target(ce, cGraph)});
+ }
+
+ // Look for a reverse complementary pair (B0, B1)
+ // with edges B0->A1 and B1->A1.
+ for(uint64_t i1=0; i1<outEdgesA0.size(); i1++) {
+ const vertex_descriptor B1 = outEdgesA0[i1].second;
+ const uint64_t edgeIdB1 = cGraph[B1].edgeId;
+ const uint64_t edgeIdB0 = assembler.markerGraph.reverseComplementEdge[edgeIdB1];
+ for(uint64_t i0=0; i0<i1; i0++) {
+ const vertex_descriptor B0 = outEdgesA0[i0].second;
+ if(cGraph[B0].edgeId == edgeIdB0) {
+
+ // We found it.
+
+ // Look for the edges B0->A1 and B1->A1.
+ const uint64_t edgeIdA0 = cGraph[A0].edgeId;
+ const uint64_t edgeIdA1 = assembler.markerGraph.reverseComplementEdge[edgeIdA0];
+
+ edge_descriptor B0A1;
+ vertex_descriptor A10 = null_vertex();
+ BGL_FORALL_OUTEDGES(B0, ce, cGraph, AssemblyGraph) {
+ const vertex_descriptor v = target(ce, cGraph);
+ if(cGraph[v].edgeId == edgeIdA1) {
+ B0A1 = ce;
+ A10 = v;
+ break;
+ }
+ }
+ if(A10 == null_vertex()) {
+ continue;
+ }
+
+ edge_descriptor B1A1;
+ vertex_descriptor A11 = null_vertex();
+ BGL_FORALL_OUTEDGES(B1, ce, cGraph, AssemblyGraph) {
+ const vertex_descriptor v = target(ce, cGraph);
+ if(cGraph[v].edgeId == edgeIdA1) {
+ B1A1 = ce;
+ A11 = v;
+ break;
+ }
+ }
+ if(A11 == null_vertex()) {
+ continue;
+ }
+
+ if(A10 != A11) {
+ continue;
+ }
+ const vertex_descriptor A1 = A10;
+
+ // We found a self-complementary square.
+ const edge_descriptor A0B0 = outEdgesA0[i0].first;
+ const edge_descriptor A0B1 = outEdgesA0[i1].first;
+
+ if(debug) {
+ cout << "Found a self-complementary square:\n" <<
+ cGraph[A0].edgeId << " " <<
+ cGraph[B0].edgeId << " " <<
+ cGraph[B1].edgeId << " " <<
+ cGraph[A1].edgeId << "\n" <<
+ bubbleChainStringId(A0B0) << " " <<
+ bubbleChainStringId(A0B1) << " " <<
+ bubbleChainStringId(B0A1) << " " <<
+ bubbleChainStringId(B1A1) << "\n";
+ }
+
+ // Remove two of the edges in the square,
+ // making sure to not introduce dead ends.
+ if(out_degree(A0, cGraph) > 1 and in_degree(A1, cGraph) > 1) {
+ if(in_degree (B0, cGraph) > 1 and out_degree(B1, cGraph) > 1) {
+ boost::remove_edge(A0B0, cGraph);
+ boost::remove_edge(B1A1, cGraph);
+ done = true;
+ } else if(in_degree(B1, cGraph) > 1 and out_degree(B0, cGraph) > 1) {
+ boost::remove_edge(A0B1, cGraph);
+ boost::remove_edge(B0A1, cGraph);
+ done = true;
+ }
+ }
+
+ if(done) {
+ break;
+ }
+ }
+
+ if(done) {
+ break;
+ }
+
+ }
+ if(done) {
+ break;
+ }
+ }
+ if(done) {
+ break;
+ }
+ }
+
+ // If nothing happened, stop the outer iteration.
+ if(not done) {
+ break;
+ }
+ }
+}
diff --git a/src/mode3-AssemblyGraph.hpp b/src/mode3-AssemblyGraph.hpp
new file mode 100644
index 0000000..2468451
--- /dev/null
+++ b/src/mode3-AssemblyGraph.hpp
@@ -0,0 +1,886 @@
+#pragma once
+
+// Shasta
+#include "Base.hpp"
+#include "invalid.hpp"
+#include "mode3-PhasedComponent.hpp"
+#include "MultithreadedObject.hpp"
+#include "shastaTypes.hpp"
+#include "SHASTA_ASSERT.hpp"
+
+// Boost libraries.
+#include <boost/graph/adjacency_list.hpp>
+#include <boost/serialization/vector.hpp>
+
+// Standard library
+#include "algorithm.hpp"
+#include "array.hpp"
+#include <map>
+#include "memory.hpp"
+#include "fstream.hpp"
+#include "string.hpp"
+#include "utility.hpp"
+#include "vector.hpp"
+
+
+
+namespace shasta {
+ namespace mode3 {
+
+        // Each edge of the AssemblyGraph describes a BubbleChain.
+
+ // A Chain is a sequence of MarkerGraphEdgeIds.
+ class Chain;
+
+ // A Bubble is a set of Chains that begin and end at the same MarkerGraphEdgeId.
+ // It can consist of one or more Chains.
+ class Bubble;
+
+ // A BubbleChain is a sequence of Bubbles.
+ class BubbleChain;
+
+ class AssemblyGraph;
+ class AssemblyGraphVertex;
+ class AssemblyGraphEdge;
+ using AssemblyGraphBaseClass = boost::adjacency_list<
+ boost::listS,
+ boost::listS,
+ boost::bidirectionalS,
+ AssemblyGraphVertex,
+ AssemblyGraphEdge>;
+
+ class PrimaryGraph;
+ }
+ class Assembler;
+ class Mode3AssemblyOptions;
+ class OrientedReadId;
+}
+
+
+
+// A Chain is a sequence of MarkerGraphEdgeIds.
+class shasta::mode3::Chain : public vector<MarkerGraphEdgeId> {
+public:
+
+ // Flag used to indicate that this Chain needs to be assembled.
+ // Used by assembleChainsMultithreaded.
+ bool shouldBeAssembled = false;
+ bool wasAssembled = false;
+
+ // Assembled sequence, including the sequence of the first and
+ // last primary marker graph edges.
+ vector<Base> sequence;
+
+ // The internal sequence assembled between consecutive pairs
+ // of MarkerGraphEdgeIds in the chain.
+ // If a local assembly fails, the success flag remains false and the sequence remains empty.
+ class StepSequence {
+ public:
+ vector<Base> sequence;
+ bool success = false;
+ };
+ vector<StepSequence> stepSequences;
+
+
+ MarkerGraphEdgeId second() const
+ {
+ SHASTA_ASSERT(size() > 1);
+ return (*this)[1];
+ }
+ MarkerGraphEdgeId secondToLast() const
+ {
+ SHASTA_ASSERT(size() > 1);
+ return (*this)[size() - 2];
+ }
+
+ template<class Archive> void serialize(Archive & ar, const unsigned int version)
+ {
+ ar & boost::serialization::base_object< vector<MarkerGraphEdgeId> >(*this);
+ }
+};
+
+
+
+class shasta::mode3::Bubble : public vector<Chain> {
+public:
+ bool isHaploid() const
+ {
+ return size() == 1;
+ }
+ bool isDiploid() const
+ {
+ return size() == 2;
+ }
+ bool isGeneral() const
+ {
+ return size() > 2;
+ }
+
+ // Remove duplicate chains.
+ void deduplicate();
+
+ template<class Archive> void serialize(Archive & ar, const unsigned int version)
+ {
+ ar & boost::serialization::base_object< vector<Chain> >(*this);
+ }
+};
+
+
+
+class shasta::mode3::BubbleChain : public vector<Bubble> {
+public:
+ const Bubble& firstBubble() const
+ {
+ SHASTA_ASSERT(not empty());
+ return front();
+ }
+ Bubble& firstBubble()
+ {
+ SHASTA_ASSERT(not empty());
+ return front();
+ }
+ const Bubble& lastBubble() const
+ {
+ SHASTA_ASSERT(not empty());
+ return back();
+ }
+ Bubble& lastBubble()
+ {
+ SHASTA_ASSERT(not empty());
+ return back();
+ }
+
+ uint64_t diploidBubbleCount() const
+ {
+ uint64_t n = 0;
+ for(const Bubble& bubble: *this) {
+ if(bubble.isDiploid()) {
+ ++n;
+ }
+ }
+ return n;
+ }
+
+ // This returns true if this superbubble consists of a single haploid bubble.
+ bool isSimpleChain() const
+ {
+ return size() == 1 and firstBubble().isHaploid();
+ }
+ Chain& getOnlyChain()
+ {
+ SHASTA_ASSERT(isSimpleChain());
+ return firstBubble().front();
+ }
+
+ // Collapse consecutive haploid bubbles.
+ bool compress();
+
+ MarkerGraphEdgeId firstMarkerGraphEdgeId() const
+ {
+ SHASTA_ASSERT(not empty());
+ const Bubble& firstBubble = front();
+ const MarkerGraphEdgeId markerGraphEdgeId = firstBubble.front().front();
+ for(const Chain& chain: firstBubble) {
+ SHASTA_ASSERT(chain.front() == markerGraphEdgeId);
+ }
+ return markerGraphEdgeId;
+ }
+
+ MarkerGraphEdgeId lastMarkerGraphEdgeId() const
+ {
+ SHASTA_ASSERT(not empty());
+ const Bubble& lastBubble = back();
+ const MarkerGraphEdgeId markerGraphEdgeId = lastBubble.front().back();
+ for(const Chain& chain: lastBubble) {
+ SHASTA_ASSERT(chain.back() == markerGraphEdgeId);
+ }
+ return markerGraphEdgeId;
+ }
+
+    // Return the total length of this bubble chain.
+ uint64_t totalLength() const;
+
+
+ template<class Archive> void serialize(Archive & ar, const unsigned int version)
+ {
+ ar & boost::serialization::base_object< vector<Bubble> >(*this);
+ }
+
+};
+
+
+
+class shasta::mode3::AssemblyGraphVertex {
+public:
+ MarkerGraphEdgeId edgeId;
+
+ // Numbering of vertices consecutively starting at zero.
+ // This is computed by renumberVertices, and becomes
+ // invalid as soon as a vertex is added or removed.
+ uint64_t index = invalid<uint64_t>;
+
+ // The id of the Superbubble this vertex belongs to, if any.
+ // Stored by class Superbubbles.
+ uint64_t superbubbleId = invalid<uint64_t>;
+
+ template<class Archive> void serialize(Archive & ar, const unsigned int version)
+ {
+ ar & edgeId;
+ }
+};
+
+
+
+class shasta::mode3::AssemblyGraphEdge : public BubbleChain {
+public:
+ uint64_t id = invalid<uint64_t>;
+
+ template<class Archive> void serialize(Archive & ar, const unsigned int version)
+ {
+ ar & boost::serialization::base_object<BubbleChain>(*this);
+ ar & id;
+ }
+};
+
+
+
+class shasta::mode3::AssemblyGraph:
+ public AssemblyGraphBaseClass,
+ public MultithreadedObject<shasta::mode3::AssemblyGraph> {
+public:
+
+ // Create from a connected component of the PrimaryGraph, then call run.
+ AssemblyGraph(
+ const PrimaryGraph&,
+ uint64_t componentId,
+ const Assembler&,
+ uint64_t threadCount,
+ const Mode3AssemblyOptions& options,
+ bool assembleSequence,
+ bool debug);
+
+ // Load it from a binary archive, then call run.
+ AssemblyGraph(
+ const string& fileName,
+ const Assembler&,
+ uint64_t threadCount,
+ const Mode3AssemblyOptions& options,
+ bool assembleSequence,
+ bool debug);
+
+private:
+
+ // Hide Base defined by the base class.
+ using Base = shasta::Base;
+
+ // Information stored by the constructor.
+ uint64_t componentId;
+ const Assembler& assembler;
+ const Mode3AssemblyOptions& options;
+
+ friend class boost::serialization::access;
+ template<class Archive> void serialize(Archive & ar, const unsigned int version)
+ {
+ ar & boost::serialization::base_object<AssemblyGraphBaseClass>(*this);
+ ar & componentId;
+ ar & nextEdgeId;
+ }
+ void save(const string& fileName) const;
+ void load(const string& fileName);
+
+ void run(
+ uint64_t threadCount,
+ bool assembleSequence,
+ bool debug);
+
+
+
+ // Initial creation from the PrimaryGraph.
+ // Each linear chain of edges in the PrimaryGraph after transitive reduction generates
+ // an AssemblyGraphEdge (BubbleChain) consisting of a single haploid bubble.
+ void create(const PrimaryGraph&, bool debug);
+ uint64_t nextEdgeId = 0;
+ void renumberEdges();
+
+ // Return the vertex corresponding to a given MarkerGraphEdgeId,
+ // creating it if it is not in the given vertexMap.
+ // This is only used in create().
+ vertex_descriptor getVertex(
+ MarkerGraphEdgeId,
+ std::map<MarkerGraphEdgeId, vertex_descriptor>& vertexMap
+ );
+
+ // Create a new vertex with a given MarkerGraphEdgeId.
+ vertex_descriptor createVertex(MarkerGraphEdgeId);
+
+ void removeVertex(vertex_descriptor);
+
+ // Compute vertexIndex for every vertex.
+ // This numbers vertices consecutively starting at zero.
+ // This numbering becomes invalid as soon as a vertex is added or removed.
+ void numberVertices();
+ void clearVertexNumbering();
+
+ // Create a new edge connecting cv0 and cv1.
+ // The new edge will consist of a simple BubbleChain with a single
+ // haploid Bubble with a Chain of length 2.
+ edge_descriptor connect(vertex_descriptor cv0, vertex_descriptor cv1);
+
+ // Compress parallel edges into bubbles, where possible.
+ bool compressParallelEdges();
+
+ // Compress linear sequences of edges (BubbleChains) into longer BubbleChains.
+ bool compressSequentialEdges();
+
+ // Call compress on all BubbleChains to merge adjacent haploid bubbles.
+ bool compressBubbleChains();
+
+ // Call compressParallelEdges, compressSequentialEdges, and compressBubbleChains
+ // iteratively until nothing changes.
+ bool compress();
+
+ // This does the opposite of compress. All bubble chains that
+ // consist of more than one simple haploid bubble are expanded into one
+ // edge for each edge of each bubble.
+ // For optimal results it is best to call compressBubbleChains before expand.
+ void expand();
+
+ // Compute the tangle matrix given in-edges and out-edges.
+ // The last bubble of each in-edge and the first bubble
+ // of each out-edge must be haploid.
+ void computeTangleMatrix(
+ const vector<edge_descriptor>& inEdges,
+ const vector<edge_descriptor>& outEdges,
+ vector< vector<uint64_t> >& tangleMatrix,
+ bool setToZeroForComplementaryPairs
+ ) const;
+
+ // Low level primitives used in detangling.
+ // See the implementation for details.
+ vertex_descriptor cloneAndTruncateAtEnd(edge_descriptor);
+ vertex_descriptor cloneAndTruncateAtBeginning(edge_descriptor);
+
+ // Vertex detangling.
+ // bool detangleVerticesStrict(bool debug);
+ // bool detangleVertexStrict(vertex_descriptor, bool debug);
+ bool detangleVertices(bool debug,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh,
+ bool useBayesianModel,
+ double epsilon,
+ double minLogP);
+ bool detangleVertex(
+ vertex_descriptor,
+ bool debug,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh,
+ bool useBayesianModel,
+ double epsilon,
+ double minLogP);
+
+ // Vertex detangling that can deal with non-haploid bubbles adjacent to the
+ // vertex to be detangled.
+ bool detangleVerticesGeneral(bool debug,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh,
+ bool useBayesianModel,
+ double epsilon,
+ double minLogP);
+ bool detangleVertexGeneral(
+ vertex_descriptor,
+ bool debug,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh,
+ bool useBayesianModel,
+ double epsilon,
+ double minLogP);
+
+ // Split the first/last bubble of a bubble chain.
+ // Used by detangleVertexGeneral to eliminate
+ // non-haploid bubble adjacent to a vertex to be detangled.
+ void splitBubbleChainAtBeginning(edge_descriptor);
+ void splitBubbleChainAtEnd(edge_descriptor);
+
+
+ // Edge detangling.
+ bool detangleEdges(
+ bool debug,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh,
+ bool useBayesianModel,
+ double epsilon,
+ double minLogP);
+ bool detangleEdge(
+ bool debug,
+ std::map<uint64_t, edge_descriptor>& edgeMap,
+ std::map<uint64_t, edge_descriptor>::iterator&,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh,
+ bool useBayesianModel,
+ double epsilon,
+ double minLogP);
+ bool detangleEdgesGeneral(
+ bool debug,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh,
+ bool useBayesianModel,
+ double epsilon,
+ double minLogP);
+ bool detangleEdgeGeneral(
+ bool debug,
+ std::map<uint64_t, edge_descriptor>& edgeMap,
+ std::map<uint64_t, edge_descriptor>::iterator&,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh,
+ bool useBayesianModel,
+ double epsilon,
+ double minLogP);
+ bool detangleEdgesWithSearch(
+ bool debug,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh);
+ bool detangleEdgeWithSearch(
+ bool debug,
+ std::map<uint64_t, edge_descriptor>& edgeMap,
+ std::map<uint64_t, edge_descriptor>::iterator&,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh);
+
+ bool removeSelfComplementaryEdges();
+
+ // Special treatment to detangle back edges that were too long
+ // to be handled by detangleEdges.
+ bool detangleBackEdges(
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh);
+ bool detangleBackEdge(
+ std::map<uint64_t, edge_descriptor>& edgeMap,
+ std::map<uint64_t, edge_descriptor>::iterator&,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh);
+
+ // Bubble cleanup, with the purpose of eliminating most bubbles caused by errors.
+ // See the code for details of what this does.
+ uint64_t cleanupBubbles(
+ bool debug,
+ uint64_t maxOffset,
+ uint64_t chainTerminalCommonThreshold,
+ uint64_t threadCount);
+ uint64_t cleanupBubbles(
+ bool debug,
+ edge_descriptor ce,
+ uint64_t maxOffset,
+ uint64_t chainTerminalCommonThreshold);
+
+
+
+ // Find short superbubbles in the AssemblyGraph.
+ class Superbubble : public vector<vertex_descriptor> {
+ public:
+ vector<vertex_descriptor> entrances;
+ vector<vertex_descriptor> exits;
+
+ // Fill in the superbubble given a single entrance and exit.
+ void fillInFromEntranceAndExit(const AssemblyGraph&);
+ };
+ class Superbubbles {
+ public:
+
+ // This computes connected components using only edges with length up to maxOffset1.
+ Superbubbles(
+ AssemblyGraph&,
+ uint64_t maxOffset1
+ );
+
+ // This uses dominator trees.
+    // It only finds superbubbles with one entrance and one exit.
+ Superbubbles(AssemblyGraph&);
+
+ ~Superbubbles();
+
+        // Return the number of superbubbles.
+ uint64_t size() const
+ {
+ return superbubbles.size();
+ }
+
+ // Return the vertices in the specified superbubble.
+ Superbubble& getSuperbubble(uint64_t superBubbleId)
+ {
+ return superbubbles[superBubbleId];
+ }
+ const Superbubble& getSuperbubble(uint64_t superBubbleId) const
+ {
+ return superbubbles[superBubbleId];
+ }
+
+ // Figure out if a vertex is in the specified superbubble.
+ bool isInSuperbubble(uint64_t superbubbleId, vertex_descriptor cv) const
+ {
+ return cGraph[cv].superbubbleId == superbubbleId;
+ }
+
+ private:
+
+ AssemblyGraph& cGraph;
+
+ // The superbubbles are the connected components with size at least 2,
+ // computed using only the edges with offset up to maxOffset1.
+ vector<Superbubble> superbubbles;
+ };
+
+
+
+
+ // Remove short superbubbles with one entry and one exit.
+ bool removeShortSuperbubbles(
+ bool debug,
+ uint64_t maxOffset1, // Used to define superbubbles
+ uint64_t maxOffset2 // Compared against the offset between entry and exit
+ );
+
+ // Detangle short superbubbles with any number of entrances and exits.
+ bool detangleShortSuperbubbles(
+ bool debug,
+ uint64_t maxOffset1, // Used to define superbubbles
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh,
+ bool useBayesianModel,
+ double epsilon,
+ double minLogP);
+ bool detangleShortSuperbubble(
+ bool debug,
+ const Superbubbles&,
+ uint64_t superbubbleId,
+ uint64_t detangleToleranceLow,
+ uint64_t detangleToleranceHigh,
+ bool useBayesianModel,
+ double epsilon,
+ double minLogP);
+
+
+ // Cleanup/simplify superbubbles that are likely to be caused by errors,
+ // completely or in part.
+ void cleanupSuperbubbles(
+ bool debug,
+ uint64_t maxOffset1, // Used to define superbubbles
+ uint64_t maxOffset2, // Compared against the offset between entry and exit
+ uint64_t chainTerminalCommonThreshold);
+ void cleanupSuperbubble(
+ bool debug,
+ const Superbubbles&,
+ uint64_t superbubbleId,
+ uint64_t maxOffset2, // Compared against the offset between entry and exit
+ uint64_t chainTerminalCommonThreshold,
+ std::set<vertex_descriptor>& previousSuperbubblesVertices);
+
+ // This version of superbubble cleanup uses dominator trees to define superbubbles,
+    // instead of computing connected components using edges of length up to maxOffset1.
+ void cleanupSuperbubbles(
+ bool debug,
+ uint64_t maxOffset2, // Compared against the offset between entry and exit
+ uint64_t chainTerminalCommonThreshold);
+
+ // Split terminal haploid bubbles out of bubble chains, to facilitate detangling.
+ void splitTerminalHaploidBubbles();
+ void splitTerminalHaploidBubbles(edge_descriptor);
+
+ void removeSelfComplementarySquares();
+
+ // Phasing of bubble chains using the PhasingGraph.
+ void phaseBubbleChainsUsingPhasingGraph(
+ bool debug,
+ uint64_t n, // Maximum number of Chain MarkerGraphEdgeIds to use when computing tangle matrices.
+ uint64_t lowThreshold,
+ uint64_t highThreshold,
+ bool useBayesianModel,
+ double epsilon,
+ double minLogP,
+ uint64_t longBubbleThreshold);
+ void phaseBubbleChainUsingPhasingGraph(
+ edge_descriptor e,
+ uint64_t n, // Maximum number of Chain MarkerGraphEdgeIds to use when computing tangle matrices.
+ uint64_t lowThreshold,
+ uint64_t highThreshold,
+ bool useBayesianModel,
+ double epsilon,
+ double minLogP,
+ uint64_t longBubbleThreshold,
+ bool debug);
+ void phaseBubbleChainUsingPhasedComponents(
+ bool debug,
+ edge_descriptor e,
+ const vector< shared_ptr<PhasedComponent> >&,
+ uint64_t longBubbleThreshold);
+
+ // In the phasing graph, each vertex corresponds to a diploid bubble
+ // in the BubbleChain being phased.
+ class TangleMatrix : public array< array<uint64_t, 2>, 2> {
+ public:
+ void analyze(
+ uint64_t lowThreshold,
+ uint64_t highThreshold,
+ int64_t& phase,
+ uint64_t& minConcordant,
+ uint64_t& maxDiscordant,
+ uint64_t& total,
+ double epsilon,
+ double& logPin, // log[P(in-phase)/P(random)] in decibels
+ double& logPout // log[P(out-of-phase)/P(random)] in decibels
+ ) const;
+ };
+
+
+ // Compute the tangle matrix between two incoming chains
+ // and two outgoing chains, taking into account up to
+    // n MarkerGraphEdgeIds for each Chain.
+ void computeTangleMatrix(
+ const array<const Chain*, 2> inChains,
+ const array<const Chain*, 2> outChains,
+ uint64_t n,
+ TangleMatrix&) const;
+
+    // Gather OrientedReadIds from up to n MarkerGraphEdgeIds
+ // near the beginning or end of a chain.
+ void gatherOrientedReadIdsAtBeginning(
+ const Chain&,
+ uint64_t n,
+ vector<OrientedReadId>&) const;
+ void gatherOrientedReadIdsAtEnd(
+ const Chain&,
+ uint64_t n,
+ vector<OrientedReadId>&) const;
+
+
+
+ class PhasingGraphVertex {
+ public:
+ uint64_t positionInBubbleChain;
+ int64_t phase = 0; // +1 or -1 for phased vertices, 0 otherwise
+ };
+
+ class PhasingGraphEdge {
+ public:
+ int64_t phase; // +1 (in phase) or -1 (out of phase)
+
+ // Tangle matrix metrics.
+ // If phase = +1, minConcordant = min(m00, m11), maxDiscordant = max(m01, m10).
+ // If phase = -1, minConcordant = min(m01, m10), maxDiscordant = max(m00, m11).
+ uint64_t minConcordant;
+ uint64_t maxDiscordant;
+ double logPInPhase;
+ double logPOutOfPhase;
+ double logP() const
+ {
+ return max(max(logPInPhase, logPOutOfPhase), fabs(logPInPhase - logPOutOfPhase));
+ }
+
+#if 0
+ bool sortByCounts(const PhasingGraphEdge& that) const
+ {
+ if(maxDiscordant < that.maxDiscordant) {
+ return true;
+ }
+ if(maxDiscordant > that.maxDiscordant) {
+ return false;
+ }
+ return minConcordant > that.minConcordant;
+ }
+ bool sortByProbabilities(const PhasingGraphEdge& that) const
+ {
+ return logP() > that.logP();
+ }
+#endif
+ bool isSpanningTreeEdge = false;
+ };
+ using PhasingGraphBaseClass = boost::adjacency_list<
+ boost::listS,
+ boost::listS,
+ boost::undirectedS,
+ PhasingGraphVertex,
+ PhasingGraphEdge>;
+ class PhasingGraph : public PhasingGraphBaseClass {
+ public:
+ void phase(bool debug);
+ void phase1(bool debug, bool useBayesianModel);
+ bool isConsistent(edge_descriptor) const;
+ void writeGraphviz(const string& fileName) const;
+ vector< shared_ptr<PhasedComponent> > phasedComponents;
+
+ // Sort edges in order of decreasing significance:
+ // - If using the Bayesian model, logP.
+ // - Otherwise, minConcordant/maxDiscordant.
+ void sortEdges(vector<edge_descriptor>& sortedEdges, bool useBayesianModel) const;
+ };
+
+
+
+ // Phasing of bubble chains using the PhasingTable.
+ void phaseBubbleChainsUsingPhasingTable(
+ const string& debugOutputFileNamePrefix,
+ double phaseErrorThreshold,
+ double bubbleErrorThreshold,
+ uint64_t longBubbleThreshold);
+ void phaseBubbleChainUsingPhasingTable(
+ const string& debugOutputFileNamePrefix,
+ edge_descriptor e,
+ double phaseErrorThreshold,
+ double bubbleErrorThreshold,
+ uint64_t longBubbleThreshold);
+ void cleanupBubbleChainUsingPhasingTable(
+ const string& debugOutputFileNamePrefix,
+ edge_descriptor e,
+ double phaseErrorThreshold,
+ double bubbleErrorThreshold,
+ uint64_t longBubbleThreshold);
+
+
+
+ // Optimize chains before assembly, to remove assembly steps with
+    // less than minCommon reads.
+ void optimizeChains(
+ bool debug,
+ uint64_t minCommon,
+ uint64_t k
+ );
+ void optimizeChain(
+ bool debug,
+ Chain&,
+ uint64_t minCommon,
+ uint64_t k
+ );
+
+ // Assemble sequence for a single Chain.
+ void assembleChain(
+ Chain&,
+ uint64_t chainTerminalCommonThreshold);
+
+ // Multithreaded version of sequence assembly.
+ // This only assembles the chains that have the shouldBeAssembled flag set.
+ void assembleChainsMultithreaded(
+ uint64_t chainTerminalCommonThreshold,
+ uint64_t threadCount);
+ // This sets the shouldBeAssembled flag for all chains, then
+ // calls assembleChainsMultithreaded.
+ void assembleAllChainsMultithreaded(
+ uint64_t chainTerminalCommonThreshold,
+ uint64_t threadCount);
+ // This clears the shouldBeAssembled flag from all Chains.
+ void clearAllShouldBeAssembledFlags();
+
+ void assembleChainsMultithreadedTheadFunction(uint64_t threadId);
+ void combineStepSequences(Chain&);
+ class AssemblyStep {
+ public:
+        edge_descriptor e; // This identifies the BubbleChain.
+ uint64_t positionInBubbleChain; // This identifies the Bubble.
+ uint64_t indexInBubble; // This identifies the Chain.
+ uint64_t positionInChain;
+ uint64_t offsetInBases;
+
+ // For better load balancing, order them by decreasing offsetInBases.
+ bool operator<(const AssemblyStep& that) const
+ {
+ return offsetInBases > that.offsetInBases;
+ }
+ };
+ void runAssemblyStep(
+ uint64_t chainTerminalCommonThreshold,
+ const AssemblyStep&);
+ void runAssemblyStep(
+ Chain& chain,
+ uint64_t positionInChain,
+ uint64_t chainTerminalCommonThreshold);
+ class AssembleChainsMultithreadedData {
+ public:
+ uint64_t chainTerminalCommonThreshold;
+ vector<AssemblyStep> assemblySteps;
+ };
+ AssembleChainsMultithreadedData assembleChainsMultithreadedData;
+
+
+
+ // Get the lengths of Chains assembled sequence for each Chain P-value.
+ // On return, chainLengths[pValue] contains the lengths of all
+ // Chains with that pValue, sorted in decreasing order.
+ // This can be used for N50 statistics.
+public:
+ void getChainLengthsByPValue(vector< vector<uint64_t> >& chainLengths) const;
+
+ // Get the lengths of all non-trivial bubble chains.
+ void getBubbleChainLengths(vector<uint64_t>&) const;
+
+ // Given a vector of lengths in decreasing order, compute the total length and N50.
+ static pair<uint64_t, uint64_t> n50(const vector<uint64_t>&);
+private:
+
+ // Output.
+ void write(const string& name, bool writeSequence = false) const;
+ void writeCsv(const string& fileNamePrefix) const;
+public:
+ void writeCsvSummary(ostream&) const;
+private:
+ void writeBubbleChainsCsv(const string& fileNamePrefix) const;
+ void writeBubbleChainsPhasingTables(const string& fileNamePrefix, double phaseErrorThreshold) const;
+ void writeBubblesCsv(const string& fileNamePrefix) const;
+ void writeChainsCsv(const string& fileNamePrefix) const;
+ void writeChainsDetailsCsv(const string& fileNamePrefix) const;
+ void writeChainDetailsCsv(ostream&, edge_descriptor, bool writeHeader) const;
+ void writeGraphviz(const string& fileNamePrefix, bool labels) const;
+ void writeGfa(const string& fileNamePrefix) const;
+ void writeGfaExpanded(
+ const string& fileNamePrefix,
+ bool includeSequence,
+ bool useSequenceLength) const;
+ void writeGfaExpanded(
+ ostream&,
+ bool includeSequence,
+ bool useSequenceLength) const;
+ void writeAssemblyDetails() const;
+public:
+ void writeGfaSegmentsExpanded(
+ ostream&,
+ bool includeSequence,
+ bool useSequenceLength) const;
+ void writeGfaLinksExpanded(ostream&) const;
+ static void writeGfaHeader(ostream&);
+ void writeFastaExpanded(ostream&) const;
+private:
+ void writeFastaExpanded(const string& fileNamePrefix) const;
+ void writeSnapshot(uint64_t& snapshotNumber) const;
+
+ string bubbleChainStringId(edge_descriptor) const;
+ string bubbleStringId(edge_descriptor, uint64_t positionInBubbleChain) const;
+ string chainStringId(edge_descriptor, uint64_t positionInBubbleChain, uint64_t indexInBubble) const;
+
+
+ // Return average coverage for the internal MarkerGraphEdgeIds of a Chain.
+ // For chain of length 2, this returns 0.
+ double primaryCoverage(const Chain&) const;
+
+ // This returns a "P-value" for a Chain defined as follows:
+ // If the Chain is the only chain of a BubbleChain, the P-value is 0.
+ // Otherwise, the P-value is the ploidy of the Bubble that the Chain belongs to.
+ uint64_t chainPValue(edge_descriptor, uint64_t positionInBubbleChain, uint64_t indexInBubble) const;
+
+ uint64_t chainOffset(const Chain&) const;
+ void bubbleOffset(
+ const Bubble&,
+ uint64_t& averageOffset,
+ uint64_t& minOffset,
+ uint64_t& maxOffset
+ ) const;
+ bool bubbleOffsetNoException(
+ const Bubble&,
+ uint64_t& averageOffset,
+ uint64_t& minOffset,
+ uint64_t& maxOffset
+ ) const;
+ void bubbleChainOffset(
+ const BubbleChain&,
+ uint64_t& averageOffset,
+ uint64_t& minOffset,
+ uint64_t& maxOffset
+ ) const;
+};
+
diff --git a/src/mode3-AssemblyPath.cpp b/src/mode3-AssemblyPath.cpp
deleted file mode 100644
index 4ac9dc2..0000000
--- a/src/mode3-AssemblyPath.cpp
+++ /dev/null
@@ -1,1269 +0,0 @@
-// Shasta.
-#include "mode3-AssemblyPath.hpp"
-#include "mode3-SegmentPairInformation.hpp"
-#include "assembleMarkerGraphPath.hpp"
-#include "ConsensusCaller.hpp"
-#include "deduplicate.hpp"
-#include "html.hpp"
-#include "Marker.hpp"
-#include "MarkerGraph.hpp"
-#include "Reads.hpp"
-#include "mode3.hpp"
-#include "timestamp.hpp"
-using namespace shasta;
-using namespace mode3;
-
-// Spoa.
-#include "spoa/spoa.hpp"
-
-// Seqan.
-#include <seqan/align.h>
-
-// Standard library.
-#include "fstream.hpp"
-
-
-
-// Assemble sequence for an AssemblyPath.
-void AssemblyPath::assemble(const AssemblyGraph& assemblyGraph)
-{
- const bool debug = false;
- if(debug) {
- cout << timestamp << "AssemblyPath::assemble begins." << endl;
- }
-
- // Assemble each segment on the path.
- assembleSegments(assemblyGraph);
-
- // Assemble links in this assembly path.
- initializeLinks(assemblyGraph);
- assembleLinks(assemblyGraph);
-
- if(debug) {
- writeSegmentSequences();
- writeLinkSequences(assemblyGraph);
- }
-
- assemble();
-
- if(debug) {
- cout << timestamp << "AssemblyPath::assemble ends." << endl;
- }
-}
-
-// Initialize the links.
-// This only resizes the links vector and fills in the id and isTrivial
-// fields of each link.
-void AssemblyPath::initializeLinks(const AssemblyGraph& assemblyGraph)
-{
- SHASTA_ASSERT(segments.size() > 1);
- links.resize(segments.size()-1);
-
- // Fill in the id and isTrivial fields of each link.
- for(uint64_t position0=0; position0<links.size(); position0++) {
- const uint64_t position1 = position0 + 1;
-
- // Access the source and target segments of this link.
- // We will process the link between segmentId0 and segmentId1.
- AssemblyPathSegment& segment0 = segments[position0];
- AssemblyPathSegment& segment1 = segments[position1];
-
- // Fill in the id and isTrivial fields.
- AssemblyPathLink& assemblyPathLink = links[position0];
- assemblyPathLink.id = assemblyGraph.findLink(segment0.id, segment1.id);
- const AssemblyGraph::Link& link = assemblyGraph.links[assemblyPathLink.id];
- assemblyPathLink.isTrivial = link.segmentsAreAdjacent;
-
- SHASTA_ASSERT(segment0.id == link.segmentId0);
- SHASTA_ASSERT(segment1.id == link.segmentId1);
- }
-
-
- // Fill in the previousPrimarySegmentId field of each link.
- SHASTA_ASSERT(segments.front().isPrimary);
- uint64_t lastPrimarySegmentSeen = invalid<uint64_t>;
- for(uint64_t position=0; position<links.size(); position++) {
- const AssemblyPathSegment& segment = segments[position];
- if(segment.isPrimary) {
- lastPrimarySegmentSeen = segment.id;
- }
- links[position].previousPrimarySegmentId = lastPrimarySegmentSeen;
- }
-
-
-
- // Fill in the nextPrimarySegmentId field of each link.
- SHASTA_ASSERT(segments.back().isPrimary);
- lastPrimarySegmentSeen = invalid<uint64_t>;
- for(uint64_t position = links.size() - 1; /* Check later */; position--) {
- const AssemblyPathSegment& segment = segments[position + 1];
- if(segment.isPrimary) {
- lastPrimarySegmentSeen = segment.id;
- }
- links[position].nextPrimarySegmentId = lastPrimarySegmentSeen;
-
- if(position == 0) {
- break;
- }
- }
-}
-
-
-
-// Assemble links in this assembly path.
-void AssemblyPath::assembleLinks(const AssemblyGraph& assemblyGraph)
-{
- const bool debug = false;
-
- SHASTA_ASSERT((assemblyGraph.k % 2) == 0);
-
- // Don't skip any bases at the beginning of the first
- // segment and at the end of the last segment.
- segments.front().leftTrim = 0;
- segments.back().rightTrim = 0;
-
- ofstream html;
- if(debug) {
- html.open("Msa.html");
- }
-
- // Loop over links in the path.
- links.resize(segments.size()-1);
- for(uint64_t position0=0; position0<links.size(); position0++) {
- assembleLinkAtPosition(assemblyGraph, position0, html);
- }
-}
-
-
-
-void AssemblyPath::assembleLinkAtPosition(
- const AssemblyGraph& assemblyGraph,
- uint64_t position0,
- ostream& html)
-{
- const bool debug = false;
-
- AssemblyPathLink& link = links[position0];
- const uint64_t position1 = position0 + 1;
-
- // Access the source and target segments of this link.
- // We will process the link between segmentId0 and segmentId1.
- AssemblyPathSegment& segment0 = segments[position0];
- AssemblyPathSegment& segment1 = segments[position1];
-
- if(debug) {
- cout << "Assembling link " << link.id << " " << segment0.id << "->" << segment1.id <<
- " at position " << position0 << " in the assembly path." << endl;
- }
-
- if(link.isTrivial) {
-
- // The two segments are consecutive in the marker graph.
- // This is a trivial link because the two segments share a terminal
- // marker graph vertex.
- // Just trim from the assembly the last k/2 RLE bases of segmentId0
- // and the first k/2 RLE bases of segmentId1.
- assembleTrivialLink(segment0, segment1, link, assemblyGraph.k);
-
- } else {
-
- assembleNonTrivialLink(
- assemblyGraph,
- segment0,
- segment1,
- link,
- html);
- }
-}
-
-
-
-void AssemblyPath::assembleNonTrivialLink(
- const AssemblyGraph& assemblyGraph,
- AssemblyPathSegment& segment0,
- AssemblyPathSegment& segment1,
- AssemblyPathLink& link,
- ostream& html)
-{
- const bool debug = false;
-
-
- // First, find:
- // - The position in segmentId0 of the leftmost transition.
- // - The position in segmentId1 of the rightmost transition.
- uint64_t minEdgePosition0 = assemblyGraph.markerGraphPaths[segment0.id].size();
- uint64_t maxEdgePosition1 = 0;
- for(const auto& p: assemblyGraph.transitions[link.id]) {
- const OrientedReadId orientedReadId = p.first;
-
- // If not in previousPrimarySegmentId or nextPrimarySegmentId, skip it.
- if(not(
- assemblyGraph.segmentContainsOrientedRead(link.previousPrimarySegmentId, orientedReadId)
- or
- assemblyGraph.segmentContainsOrientedRead(link.nextPrimarySegmentId, orientedReadId)
- )) {
- continue;
- }
-
- // Access the transition from segmentId0 to segmentId1 for this oriented read.
- const Transition& transition = p.second;
-
- minEdgePosition0 = min(minEdgePosition0, uint64_t(transition[0].position));
- maxEdgePosition1 = max(maxEdgePosition1, uint64_t(transition[1].position));
- }
-
- // When getting here:
- // - minEdgePosition0 is the leftmost position of the transitions in path0.
- // - maxEdgePosition1 is the rightmost position of the transitions in path1.
- // These positions are edge positions in markerGraphPath0 and markerGraphPath1.
- // We will do a multiple sequence alignment of the oriented reads,
- // using the sequence of segmentId0 to extend to the left all reads to minEdgePosition0,
- // and using the sequence of segmentId1 to extend to the right all reads to maxEdgePosition1,
-
- // Get the corresponding vertex positions in segmentId0 and segmentId1.
- const uint64_t minVertexPosition0 = minEdgePosition0 + 1;
- const uint64_t maxVertexPosition1 = maxEdgePosition1;
-
- // To compute an MSA anchored at both sides,we will extend the
- // sequence of each read to the left/right using the sequence of
- // adjacent segments.
- const AssembledSegment& assembledSegment0 = segment0.assembledSegment;
- SHASTA_ASSERT(not assembledSegment0.runLengthSequence.empty());
- const AssembledSegment& assembledSegment1 = segment1.assembledSegment;
- SHASTA_ASSERT(not assembledSegment1.runLengthSequence.empty());
-
-
- // Now extract the portion of each oriented read sequence that
- // will be used to assemble this link.
- vector<OrientedReadId> orientedReadIdsForAssembly;
- vector< vector<Base> > orientedReadsSequencesForAssembly;
- vector< vector<uint32_t> > orientedReadsRepeatCountsForAssembly;
- for(const auto& p: assemblyGraph.transitions[link.id]) {
- const OrientedReadId orientedReadId = p.first;
-
- // If not in previousPrimarySegmentId or nextPrimarySegmentId, skip it.
- if(not(
- assemblyGraph.segmentContainsOrientedRead(link.previousPrimarySegmentId, orientedReadId)
- or
- assemblyGraph.segmentContainsOrientedRead(link.nextPrimarySegmentId, orientedReadId)
- )) {
- continue;
- }
-
- // Access the transition from segmentId0 to segmentId1 for this oriented read.
- const Transition& transition = p.second;
-
- // Get the ordinals of the last appearance of this oriented
- // read on segmentId0 and the first on segmentId1,
- // and the corresponding markers.
- const uint32_t ordinal0 = transition[0].ordinals[1];
- const uint32_t ordinal1 = transition[1].ordinals[0];
- const CompressedMarker& marker0 = assemblyGraph.markers[orientedReadId.getValue()][ordinal0];
- const CompressedMarker& marker1 = assemblyGraph.markers[orientedReadId.getValue()][ordinal1];
-
- // Get the positions of these markers on the oriented read.
- // If using RLE, these are RLE positions.
- const uint32_t position0 = marker0.position;
- const uint32_t position1 = marker1.position;
-
- // Extract the sequence between these markers (including the markers).
- vector<Base> orientedReadSequence;
- vector<uint8_t> orientedReadRepeatCounts;
- if(assemblyGraph.readRepresentation == 1) {
- // RLE.
- for(uint64_t position=position0; position<position1+assemblyGraph.k; position++) {
- Base b;
- uint8_t r;
- tie(b, r) = assemblyGraph.reads.getOrientedReadBaseAndRepeatCount(orientedReadId, uint32_t(position));
- orientedReadSequence.push_back(b);
- orientedReadRepeatCounts.push_back(r);
- }
- } else {
- // Raw sequence.
- for(uint64_t position=position0; position<position1+assemblyGraph.k; position++) {
- const Base b = assemblyGraph.reads.getOrientedReadBase(orientedReadId, uint32_t(position));
- orientedReadSequence.push_back(b);
- orientedReadRepeatCounts.push_back(uint8_t(1));
- }
- }
-
- // We need to extend the sequence of this read to the left,
- // using segmentId0 sequence, up to minVertexPosition0,
- // so the portions of all reads we will be using for the MSA
- // all begin in the same place.
- vector<Base> leftSequence;
- vector<uint32_t> leftRepeatCounts;
- const uint64_t vertexPosition0 = transition[0].position + 1; // Add 1 to get vertex position.
- const uint64_t begin0 = assembledSegment0.vertexOffsets[minVertexPosition0];
- const uint64_t end0 = assembledSegment0.vertexOffsets[vertexPosition0];
- for(uint64_t position=begin0; position!=end0; position++) {
- leftSequence.push_back(assembledSegment0.runLengthSequence[position]);
- leftRepeatCounts.push_back(assembledSegment0.repeatCounts[position]);
- }
-
- vector<Base> rightSequence;
- vector<uint32_t> rightRepeatCounts;
- const uint64_t vertexPosition1 = transition[1].position;
- const uint64_t begin1 = assembledSegment1.vertexOffsets[vertexPosition1] + assemblyGraph.k;
- const uint64_t end1 = assembledSegment1.vertexOffsets[maxVertexPosition1] + assemblyGraph.k;
- for(uint64_t position=begin1; position!=end1; position++) {
- rightSequence.push_back(assembledSegment1.runLengthSequence[position]);
- rightRepeatCounts.push_back(assembledSegment1.repeatCounts[position]);
- }
-
- // Construct the extended sequence for this oriented read,
- // to be used in the MSA.
- vector<Base> orientedReadExtendedSequence;
- vector<uint32_t> orientedReadExtendedRepeatCounts;
- const auto addToExtendedSequence = back_inserter(orientedReadExtendedSequence);
- copy(leftSequence, addToExtendedSequence);
- copy(orientedReadSequence, addToExtendedSequence);
- copy(rightSequence, addToExtendedSequence);
- const auto addToRepeatCounts = back_inserter(orientedReadExtendedRepeatCounts);
- copy(leftRepeatCounts, addToRepeatCounts);
- copy(orientedReadRepeatCounts, addToRepeatCounts);
- copy(rightRepeatCounts, addToRepeatCounts);
-
- orientedReadIdsForAssembly.push_back(orientedReadId);
- orientedReadsSequencesForAssembly.push_back(orientedReadExtendedSequence);
- orientedReadsRepeatCountsForAssembly.push_back(orientedReadExtendedRepeatCounts);
-
- if(debug) {
- copy(orientedReadExtendedSequence, ostream_iterator<Base>(cout));
- cout << " " << orientedReadId << endl;
- }
- }
-
- // Store coverage for this link.
- link.coverage = orientedReadIdsForAssembly.size();
-
- // Compute the consensus sequence for the link.
- if(html) {
- html << "<h2>Link " << link.id << "</h2>\n";
- }
- computeLinkConsensusUsingSpoa(
- orientedReadIdsForAssembly,
- orientedReadsSequencesForAssembly,
- orientedReadsRepeatCountsForAssembly,
- assemblyGraph.readRepresentation,
- assemblyGraph.consensusCaller,
- debug,
- html,
- link.msaRleSequence,
- link.msaRepeatCounts
- );
- SHASTA_ASSERT(link.msaRleSequence.size() == link.msaRepeatCounts.size());
-
- if(debug) {
- cout << "Consensus RLE sequence length before trimming " << link.msaRleSequence.size() << endl;
- cout << "Portion of segment on left involved in the MSA begins at position " <<
- assembledSegment0.vertexOffsets[minVertexPosition0] << endl;
- cout << "Portion of segment on right involved in the MSA ends at position " <<
- assembledSegment1.vertexOffsets[maxVertexPosition1] + assemblyGraph.k << endl;
- }
-
- // Count the number of identical (RLE) bases at the beginning of the
- // link consensus sequence and of the segmentId0 sequence portion
- // involved in assembling this link.
- uint64_t identicalOnLeft = 0;
- const uint64_t begin0 = assembledSegment0.vertexOffsets[minVertexPosition0];
- const uint64_t end0 = assembledSegment0.runLengthSequence.size();
- for(uint64_t i=begin0; (i!=end0 and (i-begin0)<link.msaRleSequence.size()); i++) {
- if(link.msaRleSequence[i-begin0] == assembledSegment0.runLengthSequence[i]) {
- // cout << "*** " << begin0 << " " << end0 << " " << i << endl;
- ++identicalOnLeft;
- } else {
- break;
- }
- }
- if(debug) {
- cout << "Identical on left: " << identicalOnLeft << endl;
- }
-
- // Count the number of identical (RLE) bases at the end of the
- // link consensus sequence and the beginning of segmentId1 .
- uint64_t identicalOnRight = 0;
- const uint64_t end1 = assembledSegment1.vertexOffsets[maxVertexPosition1] + assemblyGraph.k;
- for(uint64_t i=end1-1; ; i--) {
- const uint64_t j = link.msaRleSequence.size() - (end1 - i);
- if(link.msaRleSequence[j] == assembledSegment1.runLengthSequence[i]) {
- // cout << "*** " << i << " " << assembledSegment1.runLengthSequence[i] << " " <<
- // j << " " << consensusRleSequence[j] << endl;
- ++identicalOnRight;
- } else {
- break;
- }
- if(i == 0) {
- break;
- }
- if(j == 0) {
- break;
- }
- }
- identicalOnRight = min(identicalOnRight, link.msaRleSequence.size()-identicalOnLeft);
- if(debug) {
- cout << "Identical on right: " << identicalOnRight << endl;
- }
-
- // Trim these identical bases from the link consensus sequence.
- link.leftTrim = identicalOnLeft;
- link.rightTrim = identicalOnRight;
-
- // Compute and store the number of bases to be trimmed at the end of segmentId0
- // and at the beginning of segmentId1.
- segment0.rightTrim =
- assembledSegment0.runLengthSequence.size() -
- assembledSegment0.vertexOffsets[minVertexPosition0] -
- identicalOnLeft;
- segment1.leftTrim =
- assembledSegment1.vertexOffsets[maxVertexPosition1] + assemblyGraph.k
- - identicalOnRight;
-}
-
-
-
-void AssemblyPath::assembleTrivialLink(
- AssemblyPathSegment& segment0,
- AssemblyPathSegment& segment1,
- AssemblyPathLink& link,
- uint64_t k)
-{
- SHASTA_ASSERT(link.isTrivial);
- SHASTA_ASSERT(link.msaRleSequence.empty());
- SHASTA_ASSERT(link.msaRepeatCounts.empty());
- SHASTA_ASSERT(link.leftTrim == 0);
- SHASTA_ASSERT(link.rightTrim == 0);
-
- // Just trim k/2 bases from the adjacent segments,
- // because they are adjacent in the marker graph.
- segment0.rightTrim = k/2;
- segment1.leftTrim = k/2;
-}
-
-
-
-void AssemblyPath::clear()
-{
- segments.clear();
- links.clear();
-}
-
-
-
-// Assemble each segment on the path.
-void AssemblyPath::assembleSegments(const AssemblyGraph& assemblyGraph)
-{
- for(uint64_t i=0; i<segments.size(); i++) {
- AssemblyPathSegment& segment = segments[i];
- assembleMarkerGraphPath(
- assemblyGraph.readRepresentation,
- assemblyGraph.k,
- assemblyGraph.markers,
- assemblyGraph.markerGraph,
- assemblyGraph.markerGraphPaths[segment.id],
- false,
- segment.assembledSegment);
- }
-}
-
-
-
-void AssemblyPath::writeSegmentSequences()
-{
- ofstream fasta("PathSegmentsSequence.fasta");
- ofstream txt("PathSegmentsRleSequence.txt");
-
- for(uint64_t i=0; i<segments.size(); i++) {
- const AssemblyPathSegment& segment = segments[i];
- const uint64_t segmentId = segment.id;
- const AssembledSegment& assembledSegment = segment.assembledSegment;
-
- if(segment.leftTrim + segment.rightTrim > assembledSegment.runLengthSequence.size()) {
- continue;
- }
-
- // Write the trimmed RLE sequence to txt.
- const auto trimmedRleSequence = segment.trimmedRleSequence();
- const auto trimmedRepeatCounts = segment.trimmedRepeatCounts();
- txt << "S" << i << " " << segmentId << "\n";
- copy( trimmedRleSequence, ostream_iterator<Base>(txt));
- txt << "\n";
- for(const uint32_t r: trimmedRepeatCounts) {
- txt << repeatCountCharacter(r);
- }
- txt << "\n";
-
- // Write the trimmed raw sequence to fasta.
- vector<Base> trimmedRawSequence;
- segment.getTrimmedRawSequence(trimmedRawSequence);
- fasta <<
- ">S" << i <<
- " segment " << segmentId <<
- ", length " << trimmedRawSequence.size() << "\n";
- copy(trimmedRawSequence, ostream_iterator<Base>(fasta));
- fasta << "\n";
- }
-}
-
-
-
-void AssemblyPath::writeLinkSequences(const AssemblyGraph& assemblyGraph)
-{
- ofstream fasta("PathLinksSequence.fasta");
- ofstream txt("PathLinksRleSequence.txt");
-
- for(uint64_t i=0; i<segments.size()-1; i++) {
- const uint64_t segmentId0 = segments[i].id;
- const uint64_t segmentId1 = segments[i+1].id;
- const uint64_t linkId = assemblyGraph.findLink(segmentId0, segmentId1);
- const span<const Base> rleSequence = links[i].trimmedRleSequence();
- const span<const uint32_t> repeatCounts = links[i].trimmedRepeatCounts();
- SHASTA_ASSERT(rleSequence.size() == repeatCounts.size());
- if(rleSequence.empty()) {
- continue;
- }
-
- fasta <<
- ">L" << i <<
- " link " << linkId << " " << segmentId0 << "->"<< segmentId1 << "\n";
- for(uint64_t j=0; j<rleSequence.size(); j++) {
- const Base b = rleSequence[j];
- const uint64_t repeatCount = repeatCounts[j];
- for(uint64_t k=0; k<repeatCount; k++) {
- fasta << b;
- }
- }
- fasta << "\n";
-
- txt << "L" << i <<
- " link " << linkId << " " << segmentId0 << "->"<< segmentId1 << "\n";
- copy(rleSequence, ostream_iterator<Base>(txt));
- txt << "\n";
- for(const uint32_t r: repeatCounts) {
- txt << repeatCountCharacter(r);
- }
- txt << "\n";
- }
-}
-
-
-
-// Compute consensus sequence for Link, given sequences of
-// the oriented reads, which must all be anchored on both sides.// Lower level version.
-void AssemblyPath::computeLinkConsensusUsingSpoa(
- const vector<OrientedReadId> orientedReadIds,
- const vector< vector<Base> > rleSequences,
- const vector< vector<uint32_t> > repeatCounts,
- uint64_t readRepresentation,
- const ConsensusCaller& consensusCaller,
- bool debug,
- ostream& html,
- vector<Base>& consensusRleSequence,
- vector<uint32_t>& consensusRepeatCounts
- )
-{
- SHASTA_ASSERT(rleSequences.size() == orientedReadIds.size());
- SHASTA_ASSERT(repeatCounts.size() == orientedReadIds.size());
-
- // Create the spoa alignment engine and elignment graph.
- const spoa::AlignmentType alignmentType = spoa::AlignmentType::kNW;
- const int8_t match = 1;
- const int8_t mismatch = -1;
- const int8_t gap = -1;
- auto spoaAlignmentEngine = spoa::AlignmentEngine::Create(alignmentType, match, mismatch, gap);
- spoa::Graph spoaAlignmentGraph;
-
- // Add the oriented read sequences to the alignment.
- string sequenceString;
- for(const vector<Base>& sequence: rleSequences) {
-
- // Add it to the alignment.
- sequenceString.clear();
- for(const Base base: sequence) {
- sequenceString += base.character();
- }
- auto alignment = spoaAlignmentEngine->Align(sequenceString, spoaAlignmentGraph);
- spoaAlignmentGraph.AddAlignment(alignment, sequenceString);
- }
-
- // Compute the multiple sequence alignment.
- const vector<string> msa = spoaAlignmentGraph.GenerateMultipleSequenceAlignment();
- const string consensus = spoaAlignmentGraph.GenerateConsensus();
- const uint64_t msaLength = msa.front().size();
- if(debug) {
- cout << "Multiple sequence alignment has length " << msaLength << ":" << endl;
- for(const string& s: msa) {
- cout << s << endl;
- }
- }
-
-
- // Compute coverage for each base at each position of the MSA.
- // Use position 4 for gaps.
- vector<Coverage> coverage(msaLength);
- for(uint64_t i=0; i<orientedReadIds.size(); i++) {
- const OrientedReadId orientedReadId = orientedReadIds[i];
- const vector<Base>& rleSequence = rleSequences[i];
- const vector<uint32_t>& repeatCount = repeatCounts[i];
- const string& msaString = msa[i];
-
- // Here:
- // rPosition = position in rle sequence of oriented read.
- // aPosition = position in alignment
- uint64_t rPosition = 0;
- for(uint64_t aPosition=0; aPosition<msaLength; aPosition++) {
- const AlignedBase alignedBase = AlignedBase::fromCharacter(msaString[aPosition]);
- if(alignedBase.isGap()) {
- coverage[aPosition].addRead(alignedBase, orientedReadId.getStrand(), 0);
- } else {
- SHASTA_ASSERT(AlignedBase(rleSequence[rPosition]) == alignedBase);
- if(readRepresentation == 1) {
- coverage[aPosition].addRead(
- alignedBase,
- orientedReadId.getStrand(),
- repeatCount[rPosition]);
- } else {
- coverage[aPosition].addRead(
- alignedBase,
- orientedReadId.getStrand(),
- 1);
- }
- ++rPosition;
- }
- }
- SHASTA_ASSERT(rPosition == rleSequence.size());
- }
-
-
-
- // Compute consensus base and repeat count at every position in the alignment.
- vector<AlignedBase> msaConsensusSequence(msaLength);
- vector<uint32_t> msaConsensusRepeatCount(msaLength);
- vector<uint64_t> msaConsensusDiscordantCount(msaLength);
- for(uint64_t aPosition=0; aPosition<msaLength; aPosition++) {
- const Coverage& c = coverage[aPosition];
- const Consensus consensus = consensusCaller(c);
- msaConsensusSequence[aPosition] = consensus.base;
- msaConsensusRepeatCount[aPosition] = uint32_t(consensus.repeatCount);
-
- // Compute discordant count at this position of the alignment.
- msaConsensusDiscordantCount[aPosition] = 0;
- for(uint64_t b=0; b<5; b++) {
- if(b != consensus.base.value) {
- msaConsensusDiscordantCount[aPosition] += c.coverage(AlignedBase::fromInteger(b));
- }
- }
- }
-
-
-
- // Fill in the output arguments.
- // These are the same as msaConsensusSequence and msaConsensusRepeatCount,
- // but with the gap bases removed.
- consensusRleSequence.clear();
- consensusRepeatCounts.clear();
- for(uint64_t aPosition=0; aPosition<msaLength; aPosition++) {
- const AlignedBase alignedBase = msaConsensusSequence[aPosition];
- if(not alignedBase.isGap()) {
- consensusRleSequence.push_back(Base(alignedBase));
- consensusRepeatCounts.push_back(msaConsensusRepeatCount[aPosition]);
- }
- }
-
-
-
- // Html output of the alignment.
- if(html) {
- html << "Coverage " << rleSequences.size() << "<br>\n";
- html << "Alignment length " << msaLength << "<br>\n";
- html << "<div style='font-family:monospace;white-space:nowrap;'>\n";
- for(uint64_t i=0; i<orientedReadIds.size(); i++) {
- const OrientedReadId orientedReadId = orientedReadIds[i];
- const string& msaString = msa[i];
-
- for(const char c: msaString) {
- const AlignedBase alignedBase = AlignedBase::fromCharacter(c);
- if(alignedBase.isGap()) {
- html << alignedBase;
- } else {
- html << "<span style='background-color:" << alignedBase.htmlColor() <<
- "'>" << alignedBase << "</span>";
- }
- }
-
- // If using the RLE representation, also write the
- // repeat count at each position.
- if(readRepresentation == 1) {
- const vector<Base>& rleSequence = rleSequences[i];
- const vector<uint32_t>& repeatCount = repeatCounts[i];
-
- // Here:
- // rPosition = position in RLE sequence of oriented read.
- // aPosition = position in alignment
- uint64_t rPosition = 0;
- html << "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;";
- for(uint64_t aPosition=0; aPosition<msaLength; aPosition++) {
- const AlignedBase alignedBase = AlignedBase::fromCharacter(msaString[aPosition]);
- if(alignedBase.isGap()) {
- html << alignedBase;
- } else {
- SHASTA_ASSERT(AlignedBase(rleSequence[rPosition]) == alignedBase);
- const uint64_t r = repeatCount[rPosition];
- html << "<span style='background-color:" << alignedBase.htmlColor() <<
- "'>";
- if(r < 10) {
- html << r;
- } else {
- html << "*";
- }
- html << "</span>";
- ++rPosition;
- }
- }
- SHASTA_ASSERT(rPosition == rleSequence.size());
- }
-
- html << " " << orientedReadId << "<br>\n";
- }
-
-
-
- // Also write the consensus.
- html << "<br>\n";
- for(uint64_t aPosition=0; aPosition<msaLength; aPosition++) {
- const AlignedBase alignedBase = msaConsensusSequence[aPosition];
- if(alignedBase.isGap()) {
- html << alignedBase;
- } else {
- html << "<span style='background-color:" << alignedBase.htmlColor() <<
- "'>" << alignedBase << "</span>";
- }
- }
- if(readRepresentation == 1) {
- html << "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;";
- for(uint64_t aPosition=0; aPosition<msaLength; aPosition++) {
- const AlignedBase alignedBase = msaConsensusSequence[aPosition];
- if(alignedBase.isGap()) {
- html << alignedBase;
- } else {
- const uint64_t r = msaConsensusRepeatCount[aPosition];
- html << "<span style='background-color:" << alignedBase.htmlColor() <<
- "'>";
- if(r < 10) {
- html << r;
- } else {
- html << "*";
- }
- html << "</span>";
- }
- }
- }
- html << " Consensus<br>\n";
-
- // Write the discordant count.
- for(uint64_t aPosition=0; aPosition<msaLength; aPosition++) {
- const uint64_t d = msaConsensusDiscordantCount[aPosition];
- const double errorRate = double(d) / double(orientedReadIds.size());
- int hue;
- if(errorRate < .01) {
- hue = 120; // Q>=20, green.
- } else {
- const double Q = -10. * log10(errorRate);
- hue = int(std::round(6. * Q)); // 60 at Q=10 (yellow), 120 at Q=20 (green).
- }
- const string color = "hsl(" + to_string(hue) + ",100%, 70%)";
- html << "<span style='background-color:" << color << "'>";
- html << repeatCountCharacter(uint32_t(d));
- html << "</span>";
- }
- html << "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;";
- for(uint64_t aPosition=0; aPosition<msaLength; aPosition++) {
- html << "&nbsp;";
- }
- html << " Discordant<br>\n";
-
- html << "</div>\n";
-
-
-
- html << "<h3>Consensus</h3>";
- html << "<div style='font-family:monospace;white-space:nowrap;'>\n";
- for(const Base b: consensusRleSequence) {
- html << b;
- }
- html << "<br>\n";
- for(const uint64_t r: consensusRepeatCounts) {
- if(r < 10) {
- html << r;
- } else {
- html << "*";
- }
- }
- html << "<br>\n";
- html << "<br>\n";
- for(uint64_t i=0; i<consensusRleSequence.size(); i++) {
- const Base b = consensusRleSequence[i];
- const uint64_t r = consensusRepeatCounts[i];
- for(uint64_t j=0; j<r; j++) {
- html << b;
- }
- }
- html << "<br>\n";
- html << "</div>\n";
- }
-}
-
-
-
-// Final assembly of segments and links sequence into the path sequence.
-void AssemblyPath::assemble()
-{
-
- rleSequence.clear();
- repeatCounts.clear();
- rawSequence.clear();
-
- // Assemble RLE sequence.
- for(uint64_t i=0; i<segments.size(); i++) {
- AssemblyPathSegment& segment = segments[i];
- const AssembledSegment& assembledSegment = segment.assembledSegment;
- segment.rlePosition = rleSequence.size();
- segment.rawPosition = rawSequence.size();
-
-
- if(segment.leftTrim + segment.rightTrim > assembledSegment.runLengthSequence.size()) {
- // The left and right trim of this segment overlap.
- // To handle this case, just take the excess number of bases out of the sequence
- // we already assembled.
- // This is not a great solution, but better than nothing.
- const uint64_t excessTrim =
- (segment.leftTrim + segment.rightTrim) - assembledSegment.runLengthSequence.size();
- SHASTA_ASSERT(excessTrim <= rleSequence.size());
- SHASTA_ASSERT(repeatCounts.size() == rleSequence.size());
-
- // Compute the excess trim in the raw sequence.
- uint64_t excessTrimRaw = 0;
- for(uint64_t i=0; i<excessTrim; i++) {
- excessTrimRaw += repeatCounts[repeatCounts.size() - 1 - i];
- }
- SHASTA_ASSERT(excessTrimRaw <= rawSequence.size());
-
- // Remove the excess trim from the sequence we already assembled.
- rleSequence.resize(rleSequence.size() - excessTrim);
- repeatCounts.resize(repeatCounts.size() - excessTrim);
- rawSequence.resize(rawSequence.size() - excessTrimRaw);
- } else {
-
- // This is the normal case.
-
- // Add the RLE sequence of this segment.
- const auto segmentTrimmedRleSequence = segment.trimmedRleSequence();
- const auto segmentTrimmedRepeatCounts = segment.trimmedRepeatCounts();
- copy(segmentTrimmedRleSequence, back_inserter(rleSequence));
- copy(segmentTrimmedRepeatCounts, back_inserter(repeatCounts));
-
- // Add the raw sequence of this segment.
- for(uint64_t i=0; i<segmentTrimmedRleSequence.size(); i++) {
- const Base b = segmentTrimmedRleSequence[i];
- const uint64_t r = segmentTrimmedRepeatCounts[i];
- for(uint64_t k=0; k<r; k++) {
- rawSequence.push_back(b);
- }
- }
- }
-
-
-
- // Add the sequence of the link following this segment.
- if(i != segments.size() - 1) {
- AssemblyPathLink& link = links[i];
- link.rlePosition = rleSequence.size();
- link.rawPosition = rawSequence.size();
-
- // Add the RLE sequence of this link.
- const auto trimmedRleSequence = link.trimmedRleSequence();
- const auto trimmedRepeatCounts = link.trimmedRepeatCounts();
- copy(trimmedRleSequence, back_inserter(rleSequence));
- copy(trimmedRepeatCounts, back_inserter(repeatCounts));
-
- // Add the raw sequence of this link.
- for(uint64_t i=0; i<trimmedRleSequence.size(); i++) {
- const Base b = trimmedRleSequence[i];
- const uint64_t r = trimmedRepeatCounts[i];
- for(uint64_t k=0; k<r; k++) {
- rawSequence.push_back(b);
- }
- }
- }
- }
- SHASTA_ASSERT(rleSequence.size() == repeatCounts.size());
-
-#if 0
- // For now, write it out.
- ofstream fasta("PathSequence.fasta");
- fasta << ">Path" << endl;
- copy(rawSequence, ostream_iterator<Base>(fasta));
- fasta << "\n";
-#endif
-}
-
-
-
-
-AssemblyPathSegment::AssemblyPathSegment(
- uint64_t id,
- bool isPrimary) :
- id(id),
- isPrimary(isPrimary)
- {}
-
-
-
-span<const Base> AssemblyPathSegment::trimmedRleSequence() const
-{
- const auto begin = assembledSegment.runLengthSequence.begin() + leftTrim;
- const auto end = assembledSegment.runLengthSequence.end() - rightTrim;
- SHASTA_ASSERT(begin <= end);
- return span<const Base>(begin, end);
-}
-
-
-
-span<const uint32_t> AssemblyPathSegment::trimmedRepeatCounts() const
-{
- const auto begin = assembledSegment.repeatCounts.begin() + leftTrim;
- const auto end = assembledSegment.repeatCounts.end() - rightTrim;
- SHASTA_ASSERT(begin <= end);
- return span<const uint32_t>(begin, end);
-}
-
-
-
-span<const Base> AssemblyPathLink::trimmedRleSequence() const
-{
- const auto begin = msaRleSequence.begin() + leftTrim;
- const auto end = msaRleSequence.end() - rightTrim;
- SHASTA_ASSERT(begin <= end);
- return span<const Base>(begin, end);
-}
-
-
-
-span<const uint32_t> AssemblyPathLink::trimmedRepeatCounts() const
-{
- const auto begin = msaRepeatCounts.begin() + leftTrim;
- const auto end = msaRepeatCounts.end() - rightTrim;
- SHASTA_ASSERT(begin <= end);
- return span<const uint32_t>(begin, end);
-}
-
-
-
-void AssemblyPathSegment::getTrimmedRawSequence(vector<Base>& trimmedRawSequence) const
-{
-
- // Get the trimed RLE sequence and repeat counts.
- const span<const Base> trimmedRleSequenceSpan = trimmedRleSequence();
- const span<const uint32_t> trimmedRepeatCountsSpan = trimmedRepeatCounts();
- SHASTA_ASSERT(trimmedRleSequenceSpan.size() == trimmedRepeatCountsSpan.size());
-
- // Construct the raw sequence.
- trimmedRawSequence.clear();
- for(uint64_t i=0; i<trimmedRleSequenceSpan.size(); i++) {
- const Base b = trimmedRleSequenceSpan[i];
- const uint32_t r = trimmedRepeatCountsSpan[i];
- for(uint64_t k=0; k<r; k++) {
- trimmedRawSequence.push_back(b);
- }
- }
-}
-
-
-
-// Return a character to represent a repeat count
-// when writing out RLE sequence.
-char AssemblyPath::repeatCountCharacter(uint32_t r) {
- if(r < 10) {
- return '0' + char(r);
- } else if(r < 36) {
- return 'A' + char(r - 10);
- } else {
- return '*';
- }
-}
-
-
-void AssemblyPath::writeHtml(ostream& html, const AssemblyGraph& assemblyGraph) const
-{
- SHASTA_ASSERT(segments.size() > 1);
- SHASTA_ASSERT(links.size() == segments.size() - 1);
-
- writeHtmlSummary(html);
- writeSequenceDialog(html);
- writeHtmlDetail(html, assemblyGraph);
-}
-
-
-
-void AssemblyPath::writeHtmlSummary(ostream& html) const
-{
- html <<
- "<table>" <<
- "<tr><th class=left>First segment id<td class=centered>" << segments.front().id <<
- "<tr><th class=left>Last segment id<td class=centered>" << segments.back().id <<
- "<tr><th class=left>Number of segments<td class=centered>" << segments.size() <<
- "<tr><th class=left>Number of links<td class=centered>" << segments.size() - 1 <<
- "<tr><th class=left>Length of RLE sequence assembled<td class=centered>" << rleSequence.size() <<
- "<tr><th class=left>Length of raw sequence assembled<td class=centered>" << rawSequence.size() <<
- "</table>";
-}
-
-
-
-void AssemblyPath::writeHtmlDetail(ostream& html, const AssemblyGraph& assemblyGraph) const
-{
- // Table legend.
- html <<
- "<p>Hover on table headers for information on the meaning of each column.";
-
-
-
- // Table header.
- html <<
- "<p>"
- "<table style='table-layout:fixed;font-family:monospace;font-size:9'>"
- "<tr>"
-
- "<th title ='S (segment) or L (link). "
- "Primary segments have a light blue background. "
- "Trivial links have a grey background.'>"
- "<span class=rotated>Type"
-
- "<th title='Segment or link id'>"
- "<span class=rotated>Id"
-
- "<th title='The number of oriented reads contributing to assembly of this segment or link. "
- "This is not the same as average coverage on marker graph vertices or edges.'>"
- "<span class=rotated>Coverage"
-
- "<th title='The id of the previous primary segment.'>"
- "<span class=rotated>Previous<br>primary segment"
-
- "<th title='The id of the next primary segment.'>"
- "<span class=rotated>Next<br>primary segment"
-
- "<th title='The fraction of oriented reads that appear on the "
- "previous primary segment and are long enough to appear on this segment, but do not.'>"
- "<span class=rotated>Unexplained fraction<br>on previous<br>primary segment"
-
- "<th title='The fraction of oriented reads that appear on the "
- "next primary segment and are long enough to appear on this segment, but do not.'>"
- "<span class=rotated>Unexplained fraction<br>on next<br>primary segment"
-
- "<th title='The position of the trimmed raw sequence of this segment or link "
- "in the raw assembled sequence of the path.'>"
- "<span class=rotated>Raw<br>position"
-
- "<th title='The complete raw sequence for this segment or link. "
- "The red portion is trimmed out and not used for assembly.'>"
- "Raw sequence"
-
- "<th title='Assembly details for non-trivial links.'>"
- "<span class=rotated>Detail";
-
-
-
- // Main body of the table.
- // There is one row for each segment and one row for each link.
- for(uint64_t position=0; position<segments.size(); position++) {
- const AssemblyPathSegment& segment = segments[position];
-
- // If not a primary segment, evaluate this segment against
- // the previous and next primary segment.
- AssemblyGraph::SegmentOrientedReadInformation info;
- AssemblyGraph::SegmentOrientedReadInformation previousInfo;
- AssemblyGraph::SegmentOrientedReadInformation nextInfo;
- SegmentPairInformation previousSegmentPairInfo;
- SegmentPairInformation nextSegmentPairInfo;
- if(not segment.isPrimary) {
- assemblyGraph.getOrientedReadsOnSegment(segment.id, info);
- assemblyGraph.getOrientedReadsOnSegment(segment.previousPrimarySegmentId, previousInfo);
- assemblyGraph.getOrientedReadsOnSegment(segment.nextPrimarySegmentId, nextInfo);
- assemblyGraph.analyzeSegmentPair(
- segment.previousPrimarySegmentId, segment.id,
- previousInfo, info,
- assemblyGraph.markers, previousSegmentPairInfo);
- assemblyGraph.analyzeSegmentPair(
- segment.nextPrimarySegmentId, segment.id,
- nextInfo, info,
- assemblyGraph.markers, nextSegmentPairInfo);
- }
-
- // Write a row for the segment at this position.
- const AssembledSegment& assembledSegment = segment.assembledSegment;
- html << "<tr";
- if(segment.isPrimary) {
- html << " style='background-color:LightCyan' title='Primary segment'";
- } else {
- html << " title='Secondary segment'";
- }
- html <<
- ">"
- "<td class=centered>S"
- "<td class=centered>" << segment.id <<
- "<td>" << assemblyGraph.coverage(segment.id);
- if(segment.isPrimary) {
- html << "<td><td><td><td>";
- } else {
- const auto oldPrecision = html.precision(2);
- const auto oldFlags = html.setf(std::ios_base::fixed, std::ios_base::floatfield);
- html <<
- "<td class=centered>" << segment.previousPrimarySegmentId <<
- "<td class=centered>" << segment.nextPrimarySegmentId <<
- "<td class=centered>" << previousSegmentPairInfo.unexplainedFraction(0) <<
- "<td class=centered>" << nextSegmentPairInfo.unexplainedFraction(0);
- html.precision(oldPrecision);
- html.flags(oldFlags);
- }
- html << "<td class=centered>" << segment.rawPosition;
-
-
-
- // Raw sequence for this segment.
- html << "<td class=centered style='max-width:500px;word-wrap:break-word'>";
- if(segment.leftTrim + segment.rightTrim > assembledSegment.runLengthSequence.size()) {
-
- // Exceptional case where the left and right trim overlap.
- html << "<span style='background-color:LightCoral'>";
- for(uint64_t i=0; i<assembledSegment.runLengthSequence.size(); i++) {
- const Base b = assembledSegment.runLengthSequence[i];
- const uint32_t r = assembledSegment.repeatCounts[i];
- if(i == assembledSegment.runLengthSequence.size() - segment.rightTrim) {
- html << "</span><span style='background-color:Fuchsia'>";
- }
- for(uint32_t k=0; k<r; k++) {
- html << b;
- }
- if(i == segment.leftTrim - 1) {
- html << "</span><span style='background-color:LightCoral'>";
- }
- }
- html << "</span><td>";
-
- } else {
-
- // Normal case.
- html << "<span style='background-color:LightCoral'>";
- for(uint64_t i=0; i<assembledSegment.runLengthSequence.size(); i++) {
- const Base b = assembledSegment.runLengthSequence[i];
- const uint32_t r = assembledSegment.repeatCounts[i];
- if(i == segment.leftTrim) {
- html << "</span>";
- }
- for(uint32_t k=0; k<r; k++) {
- html << b;
- }
- if(i == assembledSegment.runLengthSequence.size() -1 - segment.rightTrim) {
- html << "<span style='background-color:LightCoral'>";
- }
-
- }
- html << "</span><td>";
- }
-
-
-
- // Write a row for the link.
- if(position == links.size()) {
- break;
- }
- const AssemblyPathLink& link = links[position];
- html << "<tr";
- if(link.isTrivial) {
- html << " style='background-color:LightGray' title='Trivial link'";
- } else {
- html << " title='Non-trivial link'";
- }
- html <<
- "><td class=centered>L" <<
- "<td class=centered>" << link.id <<
- "<td class=centered>";
-
- if(not link.isTrivial) {
- html << link.coverage;
- }
-
- html <<
- "<td class=centered>" << link.previousPrimarySegmentId <<
- "<td class=centered>" << link.nextPrimarySegmentId <<
- "<td><td><td class=centered>";
-
- if(not link.isTrivial) {
- html << link.rawPosition;
- }
-
- // Raw sequence for this link.
- html << "<td class=centered style='max-width:300px;word-wrap:break-word'>";
- html << "<span style='background-color:LightCoral'>";
- for(uint64_t i=0; i<link.msaRleSequence.size(); i++) {
- const Base b = link.msaRleSequence[i];
- const uint32_t r = link.msaRepeatCounts[i];
- if(i == link.leftTrim) {
- html << "</span>";
- }
- for(uint32_t k=0; k<r; k++) {
- html << b;
- }
- if(i == link.msaRleSequence.size() -1 - link.rightTrim) {
- html << "<span style='background-color:LightCoral'>";
- }
- }
- html << "</span>";
-
- html << "<td class=centered>";
- if(not link.isTrivial) {
- html << "<a href='exploreMode3LinkAssembly?linkId=" << link.id <<
- "&previousPrimarySegmentId=" << link.previousPrimarySegmentId <<
- "&nextPrimarySegmentId=" << link.nextPrimarySegmentId <<
- "'>Detail</a>";
- }
- }
-
- // End the table.
- html << "</table>";
-
-}
-
-
-
-// This writes out a dialog that permit displaying
-// selected portions of the path assembled sequence.
-void AssemblyPath::writeSequenceDialog(ostream& html) const
-{
- html << "<script>var assembledSequence = '";
- copy(rawSequence, ostream_iterator<Base>(html));
- html << "';</script>";
-
- html << R"zzz(
-<form onsubmit="displaySequence(); return false;">
-<br><input type=submit value='Display assembled sequence'>
- in the position range <input type=text id=begin>
- to <input type=text id=end>
-</form>
-<script>
-function displaySequence()
-{
- var beginString = document.getElementById('begin').value;
- var endString = document.getElementById('end').value;
- var begin = parseInt(beginString);
- var end = parseInt(endString);
- if((end < begin) || (end > assembledSequence.length)) {
- document.getElementById("assembledSequence").innerText = "";
- } else {
- document.getElementById("assembledSequence").innerText = assembledSequence.substring(begin, end);
- }
-}
-</script>
-<p id=assembledSequence style='font-family:monospace;font-size:9pt;word-wrap:break-word;'>
- )zzz";
-}
diff --git a/src/mode3-AssemblyPath.hpp b/src/mode3-AssemblyPath.hpp
deleted file mode 100644
index 5e4d023..0000000
--- a/src/mode3-AssemblyPath.hpp
+++ /dev/null
@@ -1,206 +0,0 @@
-#ifndef SHASTA_MODE3_ASSEMBLY_PATH_HPP
-#define SHASTA_MODE3_ASSEMBLY_PATH_HPP
-
-// Shasta.
-#include "AssembledSegment.hpp"
-#include "invalid.hpp"
-
-// Standard library.
-#include "cstdint.hpp"
-#include "span.hpp"
-#include "utility.hpp"
-#include "vector.hpp"
-
-namespace shasta {
- namespace mode3 {
- class AssemblyPath;
- class AssemblyPathLink;
- class AssemblyPathSegment;
-
- class AssemblyGraph;
- class Transition;
- }
-
- class Base;
- class ConsensusCaller;
- class OrientedReadId;
-}
-
-
-
-// A segment in an AssemblyPath.
-class shasta::mode3::AssemblyPathSegment {
-public:
-
- // The id of this segment, in the AssemblyGraph.
- uint64_t id;
-
- // Each primary segment in the path has high Jaccard similarity
- // with the previous primary segment.
- // The first and last segment are always primary segments.
- bool isPrimary;
-
- // For secondary segments only (isPrimary is false) we store the
- // segment id of the previous and next primary segment.
- uint64_t previousPrimarySegmentId = invalid<uint64_t>;
- uint64_t nextPrimarySegmentId = invalid<uint64_t>;
-
- // The AssembledSegment contains the sequence for this segment
- // plus information on how the sequence was extracted from the
- // marker graph.
- // The sequence includes the first and last marker graph vertex
- // of this segment.
- AssembledSegment assembledSegment;
-
- // For assembly of the path sequence, we don't use the entire
- // sequence of the AssembledSegment.
- // We trim some bases at each end to avoid overlap
- // with adjacent segments and links.
- // When a segment is adjacent to a non-trivial link,
- // we give priority to link sequence over segment sequence.
- // The reason is that sequence assembled from links
- // is generally more accurate because it is assembled
- // using only a restricted set of
- // oriented reads that are believed to originate from the
- // sequence copy we are assembling.
- uint64_t leftTrim = 0;
- uint64_t rightTrim = 0;
- span<const Base> trimmedRleSequence() const;
- span<const uint32_t> trimmedRepeatCounts() const;
- void getTrimmedRawSequence(vector<Base>&) const;
-
- // The position of the trimmed sequence of this segment
- // in the assembled sequence of the path.
- uint64_t rlePosition = 0;
- uint64_t rawPosition = 0;
-
- // Constructor.
- AssemblyPathSegment(uint64_t id, bool isPrimary);
-};
-
-
-
-// A link in an AssemblyPath.
-class shasta::mode3::AssemblyPathLink {
-public:
-
- // The id of this segment, in the AssemblyGraph.
- uint64_t id;
-
- // A link is trivial if the last marker graph vertex
- // of the source segment coincides with the first marker
- // graph vertex of the target segment.
- // In this case the link does not need to be assembled
- // and all the next fields are left empty.
- bool isTrivial;
-
- // The number of oriented reads used to assemble this link.
- // This is only filled in for non-trivial links.
- uint64_t coverage = 0;
-
- // The last primary segment in the path preceding this link.
- uint64_t previousPrimarySegmentId = invalid<uint64_t>;
-
- // The next primary segment in the path following this link.
- uint64_t nextPrimarySegmentId = invalid<uint64_t>;
-
- // The RLE sequence as computed by the MSA
- // of oriented reads in the link.
- // This overlaps with adjacent segments.
- vector<Base> msaRleSequence;
- vector<uint32_t> msaRepeatCounts;
-
- // The trimmed RLE sequence, to be used for assembly, is obtained from
- // the MSA sequence by removing bases at the two ends
- // that are identical with the adjacent segments.
- uint64_t leftTrim = 0;
- uint64_t rightTrim = 0;
- span<const Base> trimmedRleSequence() const;
- span<const uint32_t> trimmedRepeatCounts() const;
-
- // The position of the trimmed sequence of this link
- // in the assembled sequence of the path.
- uint64_t rlePosition = 0;
- uint64_t rawPosition = 0;
-};
-
-
-
-// An assembly path in the mode3::AssemblyGraph
-class shasta::mode3::AssemblyPath {
-public:
-
- // The segments and links on the path.
- vector<AssemblyPathSegment> segments;
- vector<AssemblyPathLink> links;
-
- // Top level function to assemble sequence for this path.
- void assemble(const AssemblyGraph&);
-
- // Assemble the sequence of each segment.
- void assembleSegments(const AssemblyGraph&);
- void writeSegmentSequences();
-
- // Initialize the links.
- // This only resizes the links vector and fills in the following fields of each link.
- // - id
- // - isTrivial
- // - previousPrimarySegmentId
- // - nextPrimarySegmentId
- void initializeLinks(const AssemblyGraph&);
-
- // Assemble links in this assembly path.
- void assembleLinks(const AssemblyGraph&);
- void assembleLinkAtPosition(
- const AssemblyGraph& assemblyGraph,
- uint64_t position0,
- ostream& html);
- static void assembleTrivialLink(
- AssemblyPathSegment& segment0,
- AssemblyPathSegment& segment1,
- AssemblyPathLink& link,
- uint64_t k);
- static void assembleNonTrivialLink(
- const AssemblyGraph& assemblyGraph,
- AssemblyPathSegment& segment0,
- AssemblyPathSegment& segment1,
- AssemblyPathLink& link,
- ostream& html);
- void writeLinkSequences(const AssemblyGraph&);
-
- // Final assembly of segments and links sequence into the path sequence.
- void assemble();
- vector<Base> rleSequence;
- vector<uint64_t> repeatCounts;
- vector<Base> rawSequence;
-
- void clear();
-
- // Use spoa to compute consensus sequence for a link.
- static void computeLinkConsensusUsingSpoa(
- const vector<OrientedReadId> orientedReadIds,
- const vector< vector<Base> > rleSequences,
- const vector< vector<uint32_t> > repeatCounts,
- uint64_t readRepresentation,
- const ConsensusCaller&,
- bool debug,
- ostream& html,
- vector<Base>& consensusRleSequence,
- vector<uint32_t>& consensusRepeatCounts
- );
-
- // Return a character to represent a repeat count
- // when writing out RLE sequence.
- static char repeatCountCharacter(uint32_t);
-
- // Html output.
- void writeHtml(ostream&, const AssemblyGraph& assemblyGraph) const;
- void writeHtmlSummary(ostream&) const;
- void writeSequenceDialog(ostream&) const;
- void writeHtmlDetail(ostream&, const AssemblyGraph& assemblyGraph) const;
-
-};
-
-
-#endif
-
diff --git a/src/mode3-Detangler.cpp b/src/mode3-Detangler.cpp
deleted file mode 100644
index e659a14..0000000
--- a/src/mode3-Detangler.cpp
+++ /dev/null
@@ -1,415 +0,0 @@
-#include "mode3-Detangler.hpp"
-#include "Base.hpp"
-#include "deduplicate.hpp"
-#include "mode3.hpp"
-using namespace shasta;
-using namespace mode3;
-
-#include "fstream.hpp"
-
-
-
-Detangler::Detangler(const AssemblyGraph& assemblyGraph)
-{
- // ****** EXPOSE WHEN CODE STABILIZES
- const uint64_t minLinkCoverage = 6;
-
- createJourneys(assemblyGraph);
- createInitialClusters();
- cout << "The initial Detangler has " << clusters.size() << " clusters." << endl;
-
- uint64_t count = 0;
- for(auto& p: clusters) {
- for(Cluster& cluster: p.second) {
- if(simpleDetangle(&cluster, minLinkCoverage)) {
- ++count;
- }
- }
- }
- cout << "Detangled " << count << " clusters out of " << clusters.size() << endl;
-
- writeGfa("Detangler.gfa", minLinkCoverage, assemblyGraph.segmentSequences, assemblyGraph.k);
-}
-
-
-
-
-// To create the journeys, simply extract the segmentIds from the assemblyGraphJourneys.
-void Detangler::createJourneys(const AssemblyGraph& assemblyGraph)
-{
- const uint64_t journeyCount = assemblyGraph.assemblyGraphJourneys.size();
-
- journeys.clear();
- journeys.resize(journeyCount);
- for(uint64_t i=0; i<journeyCount; i++) {
- const span<const AssemblyGraphJourneyEntry> assemblyGraphJourney = assemblyGraph.assemblyGraphJourneys[i];
- Journey& journey = journeys[i];
-
- for(const AssemblyGraphJourneyEntry& assemblyGraphJourneyEntry: assemblyGraphJourney) {
- journey.push_back(Step(assemblyGraphJourneyEntry.segmentId));
- }
- }
-}
-
-
-
-// Initially, we create a Cluster for each segmentId.
-void Detangler::createInitialClusters()
-{
-
- // Loop over all oriented reads.
- const ReadId readCount = ReadId(journeys.size() / 2);
- for(ReadId readId=0; readId<readCount; readId++) {
- for(Strand strand=0; strand<2; strand++) {
- const OrientedReadId orientedReadId(readId, strand);
-
- // Get the Journey for this oriented read.
- Journey& journey = journeys[orientedReadId.getValue()];
-
- // Loop over Step(s) in this Journey.
- StepInfo stepInfo;
- stepInfo.orientedReadId = orientedReadId;
- for(uint64_t position=0; position<journey.size(); position++) {
- stepInfo.position = position;
- Step& step = journey[position];
- const uint64_t segmentId = step.segmentId;
-
- // Locate the Cluster corresponding to this segment,
- // creating it if necessary.
- ClusterContainer::iterator it = clusters.find(segmentId);
- if(it == clusters.end()) {
- tie(it, ignore) = clusters.insert(make_pair(segmentId, std::list<Cluster>()));
- it->second.push_back(Cluster(segmentId, 0));
- }
- std::list<Cluster>& segmentClusters = it->second;
-
- // Sanity check: this segmentId must correspond to exactly one Cluster.
- SHASTA_ASSERT(segmentClusters.size() == 1);
- Cluster& cluster = segmentClusters.front();
-
- // Add this Step to the Cluster.
- cluster.steps.push_back(stepInfo);
- step.cluster = &cluster;
- }
- }
- }
-}
-
-
-
-// Find the next/previous cluster for each of the steps in a given cluster.
-// The output vector has size equal to the number of steps in this cluster,
-// and the corresponding OrientedReadId(s) are the same
-// as the ones in the steps vector for the given cluster.
-// Some of the pointers returned can be zero. This can happen if this
-// cluster is the first or last cluster in the journey of an oriented read.
-void Detangler::findNextClusters(
- const Cluster* cluster0,
- vector<const Cluster*>& nextClusters
- ) const
-{
- nextClusters.clear();
-
- // Loop over the steps of this cluster.
- for(const StepInfo& stepInfo: cluster0->steps) {
- const OrientedReadId orientedReadId = stepInfo.orientedReadId;
- const uint64_t position = stepInfo.position;
-
- // Get journey for this oriented read.
- const Journey& journey = journeys[orientedReadId.getValue()];
-
- // Locate the cluster at the next position in the journey.
- // There is none if we are at the end of the journey.
- const Cluster* cluster1 = 0;
- const uint64_t nextPosition = position + 1;
- if(nextPosition < journey.size()) {
- cluster1 = journey[nextPosition].cluster;
- }
-
- // Store it in the output vector.
- nextClusters.push_back(cluster1);
- };
-
-}
-void Detangler::findPreviousClusters(
- const Cluster* cluster0,
- vector<const Cluster*>& previousClusters
- ) const
-{
- previousClusters.clear();
-
- // Loop over the steps of this cluster.
- for(const StepInfo& stepInfo: cluster0->steps) {
- const OrientedReadId orientedReadId = stepInfo.orientedReadId;
- const uint64_t position = stepInfo.position;
-
- // Get the journey for this oriented read.
- const Journey& journey = journeys[orientedReadId.getValue()];
-
- // Locate the cluster at the previous position in the journey.
- // There is none if we are at the end of the journey.
- const Cluster* cluster1 = 0;
- if(position > 0) {
- const uint64_t previousPosition = position - 1;
- cluster1 = journey[previousPosition].cluster;
- }
-
- // Store it in the output vector.
- previousClusters.push_back(cluster1);
- };
-
-}
-
-
-
-// Simple, classical detangling of a single cluster.
-bool Detangler::simpleDetangle(Cluster* cluster0, uint64_t minLinkCoverage)
-{
- // ****** EXPOSE WHEN CODE STABILIZES
- const uint64_t maxDiscordantCount = 2;
- const uint64_t minConcordantCount = 8;
-
- const bool debug = true;
-
- // Find the previous clusters for each of the steps in this cluster.
- vector<const Cluster*> previousClusters;
- findPreviousClusters(cluster0, previousClusters);
- SHASTA_ASSERT(previousClusters.size() == cluster0->steps.size());
-
-
- // Find the next clusters for each of the steps in this cluster.
- vector<const Cluster*> nextClusters;
- findNextClusters(cluster0, nextClusters);
- SHASTA_ASSERT(nextClusters.size() == cluster0->steps.size());
-
- // Count the distinct previous clusters.
- // They are stored sorted.
- vector<const Cluster*> distinctPreviousClusters = previousClusters;
- vector<uint64_t > distinctPreviousClustersCoverage;
- deduplicateAndCount(distinctPreviousClusters, distinctPreviousClustersCoverage);
- SHASTA_ASSERT(distinctPreviousClusters.size() == distinctPreviousClustersCoverage.size());
-
- // If less than two, do nothing.
- if(distinctPreviousClusters.size() < 2) {
- return false;
- }
-
- // Count the distinct previous clusters.
- // They are stored sorted.
- vector<const Cluster*> distinctNextClusters = nextClusters;
- vector<uint64_t > distinctNextClustersCoverage;
- deduplicateAndCount(distinctNextClusters, distinctNextClustersCoverage);
- SHASTA_ASSERT(distinctNextClusters.size() == distinctNextClustersCoverage.size());
-
- // If less than two, do nothing.
- if(distinctPreviousClusters.size() < 2) {
- return false;
- }
-
- // Only keep the previous clusters that have sufficient coverage and are not null.
- vector< pair<const Cluster*, uint64_t> > previousWithCoverage;
- for(uint64_t i=0; i<distinctPreviousClusters.size(); i++) {
- const Cluster* cluster1 = distinctPreviousClusters[i];
- if(cluster1) {
- const uint64_t coverage = distinctPreviousClustersCoverage[i];
- if(coverage >= minLinkCoverage) {
- previousWithCoverage.push_back(make_pair(cluster1, coverage));
- }
- }
- }
-
- // Only keep the next clusters that have sufficient coverage and are not null.
- vector< pair<const Cluster*, uint64_t> > nextWithCoverage;
- for(uint64_t i=0; i<distinctNextClusters.size(); i++) {
- const Cluster* cluster1 = distinctNextClusters[i];
- if(cluster1) {
- const uint64_t coverage = distinctNextClustersCoverage[i];
- if(coverage >= minLinkCoverage) {
- nextWithCoverage.push_back(make_pair(cluster1, coverage));
- }
- }
- }
-
- // Compute the tangle matrix.
- // tangleMatrix[i][j] contains the number of oriented reads
- // that come from the i-th previous cluster and go to the j-th previous cluster.
- vector< vector<uint64_t> > tangleMatrix(previousWithCoverage.size(), vector<uint64_t>(nextWithCoverage.size(), 0));
- for(uint64_t i=0; i<previousWithCoverage.size(); i++) {
- const Cluster* previousCluster = previousWithCoverage[i].first;
- for(uint64_t j=0; j<nextWithCoverage.size(); j++) {
- const Cluster* nextCluster = nextWithCoverage[j].first;
- for(uint64_t k=0; k<previousClusters.size(); k++) {
- if((previousClusters[k] == previousCluster) and (nextClusters[k] == nextCluster)) {
- ++tangleMatrix[i][j];
- }
- }
- }
- }
-
- // For now, only handle the 2 by 2 case.
- if(not(previousWithCoverage.size() == 2 and nextWithCoverage.size() == 2)) {
- return false;
- }
-
- // Compute the sum of diagonal and off-diagonal terms.
- const uint64_t diagonalSum = tangleMatrix[0][0] + tangleMatrix[1][1];
- const uint64_t offDiagonalSum = tangleMatrix[0][1] + tangleMatrix[1][0];
-
- // Check if the criteria for detangle are satisfied.
- const uint64_t concordantCount = max(diagonalSum, offDiagonalSum);
- const uint64_t discordantCount = min(diagonalSum, offDiagonalSum);
- if(concordantCount < minConcordantCount or discordantCount > maxDiscordantCount) {
- return false;
- }
-
- if(debug) {
- cout << "Detangling " << cluster0->stringId() << "\n";
- cout << "Previous:\n";
- for(const auto& p: previousWithCoverage) {
- cout << p.first->stringId() << " " << p.second << "\n";
- }
- cout << "Next:\n";
- for(const auto& p: nextWithCoverage) {
- cout << p.first->stringId() << " " << p.second << "\n";
- }
- cout << "Tangle matrix:\n";
- for(uint64_t i=0; i<previousWithCoverage.size(); i++) {
- const Cluster* previousCluster = previousWithCoverage[i].first;
- for(uint64_t j=0; j<nextWithCoverage.size(); j++) {
- const Cluster* nextCluster = nextWithCoverage[j].first;
- cout << previousCluster->stringId() << " ";
- cout << nextCluster->stringId() << " ";
- cout << tangleMatrix[i][j] << "\n";
- }
- }
- cout << "Diagonal " << diagonalSum << "\n";
- cout << "Off-diagonal " << offDiagonalSum << "\n";
-
- }
-
-
-
- // If getting here, we can detangle this cluster.
- // This generates two new clusters for this segment.
- const bool inPhase = diagonalSum > offDiagonalSum;
-
- // The new steps for cluster0.
- vector<StepInfo> newSteps0;
-
- // Create the two new clusters.
- const uint64_t segmentId = cluster0->segmentId;
- std::list<Cluster>& segmentClusters = clusters[segmentId];
- segmentClusters.push_back(Cluster(segmentId, segmentClusters.size()));
- Cluster& cluster1 = segmentClusters.back();
- segmentClusters.push_back(Cluster(segmentId, segmentClusters.size()));
- Cluster& cluster2 = segmentClusters.back();
-
- // Do the detangling. The steps that correspond to the dominant portion of the
- // tangle matrix are moved to the new clusters.
- for(uint64_t k=0; k<previousClusters.size(); k++) {
- const StepInfo& step = cluster0->steps[k];
- const OrientedReadId orientedReadId = step.orientedReadId;
- Journey& journey = journeys[orientedReadId.getValue()];
- const uint64_t position = step.position;
- const Cluster* previousCluster = previousClusters[k];
- const Cluster* nextCluster = nextClusters[k];
- if(inPhase) {
- if(previousCluster == previousWithCoverage[0].first and nextCluster == nextWithCoverage[0].first) {
- // Add it to the steps of cluster1.
- cluster1.steps.push_back(StepInfo(orientedReadId, position));
- journey[position].cluster = &cluster1;
- } else if(previousCluster == previousWithCoverage[1].first and nextCluster == nextWithCoverage[1].first) {
- // Add it to the steps of cluster2.
- cluster2.steps.push_back(StepInfo(orientedReadId, position));
- journey[position].cluster = &cluster2;
- } else {
- // Leave it in cluster0.
- newSteps0.push_back(StepInfo(orientedReadId, position));
- }
- } else {
- if(previousCluster == previousWithCoverage[0].first and nextCluster == nextWithCoverage[1].first) {
- // Add it to the steps of cluster1.
- cluster1.steps.push_back(StepInfo(orientedReadId, position));
- journey[position].cluster = &cluster1;
- } else if(previousCluster == previousWithCoverage[1].first and nextCluster == nextWithCoverage[0].first) {
- // Add it to the steps of cluster2.
- cluster2.steps.push_back(StepInfo(orientedReadId, position));
- journey[position].cluster = &cluster2;
- } else {
- // Leave it in cluster0.
- newSteps0.push_back(StepInfo(orientedReadId, position));
- }
- }
- }
-
-
-
-
- // Update the steps of the cluster we just detangled.
- cluster0->steps.swap(newSteps0);
-
- return true;
-}
-
-
-
-void Detangler::writeGfa(
- const string& fileName,
- uint64_t minLinkCoverage,
- const MemoryMapped::VectorOfVectors<Base, uint64_t>& segmentSequences,
- uint64_t k) const
-{
- ofstream gfa(fileName);
- writeGfa(gfa, minLinkCoverage, segmentSequences, k);
-}
-void Detangler::writeGfa(
- ostream& gfa,
- uint64_t minLinkCoverage,
- const MemoryMapped::VectorOfVectors<Base, uint64_t>& segmentSequences,
- uint64_t k) const
-{
- // Write the header line.
- gfa << "H\tVN:Z:1.0\n";
-
- // Write one segment for each cluster.
- for(const auto& p: clusters) {
- const uint64_t segmentId = p.first;
- const auto sequence = segmentSequences[segmentId];
- for(const Cluster& cluster: p.second) {
- gfa << "S\t" << cluster.stringId() << "\t";
- copy(sequence.begin()+k/2, sequence.end()-k/2, ostream_iterator<Base>(gfa));
- gfa << "\n";
- }
- }
-
- // Write the links.
- for(const auto& p: clusters) {
- for(const Cluster& cluster0: p.second) {
-
- // Find the next clusters for each of the steps in this cluster.
- vector<const Cluster*> nextClusters;
- findNextClusters(&cluster0, nextClusters);
- SHASTA_ASSERT(nextClusters.size() == cluster0.steps.size());
-
- // Count the distinct previous clusters.
- // They are stored sorted.
- vector<const Cluster*> distinctNextClusters = nextClusters;
- vector<uint64_t > distinctNextClustersCoverage;
- deduplicateAndCount(distinctNextClusters, distinctNextClustersCoverage);
- SHASTA_ASSERT(distinctNextClusters.size() == distinctNextClustersCoverage.size());
-
- for(uint64_t i=0; i<distinctNextClusters.size(); i++) {
- const Cluster* cluster1 = distinctNextClusters[i];
- if(cluster1) {
- const uint64_t coverage = distinctNextClustersCoverage[i];
- if(coverage >= minLinkCoverage) {
- gfa << "L\t" << cluster0.stringId() << "\t+\t" << cluster1->stringId() << "\t+\t*\n";
- }
- }
- }
- }
- }
-
-}
-
-
diff --git a/src/mode3-Detangler.hpp b/src/mode3-Detangler.hpp
deleted file mode 100644
index efe9477..0000000
--- a/src/mode3-Detangler.hpp
+++ /dev/null
@@ -1,153 +0,0 @@
-#ifndef SHASTA_MODE3_DETANGLER_HPP
-#define SHASTA_MODE3_DETANGLER_HPP
-
-// Shasta.
-#include "ReadId.hpp"
-
-// Standard library.
-#include <list>
-#include <map>
-#include "utility.hpp"
-#include "vector.hpp"
-
-/*******************************************************************************
-
-Class mode3::Detangler contains data structures and code used to detangle the
-mode3::AssemblyGraph.
-
-In the Detangler, each oriented read is represented by the sequence
-of AssemblyGraph segments it visits. This sequence is not necessarily a path
-in the AssemblyGraph, unless the assembly graph was created with
-minCoverage for links <=1.
-
-This sequence is called a Journey. In the AssemblyGraph,
-it is represented as a sequence of AssemblyGraphJourneyEntry objects
-and is stored in AssemblyGraph::assemblyGraphJourneys.
-
-In Detangler code, the journey is represented as a sequence of Step objects.
-Step(s) are grouped into Cluster(s). All Step(s) in a Cluster refer to
-the same segmentId, but there can be more than one Cluster for each segmentId.
-At the beginning, there is exactly one Cluster for each segmentId,
-but during the detangling process Cluster(s) can be split.
-
-Each Step stores the segmentId it refers to, and an iterator pointing to
-the Cluster the Step currently belongs to. The segmentId for a Step never
-changes, but the Cluster it points to can change during the detangling process.
-
-*******************************************************************************/
-
-namespace shasta {
- class Base;
- namespace mode3 {
- class Detangler;
- class AssemblyGraph;
- }
- namespace MemoryMapped {
- template<class T, class Int> class VectorOfVectors;
- }
-}
-
-
-
-class shasta::mode3::Detangler {
-public:
-
- // See the comments at the top of this file for the meanings
- // of Step, Journey, Cluster.
-
- class Cluster;
-
- class Step {
- public:
- const uint64_t segmentId;
- const Cluster* cluster = 0;
-
- Step(uint64_t segmentId) :
- segmentId(segmentId) {}
- };
-
- using Journey = vector<Step>;
-
- // The journey of each oriented read.
- // Obtained from the AssemblyGraph::assemblyGraphJourneys.
- // Indexed by OrientedReadId::getValue().
- vector<Journey> journeys;
-
- // Type used to identify a step in a journey.
- class StepInfo {
- public:
- OrientedReadId orientedReadId;
-
- // The position of this entry in the journey of this oriented read.
- uint64_t position;
-
- StepInfo() {}
- StepInfo(OrientedReadId orientedReadId, uint64_t position) :
- orientedReadId(orientedReadId),
- position(position) {}
- };
-
- // A cluster is a set of Step(s) all corresponding to the same
- // segment id.
- class Cluster {
- public:
- uint64_t segmentId;
- uint64_t id = 0; // Within that segmentId.
- vector<StepInfo> steps; // Sorted by orientedReadId.
- Cluster(uint64_t segmentId, uint64_t id) :
- segmentId(segmentId), id(id) {}
- string stringId() const
- {
- return to_string(segmentId) + "." + to_string(id);
- }
- };
-
- // Store the clusters keyed by segmentId.
- // Clusters are never removed.
- // However, during detangling, the steps of a cluster
- // can be moved to other clusters for the same segmentId.
- // We use a list so pointers to Cluster(s) are not invalidated
- // when elements are added.
- using ClusterContainer = std::map<uint64_t, std::list<Cluster> >;
- ClusterContainer clusters;
-
-
- Detangler(const AssemblyGraph&);
-private:
- void createJourneys(const AssemblyGraph&);
- void createInitialClusters();
-
- // Find the next/previous cluster for each of the steps in a given cluster.
- // The output vector has size equal to the number of steps in this cluster,
- // and the corresponding OrientedReadId(s) are the same
- // as the ones in the steps vector for the given cluster.
- // Some of the pointers returned can be zero. This can happen if this
- // cluster is the first or last cluster in the journey of an oriented read.
- void findNextClusters(
- const Cluster*,
- vector<const Cluster*>&
- ) const;
- void findPreviousClusters(
- const Cluster*,
- vector<const Cluster*>&
- ) const;
-
- // Simple, classical detangling of a single cluster.
- bool simpleDetangle(Cluster*, uint64_t minLinkCoverage);
-
- void writeGfa(
- const string& fileName,
- uint64_t minLinkCoverage,
- const MemoryMapped::VectorOfVectors<Base, uint64_t>& segmentSequences,
- uint64_t k) const;
- void writeGfa(
- ostream&,
- uint64_t minLinkCoverage,
- const MemoryMapped::VectorOfVectors<Base, uint64_t>& segmentSequences,
- uint64_t k) const;
-};
-
-
-
-#endif
-
diff --git a/src/mode3-JaccardGraph.cpp b/src/mode3-JaccardGraph.cpp
deleted file mode 100644
index f4e3db4..0000000
--- a/src/mode3-JaccardGraph.cpp
+++ /dev/null
@@ -1,957 +0,0 @@
-#include "mode3-JaccardGraph.hpp"
-#include "deduplicate.hpp"
-#include "mode3.hpp"
-#include "orderPairs.hpp"
-#include "orderVectors.hpp"
-#include "timestamp.hpp"
-using namespace shasta;
-using namespace mode3;
-
-// Boost libraries.
-#include <boost/pending/disjoint_sets.hpp>
-#include <boost/graph/topological_sort.hpp>
-
-// Standard library.
-#include "fstream.hpp"
-
-
-
-// Create a JaccardGraph with the given number of vertices
-// (one for each segment) and no edges.
-JaccardGraph::JaccardGraph(uint64_t segmentCount)
-{
- for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) {
- vertexTable.push_back(add_vertex(JaccardGraphVertex(segmentId), *this));
- }
-}
-
-
-void AssemblyGraph::createJaccardGraph(
- size_t threadCount
- )
-{
- // EXPOSE WHEN CODE STABILIZES.
- const uint64_t minComponentSize = 10; // Likely needs to be decreased. Keep high for debugging.
-
- cout << timestamp << "createJaccardGraph begins." << endl;
-
- // Create the JaccardGraph and its vertices.
- const uint64_t segmentCount = markerGraphPaths.size();
- cout << "The total number of segments in the assembly graph is " << segmentCount << endl;
- jaccardGraphPointer = make_shared<JaccardGraph>(segmentCount);
- JaccardGraph& jaccardGraph = *jaccardGraphPointer;
-
- // Compute edges, in parallel.
- jaccardGraph.threadEdges.resize(threadCount);
- const uint64_t batchSize = 100;
- setupLoadBalancing(segmentCount, batchSize);
- runThreads(&AssemblyGraph::createJaccardGraphThreadFunction, threadCount);
- jaccardGraph.storeEdges();
- jaccardGraph.writeGraphviz("JaccardGraph0.dot", false, false);
- jaccardGraph.writeGraphviz("JaccardGraph0-Labeled.dot", false, true);
- jaccardGraph.writeEdgesCsv("JaccardGraph0Edges.csv");
- cout << "The initial Jaccard graph has " << num_vertices(jaccardGraph) <<
- " vertices (segments) and " << num_edges(jaccardGraph) << " edges." << endl;
-
- // Clear all weak vertices.
- jaccardGraph.clearWeakVertices();
- cout << "After clearing weak vertices, the Jaccard graph has " << num_vertices(jaccardGraph) <<
- " vertices (segments) and " << num_edges(jaccardGraph) << " edges." << endl;
- jaccardGraph.writeGraphviz("JaccardGraph1.dot", false, false);
- jaccardGraph.writeGraphviz("JaccardGraph1-Labeled.dot", false, true);
- jaccardGraph.writeEdgesCsv("JaccardGraph1Edges.csv");
-
- // Compute all connected components of size at least minComponentSize.
- jaccardGraph.computeConnectedComponents(minComponentSize);
-
- // Store the cluster id of each segment.
- // Each connected component of the Jaccard graph with sufficient size
- // generates a cluster.
- createNew(clusterIds, "Mode3-ClusterIds");
- jaccardGraph.findClusters(clusterIds);
-
- // Compute assembly paths.
- jaccardGraph.computeAssemblyPaths();
-
- // Create the ExpandedJaccardGraph.
- ExpandedJaccardGraph expandedJaccardGraph(jaccardGraph);
- expandedJaccardGraph.writeGraphviz("ExpandedJaccardGraph0.dot");
- expandedJaccardGraph.merge();
- expandedJaccardGraph.writeGraphviz("ExpandedJaccardGraph1.dot");
-
- cout << timestamp << "createJaccardGraph ends." << endl;
-}
-
-
-
-void AssemblyGraph::createJaccardGraphThreadFunction(size_t threadId)
-{
- // Loop over all batches assigned to this thread.
- uint64_t begin, end;
- while(getNextBatch(begin, end)) {
-
- // Loop over all segments assigned to this batch.
- for(uint64_t segmentId=begin; segmentId!=end; ++segmentId) {
- createJaccardGraphEdges(segmentId, jaccardGraphPointer->threadEdges[threadId]);
- }
- }
-}
-
-
-
-void AssemblyGraph::createJaccardGraphEdges(
- uint64_t segmentId,
- vector<JaccardGraphEdgeInfo>& edges)
-{
- for(uint64_t direction=0; direction<2; direction++) {
- createJaccardGraphEdges(segmentId, direction, edges);
- }
-}
-
-
-
-// This follows an algorithm similar to the one used by createAssemblyPath3.
-void AssemblyGraph::createJaccardGraphEdges(
- uint64_t primarySegmentId,
- uint64_t direction,
- vector<JaccardGraphEdgeInfo>& edges)
-{
- // EXPOSE WHEN CODE STABILIZES.
- // FOR NOW THESE SHOULD BE THE SAME AS IN AssemblyGraph::createAssemblyPath3.
- const uint64_t minCommonForLink = 3;
- const uint64_t minCommonForPrimary = 3;
- const double minJaccard = 0.75;
- const int32_t minLinkSeparation = -20;
-
- // We start from primarySegmentId
- // and move in the specified direction until we find segmentId1 with
- // sufficiently high Jaccard similarity and number of
- // common oriented reads with primarySegmentId.
- // At each step, we choose the link that has the most common oriented
- // reads with the primarySegmentId.
- SegmentOrientedReadInformation infoPrimary;
- getOrientedReadsOnSegment(primarySegmentId, infoPrimary);
- JaccardGraphEdgeInfo edge;
- edge.direction = direction;
- uint64_t segmentId0 = primarySegmentId;
- std::set<uint64_t> previousSegments;
- while(true) {
-
- // Loop over outgoing or incoming links of segmentId0.
- // Find the link with the most common reads with the primarySegmentId.
- const auto linkIds = (direction == 0) ? linksBySource[segmentId0] : linksByTarget[segmentId0];
- if(linkIds.empty()) {
- return;
- }
- uint64_t linkIdBest = invalid<uint64_t>;
- uint64_t commonOrientedReadCountBest = 0;
- for(const uint64_t linkId: linkIds) {
-
- // If link separation is too negative, skip it.
- // The goal here is to avoid cycles in paths.
- const Link& link = links[linkId];
- if(link.separation < minLinkSeparation) {
- continue;
- }
-
- // Count the number of common oriented reads between the reference segment and this link.
- uint64_t commonOrientedReadCount;
- analyzeSegmentLinkPair(primarySegmentId, linkId, commonOrientedReadCount);
-
- // If better than the one we have it, record it.
- if(commonOrientedReadCount > commonOrientedReadCountBest) {
- linkIdBest = linkId;
- commonOrientedReadCountBest = commonOrientedReadCount;
- }
- }
- if(commonOrientedReadCountBest < minCommonForLink) {
- return;
- }
- const uint64_t linkId = linkIdBest;
-
- // Get the segment at the other side of this link.
- const Link& link = links[linkId];
- const uint64_t segmentId1 = (direction==0) ? link.segmentId1 : link.segmentId0;
-
- // Check that we haven't been here before.
- if(previousSegments.contains(segmentId1)) {
- break;
- }
- previousSegments.insert(segmentId1);
-
- // Check segmentId1 against the primary segment.
- SegmentOrientedReadInformation info1;
- getOrientedReadsOnSegment(segmentId1, info1);
- if(direction == 0) {
- analyzeSegmentPair(
- primarySegmentId, segmentId1,
- infoPrimary, info1,
- markers, edge.segmentPairInformation);
- } else {
- analyzeSegmentPair(
- segmentId1, primarySegmentId,
- info1, infoPrimary,
- markers, edge.segmentPairInformation);
- }
-
- // If the Jaccard similarity is high, we found the Jaccard graph edge
- // we were looking for.
- if( edge.segmentPairInformation.commonCount >= minCommonForPrimary and
- edge.segmentPairInformation.jaccard() >= minJaccard) {
- if(direction == 0) {
- edge.segmentId0 = primarySegmentId;
- edge.segmentId1 = segmentId1;
- } else {
- edge.segmentId0 = segmentId1;
- edge.segmentId1 = primarySegmentId;
- reverse(edge.segmentIds.begin(), edge.segmentIds.end());
- }
- edges.push_back(edge);
- return;
- }
-
- edge.segmentIds.push_back(segmentId1);
- segmentId0 = segmentId1;
- }
-}
-
-
-
-// This storesin the Jaccard graph the edges found by all threads.
-void JaccardGraph::storeEdges()
-{
- JaccardGraph& jaccardGraph = *this;
-
- for(const auto& threadEdges: threadEdges) {
- for(const JaccardGraphEdgeInfo& info: threadEdges) {
-
- const uint64_t segmentId0 = info.segmentId0;
- const uint64_t segmentId1 = info.segmentId1;
- const JaccardGraph::vertex_descriptor v0 = vertexTable[segmentId0];
- const JaccardGraph::vertex_descriptor v1 = vertexTable[segmentId1];
-
- edge_descriptor e;
- bool edgeExists = false;
- tie(e, edgeExists) = boost::edge(v0, v1, jaccardGraph);
- if(not edgeExists) {
- boost::add_edge(v0, v1,
- JaccardGraphEdge(info.segmentPairInformation, info.direction, info.segmentIds),
- jaccardGraph);
- } else {
- jaccardGraph[e].wasFoundInDirection[info.direction] = true;
- }
- }
- }
- threadEdges.clear();
-}
-
-
-
-// A strong vertex is one that is incident to at least one strong edge.
-bool JaccardGraph::isStrongVertex(vertex_descriptor v) const
-{
- const JaccardGraph& jaccardGraph = *this;
-
- // Check the out-edges.
- BGL_FORALL_OUTEDGES(v, e, jaccardGraph, JaccardGraph) {
- if(jaccardGraph[e].isStrong()) {
- return true;
- }
- }
-
- // Check the in-edges.
- BGL_FORALL_INEDGES(v, e, jaccardGraph, JaccardGraph) {
- if(jaccardGraph[e].isStrong()) {
- return true;
- }
- }
-
- // We did not find any strong edges.
- return false;
-}
-
-
-
-
-// Remove all weak vertices.
-void JaccardGraph::removeWeakVertices()
-{
- JaccardGraph& jaccardGraph = *this;
-
- // Find the vertices we are going to remove.
- vector<vertex_descriptor> verticesToBeRemoved;
- BGL_FORALL_VERTICES(v, jaccardGraph, JaccardGraph) {
- if(not isStrongVertex(v)) {
- verticesToBeRemoved.push_back(v);
- }
- }
-
- // Remove the vertices we flagged.
- for(const vertex_descriptor v: verticesToBeRemoved) {
- removeVertex(v);
- }
-
-}
-
-
-
-// Remove all edges to/from weak vertices.
-void JaccardGraph::clearWeakVertices()
-{
- JaccardGraph& jaccardGraph = *this;
-
- vector<vertex_descriptor> verticesToBeCleared;
- BGL_FORALL_VERTICES(v, jaccardGraph, JaccardGraph) {
- if(not isStrongVertex(v)) {
- verticesToBeCleared.push_back(v);
- }
- }
-
- for(const vertex_descriptor v: verticesToBeCleared) {
- clear_vertex(v, jaccardGraph);
- }
-
-}
-
-
-
-// Remove a vertex, making sure to update the vertexTable.
-void JaccardGraph::removeVertex(vertex_descriptor v)
-{
- JaccardGraph& jaccardGraph = *this;
- const uint64_t segmentId = jaccardGraph[v].segmentId;
- vertexTable[segmentId] = null_vertex();
- clear_vertex(v, jaccardGraph);
- remove_vertex(v, jaccardGraph);
-}
-
-
-
-void JaccardGraph::writeGraphviz(
- const string& fileName,
- bool includeIsolatedVertices,
- bool writeLabels) const
-{
- ofstream file(fileName);
- writeGraphviz(file, includeIsolatedVertices, writeLabels);
-}
-
-
-
-void JaccardGraph::writeGraphviz(
- ostream& graphOut,
- bool includeIsolatedVertices,
- bool writeLabels) const
-{
- const JaccardGraph& jaccardGraph = *this;
-
- graphOut << "digraph JaccardGraph {" << endl;
-
- BGL_FORALL_VERTICES(v, jaccardGraph, JaccardGraph) {
- if( includeIsolatedVertices or
- in_degree(v, jaccardGraph) or
- out_degree(v, jaccardGraph)) {
- graphOut << jaccardGraph[v].segmentId;
- if(writeLabels) {
- graphOut << " [label=" << jaccardGraph[v].segmentId << "]";
- }
- graphOut << ";\n";
- }
- }
-
- BGL_FORALL_EDGES(e, jaccardGraph, JaccardGraph) {
- const JaccardGraphEdge& edge = jaccardGraph[e];
- const JaccardGraph::vertex_descriptor v0 = source(e, jaccardGraph);
- const JaccardGraph::vertex_descriptor v1 = target(e, jaccardGraph);
- const uint64_t segmentId0 = jaccardGraph[v0].segmentId;
- const uint64_t segmentId1 = jaccardGraph[v1].segmentId;
-
- graphOut << segmentId0 << "->" << segmentId1 << "[";
-
- // Color the edge based on the direction flags.
- if(edge.wasFoundInDirection[0]) {
- if(edge.wasFoundInDirection[1]) {
- // Found in both directions.
- graphOut << " color=black";
- } else {
- // Only found in the forward direction.
- graphOut << " color=red";
- }
- } else {
- if(edge.wasFoundInDirection[1]) {
- // Only found in the backward direction.
- graphOut << " color=green";
- } else {
- SHASTA_ASSERT(0);
- }
- }
-
- if(writeLabels) {
- graphOut << " label=\"";
- for(const uint64_t segmentId: edge.segmentIds) {
- graphOut << segmentId << "\\n";
- }
- graphOut << "\"";
- }
- graphOut << "];\n";
- }
-
- graphOut << "}" << endl;
-
-}
-
-
-
-// Write edges in csv format.
-void JaccardGraph::writeEdgesCsv(const string& fileName) const
-{
- ofstream file(fileName);
- writeEdgesCsv(file);
-}
-void JaccardGraph::writeEdgesCsv(ostream& csv) const
-{
- const JaccardGraph& jaccardGraph = *this;
-
- csv << "SegmentId0,SegmentId1,FoundForward,FoundBackward,SegmentId\n";
- BGL_FORALL_EDGES(e, jaccardGraph, JaccardGraph) {
- const JaccardGraphEdge& edge = jaccardGraph[e];
- const JaccardGraph::vertex_descriptor v0 = source(e, jaccardGraph);
- const JaccardGraph::vertex_descriptor v1 = target(e, jaccardGraph);
- const uint64_t segmentId0 = jaccardGraph[v0].segmentId;
- const uint64_t segmentId1 = jaccardGraph[v1].segmentId;
-
- for(const uint64_t segmentId: edge.segmentIds) {
- csv << segmentId0 << ",";
- csv << segmentId1 << ",";
- csv << int(edge.wasFoundInDirection[0]) << ",";
- csv << int(edge.wasFoundInDirection[1]) << ",";
- csv << segmentId << "\n";
- }
- }
-}
-
-
-
-// Compute all connected components of size at least minComponentSize.
-// They are stored in order of decreasing size.
-void JaccardGraph::computeConnectedComponents(uint64_t minComponentSize)
-{
- const JaccardGraph& jaccardGraph = *this;
-
- // This must be called without removing any vertices.
- const uint64_t segmentCount = num_vertices(jaccardGraph);
-
- // Compute connected components.
- vector<uint64_t> rank(segmentCount);
- vector<uint64_t> parent(segmentCount);
- boost::disjoint_sets<uint64_t*, uint64_t*> disjointSets(&rank[0], &parent[0]);
- for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) {
- disjointSets.make_set(segmentId);
- }
- BGL_FORALL_EDGES(e, jaccardGraph, JaccardGraph) {
- const JaccardGraph::vertex_descriptor v0 = source(e, jaccardGraph);
- const JaccardGraph::vertex_descriptor v1 = target(e, jaccardGraph);
- const uint64_t segmentId0 = jaccardGraph[v0].segmentId;
- const uint64_t segmentId1 = jaccardGraph[v1].segmentId;
- disjointSets.union_set(segmentId0, segmentId1);
- }
-
- // Gather the segments in each connected component.
- vector< vector<uint64_t> > allComponents(segmentCount);
- for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) {
- const uint64_t componentId = disjointSets.find_set(segmentId);
- allComponents[componentId].push_back(segmentId);
- }
-
- // Create a table of the components of size at least minComponentSize,
- // sorted by decreasing size.
- vector< pair<uint64_t, uint64_t> > componentTable; // pair(componentId, componentSize)
- for(uint64_t componentId=0; componentId<segmentCount; componentId++) {
- const uint64_t componentSize = allComponents[componentId].size();
- if(componentSize >= minComponentSize) {
- componentTable.push_back(make_pair(componentId, componentSize));
- }
- }
- sort(componentTable.begin(), componentTable.end(),
- OrderPairsBySecondOnlyGreater<uint64_t, uint64_t>());
-
- // Store the connected components of size at least minComponentSize.
- components.clear();
- for(uint64_t newComponentId=0; newComponentId<componentTable.size(); newComponentId++) {
- const auto& p = componentTable[newComponentId];
- const uint64_t oldComponentId = p.first;
- const uint64_t componentSize = p.second;
- const vector<uint64_t>& component = allComponents[oldComponentId];
- SHASTA_ASSERT(component.size() == componentSize);
- components.push_back(component);
- }
-
-
- // Write a histogram of component sizes.
- vector<uint64_t> histogram;
- for(const auto& p: componentTable) {
- const uint64_t componentSize = p.second;
- if(componentSize >= histogram.size()) {
- histogram.resize(componentSize + 1, 0);
- }
- ++histogram[componentSize];
- }
- ofstream csv("JaccardGraphComponentSizeHistogram.csv");
- csv << "Size,Frequency,Vertices,\n";
- for(uint64_t componentSize=1; componentSize<histogram.size(); componentSize++) {
- const uint64_t frequency = histogram[componentSize];
- if(frequency > 0) {
- csv << componentSize << ",";
- csv << frequency << ",";
- csv << frequency * componentSize << ",";
- csv << "\n";
- }
- }
-
-}
-
-
-
-// Compute connected component and store the component
-// (define as a cluster) that each segment belongs to.
-void JaccardGraph::findClusters(
- MemoryMapped::Vector<uint64_t>& clusterIds)
-{
- const JaccardGraph& jaccardGraph = *this;
-
- // This must be called without removing any vertices.
- const uint64_t segmentCount = num_vertices(jaccardGraph);
-
- clusterIds.resize(segmentCount);
- fill(clusterIds.begin(), clusterIds.end(), invalid<uint64_t>);
- for(uint64_t componentId=0; componentId<components.size(); componentId++) {
- const vector<uint64_t>& component = components[componentId];
- for(const uint64_t segmentId: component) {
- clusterIds[segmentId] = componentId;
- }
- }
-
-}
-
-
-
-// Construction of the ExpandedJaccardGraph.
-// Each vertex of the JaccardGraph generates a vertex in the ExpandedJaccardGraph.
-// Each edge of the JaccardGraph generates a linear chain of vertices
-// in the ExpandedJaccardGraph.
-ExpandedJaccardGraph::ExpandedJaccardGraph(const JaccardGraph& jaccardGraph)
-{
- using Graph = ExpandedJaccardGraph;
- Graph& graph = *this;
-
- // Generate the vertices.
- std::map<JaccardGraph::vertex_descriptor, Graph::vertex_descriptor> vertexMap;
- BGL_FORALL_VERTICES(v, jaccardGraph, JaccardGraph) {
- const Graph::vertex_descriptor u = add_vertex(
- ExpandedJaccardGraphVertex(jaccardGraph[v].segmentId, true), graph);
- vertexMap.insert(make_pair(v, u));
- }
-
-
-
- // Each edge of the JaccardGraph generates a linear chain of vertices
- // in the ExpandedJaccardGraph.
- BGL_FORALL_EDGES(e, jaccardGraph, JaccardGraph) {
- const JaccardGraph::vertex_descriptor v0 = source(e, jaccardGraph);
- const JaccardGraph::vertex_descriptor v1 = target(e, jaccardGraph);
- const Graph::vertex_descriptor u0 = vertexMap[v0];
- const Graph::vertex_descriptor u1 = vertexMap[v1];
- const vector<uint64_t>& segmentIds = jaccardGraph[e].segmentIds;
-
- Graph::vertex_descriptor u = u0;
- for(const uint64_t segmentId: segmentIds) {
- const Graph::vertex_descriptor w = add_vertex(
- ExpandedJaccardGraphVertex(segmentId, false), graph);
- add_edge(u, w, graph);
- u = w;
- }
- add_edge(u, u1, graph);
- }
-}
-
-
-
-void ExpandedJaccardGraph::writeGraphviz(const string& fileName) const
-{
- ofstream s(fileName);
- writeGraphviz(s);
-}
-void ExpandedJaccardGraph::writeGraphviz(ostream& s) const
-{
- using Graph = ExpandedJaccardGraph;
- const Graph& graph = *this;
-
- const bool debug = false;
-
- s << "digraph ExpandedJaccardGraph {" << endl;
-
- // We can't use the segment ids to identify vertices
- // because each segment id can appear multiple times.
- BGL_FORALL_VERTICES(v, graph, Graph) {
- const ExpandedJaccardGraphVertex& vertex = graph[v];
- const double primaryFraction = vertex.primaryFraction();
- s << "\"" << v << "\" [label=\"" << vertex.segmentId;
- if(debug) {
- s << "\\n" << v;
- }
- s << "\\n" << vertex.primaryCount << "/" << vertex.totalCount << "\"";
- const double H = primaryFraction / 3.;
- const double S = 0.5;
- const double V = 1.;
- s << " style=filled fillcolor=\"" << H << " " << " " << S << " "<< V << "\"";
- s << "];\n";
- }
-
- BGL_FORALL_EDGES(e, graph, Graph) {
- const Graph::vertex_descriptor v0 = source(e, graph);
- const Graph::vertex_descriptor v1 = target(e, graph);
-
- s << "\"" << v0 << "\"->\"" << v1 << "\";\n";
- }
-
- s << "}" << endl;
-
-}
-
-
-
-// Recursively merge pairs of vertices that have a common parent or child
-// and that refer to the same segmentId.
-void ExpandedJaccardGraph::merge()
-{
- using Graph = ExpandedJaccardGraph;
- Graph& graph = *this;
-
- const bool debug = false;
- if(debug) {
- cout << "ExpandedJaccardGraph::merge begins." << endl;
- }
-
- std::set<Branch> branches;
- BGL_FORALL_VERTICES(v, graph, Graph) {
- if(out_degree(v, graph) > 1) {
- branches.insert(make_pair(v, 0));
- }
- if(in_degree(v, graph) > 1) {
- branches.insert(make_pair(v, 1));
- }
- }
-
-
-
- // Recursive merge.
- vector<vertex_descriptor> neighbors;
- while(not branches.empty()) {
- const auto it = branches.begin();
- const vertex_descriptor v0 = it->first;
- const uint64_t direction = it->second;
- branches.erase(it);
-
- if(debug) {
- cout << "Working on branch " << v0 << " " << direction << endl;
- }
-
- // Gather the children or parents.
- neighbors.clear();
- if(direction == 0) {
- BGL_FORALL_OUTEDGES(v0, e, graph, Graph) {
- neighbors.push_back(target(e, graph));
- }
- } else if(direction == 1) {
- BGL_FORALL_INEDGES(v0, e, graph, Graph) {
- neighbors.push_back(source(e, graph));
- }
-
- } else {
- SHASTA_ASSERT(0);
- }
- if(debug) {
- cout << neighbors.size() << " neighbors:";
- for(const vertex_descriptor v: neighbors) {
- cout << " " << v;
- }
- cout << endl;
- }
- SHASTA_ASSERT(neighbors.size() > 1);
-
- // Find a pair of neighbors with the same segmentId.
- vertex_descriptor v1, v2;
- bool found = false;
- for(uint64_t i1=0; i1<neighbors.size()-1; i1++) {
- v1 = neighbors[i1];
- for(uint64_t i2=i1+1; i2<neighbors.size(); i2++) {
- v2 = neighbors[i2];
- if(graph[v1].segmentId == graph[v2].segmentId) {
- found = true;
- break;
- }
- }
- if(found) {
- break;
- }
- }
-
- // If we did not find a pair of neighbors with the same segmentId,
- // there is nothing to do. We already removed this branch, so we
- // are done.
- if(not found) {
- if(debug) {
- cout << "No pair can be merged for this branch." << endl;
- }
- continue;
- }
- if(debug) {
- cout << "Merging " << v1 << " " << v2 << endl;
- }
-
- // Merge v1 and v2, and update the branches.
- merge(v1, v2, branches, debug);
-
- }
-
- if(debug) {
- cout << "ExpandedJaccardGraph::merge ends." << endl;
- }
-}
-
-
-
-// Merge v1 and v2 while updating the set of branches.
-void ExpandedJaccardGraph::merge(
- vertex_descriptor v1,
- vertex_descriptor v2,
- std::set<Branch>& branches,
- bool debug)
-{
- using Graph = ExpandedJaccardGraph;
- Graph& graph = *this;
-
- const ExpandedJaccardGraphVertex& vertex1 = graph[v1];
- const ExpandedJaccardGraphVertex& vertex2 = graph[v2];
-
- // Check the segmentId.
- const uint64_t segmentId = vertex1.segmentId;
- SHASTA_ASSERT(segmentId == vertex2.segmentId);
-
- // Find the children of v1 and v2.
- // These will be the children of the merged vertex v3.
- vector<vertex_descriptor> children;
- BGL_FORALL_OUTEDGES(v1, e, graph, Graph) {
- children.push_back(target(e, graph));
- }
- BGL_FORALL_OUTEDGES(v2, e, graph, Graph) {
- children.push_back(target(e, graph));
- }
- deduplicate(children);
-
- // Find the parents of v1 and v2.
- // These will be the parents of the merged vertex v3.
- vector<vertex_descriptor> parents;
- BGL_FORALL_INEDGES(v1, e, graph, Graph) {
- parents.push_back(source(e, graph));
- }
- BGL_FORALL_INEDGES(v2, e, graph, Graph) {
- parents.push_back(source(e, graph));
- }
- deduplicate(parents);
-
- if(debug) {
- cout << "Merging " << v1 << " " << v2 << endl;
- cout << "Children:";
- for(const vertex_descriptor v: children) {
- cout << " " << v;
- }
- cout << endl;
- cout << "Parents:";
- for(const vertex_descriptor v: parents) {
- cout << " " << v;
- }
- cout << endl;
- }
-
- // Remove the branches that will be affected by the merge.
- // We will add branches back as necessary.
- for(const vertex_descriptor v: children) {
- branches.erase(make_pair(v, 1));
- }
- for(const vertex_descriptor v: parents) {
- branches.erase(make_pair(v, 0));
- }
- branches.erase(make_pair(v1, 0));
- branches.erase(make_pair(v1, 1));
- branches.erase(make_pair(v2, 0));
- branches.erase(make_pair(v2, 1));
-
- // Create the merged vertex.
- ExpandedJaccardGraphVertex vertex3;
- vertex3.segmentId = segmentId;
- vertex3.totalCount = vertex1.totalCount + vertex2.totalCount;
- vertex3.primaryCount = vertex1.primaryCount + vertex2.primaryCount;
- const vertex_descriptor v3 = add_vertex(vertex3, graph);
- if(debug) {
- cout << "Created merged vertex " << v3 << endl;
- }
-
- // Remove the vertices that were merged, v1 and v2.
- clear_vertex(v1, graph);
- clear_vertex(v2, graph);
- remove_vertex(v1, graph);
- remove_vertex(v2, graph);
- if(debug) {
- cout << "Removed the merged vertices " << v1 << " " << v2 << endl;
- }
-
- // Add the edges to/from the merged vertex.
- for(const vertex_descriptor v: children) {
- add_edge(v3, v, graph);
- if(debug) {
- cout << "Added edge " << v3 << " " << v << endl;
- }
- }
- for(const vertex_descriptor v: parents) {
- add_edge(v, v3, graph);
- if(debug) {
- cout << "Added edge " << v << " " << v3 << endl;
- }
- }
-
- // Add back any necessary branches.
- if(out_degree(v3, graph) > 1) {
- branches.insert(make_pair(v3, 0));
- if(debug) {
- cout << "Added branch " << v3 << " " << 0 << endl;
- }
- }
- if(in_degree(v3, graph) > 1) {
- branches.insert(make_pair(v3, 1));
- if(debug) {
- cout << "Added branch " << v3 << " " << 1 << endl;
- }
- }
- for(const vertex_descriptor v: children) {
- if(in_degree(v, graph) > 1) {
- branches.insert(make_pair(v, 1));
- if(debug) {
- cout << "Added branch " << v << " " << 1 << endl;
- }
- }
- }
- for(const vertex_descriptor v: parents) {
- if(out_degree(v, graph) > 1) {
- branches.insert(make_pair(v, 0));
- if(debug) {
- cout << "Added branch " << v << " " << 0 << endl;
- }
- }
- }
-}
-
-
-
-// Compute assembly paths.
-void JaccardGraph::computeAssemblyPaths()
-{
- assemblyPaths.clear();
- for(uint64_t componentId=0; componentId<components.size(); componentId++) {
- computeAssemblyPaths(componentId);
- }
-}
-void JaccardGraph::computeAssemblyPaths(uint64_t componentId)
-{
- const JaccardGraph& jaccardGraph = *this;
-
- const bool debug = true;
- const vector<uint64_t>& component = components[componentId];
- if(debug) {
- cout << "Computing assembly paths for component " << componentId <<
- " of size " << component.size() << endl;
- }
-
- // Create a Graph to represent just this component.
- // Each vertex of the Graph stores the corresponding
- // vertex descriptor in the JaccardGraph.
- using Graph = boost::adjacency_list<
- boost::listS, boost::vecS, boost::bidirectionalS,
- JaccardGraph::vertex_descriptor>;
- Graph graph;
- std::map<JaccardGraph::vertex_descriptor, Graph::vertex_descriptor> vertexMap;
- for(uint64_t segmentId: component) {
- const JaccardGraph::vertex_descriptor jv = vertexTable[segmentId];
- const Graph::vertex_descriptor gv = add_vertex(jv, graph);
- vertexMap.insert(make_pair(jv, gv));
- }
- BGL_FORALL_VERTICES(gv0, graph, Graph) {
- const JaccardGraph::vertex_descriptor jv0 = graph[gv0];
- BGL_FORALL_OUTEDGES(jv0, e, jaccardGraph, JaccardGraph) {
- const JaccardGraph::vertex_descriptor jv1 = target(e, jaccardGraph);
- add_edge(vertexMap[jv0], vertexMap[jv1], graph);
- }
- }
- if(debug) {
- cout << "This component has " << num_vertices(graph) <<
- " vertices and " << num_edges(graph) << " edges." << endl;
- }
-
- // Topological sort of this connected component.
- vector<Graph::vertex_descriptor> reverseTopologicalSort;
- try {
- boost::topological_sort(graph, back_inserter(reverseTopologicalSort));
- } catch (boost::not_a_dag&) {
- if(debug) {
- cout << "Topological sort for this connected component failed." << endl;
- cout << "Computation of assembly path will skip this connected component." << endl;
- }
- return;
- }
-
-
-
- // Find the longest path in this component.
- // See https://en.wikipedia.org/wiki/Longest_path_problem#Acyclic_graphs
- vector<uint64_t> pathLength(component.size(), 0);
- vector<Graph::vertex_descriptor> successor(component.size(), Graph::null_vertex());
-
- // Process vertices in reverse topological order.
- for(const Graph::vertex_descriptor gv0: reverseTopologicalSort) {
- BGL_FORALL_OUTEDGES(gv0, e, graph, Graph) {
- const Graph::vertex_descriptor gv1 = target(e, graph);
- if(pathLength[gv1] + 1 > pathLength[gv0]) {
- pathLength[gv0] = pathLength[gv1] + 1;
- successor[gv0] = gv1;
- }
- }
- }
-
- // Find the vertex with the longest pathLength.
- // This will be the first vertex of the longest path.
- Graph::vertex_descriptor gv0 =
- std::max_element(pathLength.begin(), pathLength.end()) - pathLength.begin();
-
- // Find the longest path by following the successors.
- vector<uint64_t> longestPath;
- longestPath.push_back(jaccardGraph[graph[gv0]].segmentId);
- while(true) {
- const Graph::vertex_descriptor gv1 = successor[gv0];
- if(gv1 == Graph::null_vertex()) {
- break;
- }
- longestPath.push_back(jaccardGraph[graph[gv1]].segmentId);
- gv0 = gv1;
- }
-
- // Store the longest path.
- assemblyPaths.push_back(longestPath);
-
- if(debug) {
- cout << "Longest path has " << longestPath.size() << " segments:" << endl;
- for(const uint64_t segmentId: longestPath) {
- cout << segmentId << " ";
- }
- cout << endl;
- }
-}
diff --git a/src/mode3-JaccardGraph.hpp b/src/mode3-JaccardGraph.hpp
deleted file mode 100644
index f01f881..0000000
--- a/src/mode3-JaccardGraph.hpp
+++ /dev/null
@@ -1,256 +0,0 @@
-#ifndef SHASTA_MODE3_JACCARD_GRAPH_HPP
-#define SHASTA_MODE3_JACCARD_GRAPH_HPP
-
-/*******************************************************************************
-
-The mode3::JaccardGraph is a directed graph in which each vertex represents
-a segment in the mode3::AssemblyGraph.
-
-A directed edge S0->S1 is created if S0 and S1 have:
-- A sufficient number of common reads.
-- High Jaccard similarity.
-- Low unexplained fractions.
-(The above quantities defined as computed by
-mode3::AssemblyGraph::analyzeSegmentPair).
-For the edge to be created, we also require one of the following:
-1. S1 is the first primary segment encountered starting from S0,
- and performing a forward path search using the algorithm defined by
- mode3::AssemblyGraph::createAssemblyPath3.
-2. S0 is the first primary segment encountered starting from S1,
- and performing a backward path search using the algorithm defined by
- mode3::AssemblyGraph::createAssemblyPath3.
-
-*******************************************************************************/
-
-// Shasta.
-#include "mode3-SegmentPairInformation.hpp"
-
-// Boost libraries.
-#include <boost/graph/adjacency_list.hpp>
-#include <boost/graph/iteration_macros.hpp>
-
-// Standard library.
-#include "cstdint.hpp"
-#include "iosfwd.hpp"
-#include <map>
-#include "string.hpp"
-#include "tuple.hpp"
-#include "utility.hpp"
-#include "vector.hpp"
-
-
-
-namespace shasta {
- namespace mode3 {
- class JaccardGraph;
- class JaccardGraphEdge;
- class JaccardGraphEdgeInfo;
- class JaccardGraphVertex;
-
- using JaccardGraphBaseClass = boost::adjacency_list<
- boost::listS, boost::listS, boost::bidirectionalS,
- JaccardGraphVertex, JaccardGraphEdge>;
-
- class ExpandedJaccardGraph;
- class ExpandedJaccardGraphVertex;
- using ExpandedJaccardGraphBaseClass = boost::adjacency_list<
- boost::setS, boost::listS, boost::bidirectionalS,
- ExpandedJaccardGraphVertex>;
-
- }
-
- namespace MemoryMapped {
- template<class T> class Vector;
- }
-}
-
-
-
-class shasta::mode3::JaccardGraphVertex {
-public:
-
- // The assembly graph segment corresponding to this vertex.
- uint64_t segmentId;
-};
-
-
-
-class shasta::mode3::JaccardGraphEdge {
-public:
-
- // The SegmentPairInformation computed by
- // mode3::AssemblyGraph::analyzeSegmentPair
- // when called for (segmentId0, segmentId1), in this order.
- SegmentPairInformation segmentPairInformation;
-
- // The segments encountered on the way.
- vector<uint64_t> segmentIds;
-
- // Flags for the directions in which this edge was found
- // (0=forward, 1=backward).
- array<bool, 2> wasFoundInDirection = {false, false};
-
- // A strong edge is one that was found in both directions.
- bool isStrong() const
- {
- return wasFoundInDirection[0] and wasFoundInDirection[1];
- }
-
- JaccardGraphEdge(
- const SegmentPairInformation& segmentPairInformation,
- uint64_t direction,
- const vector<uint64_t>& segmentIds) :
- segmentPairInformation(segmentPairInformation),
- segmentIds(segmentIds)
- {
- wasFoundInDirection[direction] = true;
- }
-};
-
-
-
-// This is only used during parallel creation of the edges.
-class shasta::mode3::JaccardGraphEdgeInfo {
-public:
- uint64_t segmentId0;
- uint64_t segmentId1;
-
- // The direction in which we found this (0=forward, 1=backward).
- uint64_t direction;
-
- // SegmentPairInformation between segmentId0 and segmentId1.
- SegmentPairInformation segmentPairInformation;
-
- // The segments encountered on the way.
- vector<uint64_t> segmentIds;
-};
-
-
-
-class shasta::mode3::JaccardGraph : public JaccardGraphBaseClass {
-public:
-
- // Create a JaccardGraph with the given number of vertices
- // (one for each segment) and no edges.
- JaccardGraph(uint64_t segmentCount);
-
- // Map segment ids to vertices.
- // If vertex is removed, the corresponding entry will be null_vertex().
- vector<vertex_descriptor> vertexTable;
-
- // Remove a vertex, making sure to update the vertexTable.
- void removeVertex(vertex_descriptor v);
-
- // The edges found by each thread.
- // Only used during edge creation.
- vector< vector<JaccardGraphEdgeInfo> > threadEdges;
-
- // Use the threadEdges to add edges to the graph.
- void storeEdges();
-
- // A strong vertex is one that is incident to at least one strong edge.
- bool isStrongVertex(vertex_descriptor) const;
-
- // Remove all weak vertices.
- void removeWeakVertices();
-
- // Remove all edges to/from weak vertices.
- void clearWeakVertices();
-
- // Write the JaccardGraph in graphviz format.
- void writeGraphviz(
- const string& fileName,
- bool includeIsolatedVertices,
- bool writeLabels) const;
- void writeGraphviz(
- ostream&,
- bool includeIsolatedVertices,
- bool writeLabels) const;
-
- // Write edges in csv format.
- void writeEdgesCsv(const string& fileName) const;
- void writeEdgesCsv(ostream&) const;
-
- // Compute all connected components of size at least minComponentSize.
- // They are stored in order of decreasing size.
- // The vectors contain segmentIds. Use the vertexMap
- // to convert to file decriptors.
- void computeConnectedComponents(uint64_t minComponentSize);
- vector< vector<uint64_t> > components;
-
- // Each stored connected component generates a cluster.
- void findClusters(
- MemoryMapped::Vector<uint64_t>& clusterIds);
-
- // Compute assembly paths.
- void computeAssemblyPaths();
- void computeAssemblyPaths(uint64_t componentId);
- vector< vector<uint64_t> > assemblyPaths;
-
-};
-
-
-
-class shasta::mode3::ExpandedJaccardGraphVertex {
-public:
-
- // The assembly graph segment corresponding to this vertex.
- uint64_t segmentId;
-
- // The total number of JaccardGraph vertices that were merged
- // into this vertex.
- uint64_t totalCount;
-
- // The number of primary JaccardGraph vertices that were merged
- // into this vertex.
- uint64_t primaryCount;
-
- // Construction
- ExpandedJaccardGraphVertex() {}
- ExpandedJaccardGraphVertex(
- uint64_t segmentId,
- bool isPrimary) :
- segmentId(segmentId),
- totalCount(1),
- primaryCount(isPrimary ? 1 : 0)
- {}
-
- double primaryFraction() const
- {
- return double(primaryCount) / double(totalCount);
- }
-
-};
-
-
-
-// The ExpandedJaccardGraph is constructed starting with vertices
-// of the JaccardGraph, and expanding each of the edges into a linear
-// chain of vertices. The graph is then cleaned up by merging equivalent branches.
-class shasta::mode3::ExpandedJaccardGraph : public ExpandedJaccardGraphBaseClass {
-public:
- ExpandedJaccardGraph(const JaccardGraph&);
-
- // Write in graphviz format.
- void writeGraphviz(const string& fileName) const;
- void writeGraphviz(ostream&) const;
-
- // Recursively merge pairs of vertices that have a common parent or child
- // and that refer to the same segmentId.
- void merge();
-private:
-
- // Each Branch represents a pair (vertex_descriptor, direction)
- // where direction can be:
- // - 0 (forward). In this case the vertex has out_degree>1.
- // or
- // - 1 (backward). In this case the vertex has in_degree>1.
- using Branch = pair<vertex_descriptor, uint64_t>;
-
- // Merge v1 and v2 while updating the set of branches.
- void merge(vertex_descriptor v1, vertex_descriptor v2, std::set<Branch>&, bool debug);
-};
-
-
-
-#endif
diff --git a/src/mode3-LocalAssembly.cpp b/src/mode3-LocalAssembly.cpp
new file mode 100644
index 0000000..8b62f00
--- /dev/null
+++ b/src/mode3-LocalAssembly.cpp
@@ -0,0 +1,1997 @@
+// Shasta.
+#include "mode3-LocalAssembly.hpp"
+#include "Assembler.hpp"
+#include "globalMsa.hpp"
+#include "markerAccessFunctions.hpp"
+#include "MarkerGraph.hpp"
+#include "orderPairs.hpp"
+#include "performanceLog.hpp"
+#include "platformDependent.hpp"
+#include "runCommandWithTimeout.hpp"
+#include "Reads.hpp"
+#include "timestamp.hpp"
+using namespace shasta;
+using namespace mode3;
+
+// Seqan.
+#include <seqan/align.h>
+
+// Boost libraries.
+#include <boost/pending/disjoint_sets.hpp>
+#include <boost/graph/iteration_macros.hpp>
+#include <boost/graph/strong_components.hpp>
+#include <boost/uuid/uuid.hpp>
+#include <boost/uuid/uuid_generators.hpp>
+#include <boost/uuid/uuid_io.hpp>
+
+// Standard library.
+#include "fstream.hpp"
+
+
+
+// The oriented reads common between edgeIdA and edgeIdB are always
+// used for assembly. The oriented reads that appear only
+// on edgeIdA or edgeIdB are used for assembly under control
+// of useA and useB.
+// So, if useA and useB are both true (the default), the assembly uses the
+// union of the oriented reads on edgeIdA and edgeIdB.
+// If they are both false, the assembly uses the
+// intersection of the oriented reads on edgeIdA and edgeIdB.
+// If useA is true and useB is false, the assembly uses the
+// oriented reads on edgeIdA, regardless of whether they appear on edgeIdB.
+// If useA is false and useB is true, the assembly uses the
+// oriented reads on edgeIdB, regardless of whether they appear on edgeIdA.
+LocalAssembly::LocalAssembly(
+ const Assembler& assembler,
+ MarkerGraphEdgeId edgeIdA,
+ MarkerGraphEdgeId edgeIdB,
+ uint64_t minVertexCoverage, // 0 = automatic
+ const LocalAssemblyDisplayOptions& displayOptions,
+ const Mode3AssemblyOptions::LocalAssemblyOptions& options,
+ bool useA,
+ bool useB) :
+ assembler(assembler),
+ edgeIdA(edgeIdA),
+ edgeIdB(edgeIdB),
+ options(displayOptions),
+ html(displayOptions.html)
+{
+
+
+ // Store the source target of edgeIdA and the source vertex of edgeIdB.
+ const MarkerGraph::Edge& edgeA = assembler.markerGraph.edges[edgeIdA];
+ const MarkerGraph::Edge& edgeB = assembler.markerGraph.edges[edgeIdB];
+ vertexIdA = edgeA.target;
+ vertexIdB = edgeB.source;
+
+ // If the edges are adjacent, stop here, leaving the AssembnlyPath empty.
+ // This results in empty secondary sequence.
+ if(vertexIdA == vertexIdB) {
+ if(html) {
+ html << "<br>The two edges are adjacent. Intervening sequence is empty.";
+ }
+ return;
+ }
+
+ // Check assumptions here as this used vertexIdA and vertexIdB.
+ checkAssumptions();
+
+ // Oriented reads.
+ gatherOrientedReads(useA, useB);
+
+ // Use the oriented reads that appear both on vertexIdA and vertexIdB
+ // to estimate the base offset between vertexIdA and vertexIdB.
+ estimateOffset();
+
+ // If the offset is negative or cannot be estimated, stop here.
+ // This is pathological and results in empty assembled sequence.
+ if((estimatedABOffset == invalid<int64_t>) or (estimatedABOffset <= 0)) {
+ if(html) {
+ html << "<br>The estimated offset is not positive." << endl;
+ }
+ return;
+ }
+
+ // Markers.
+ gatherMarkers(options.estimatedOffsetRatio);
+ writeOrientedReads();
+ writeOrientedReadsSequences();
+
+ // Assembly graph.
+ alignAndDisjointSets(
+ options.matchScore, options.mismatchScore, options.gapScore,
+ options.maxSkipBases,
+ options.maxDrift, options.minHalfBand, options.minScoreRatio);
+ writeMarkers();
+
+ // Iteration to reduce minVertexCoverage if a long MSA is encountered.
+ while(true) {
+
+ minVertexCoverage = createVertices(minVertexCoverage, options.vertexSamplingRate);
+ createEdges();
+ writeGraph("Initial assembly graph");
+
+ // Remove strongly connected components, then regenerate
+ // edges from scratch with the remaining vertices.
+ if(removeStrongComponents() > 0) {
+ removeAllEdges();
+ createEdges();
+ writeGraph("Assembly graph after removal of strong connected components");
+ }
+
+ // This must be done after removing strongly connected components.
+ // Otherwise we can have inaccessible vertices that cause the
+ // assembly path to encounter dead ends.
+ if(removeInaccessibleVertices()) {
+ writeGraph("Assembly graph after removal of inaccessible vertices.");
+ }
+
+ // Assemble.
+ findAssemblyPath();
+ if(minVertexCoverage > 2) {
+ try {
+ assembleAssemblyPathEdges(options.maxMsaLength, LongMsaPolicy::throwException);
+ } catch(...) {
+ --minVertexCoverage;
+ clear();
+ if(html and displayOptions.showDebugInformation) {
+ html << "<br>minVertexCoverage reduced to " << minVertexCoverage;
+ }
+ continue;
+ }
+
+ } else {
+ assembleAssemblyPathEdges(options.maxMsaLength, LongMsaPolicy::assembleAtLowCoverage);
+ }
+ writeGraph("Assembly graph after assembly");
+
+ // Write assembled sequence.
+ if(html) {
+ vector<Base> sequence;
+ getSecondarySequence(sequence);
+
+ html <<
+ "<h2>Assembled sequence</h2>"
+ "Assembled sequence not including the first and last edge is " <<
+ sequence.size() << " bases long."
+ "<pre style='font-family:monospace'>\n";
+ copy(sequence.begin(), sequence.end(), ostream_iterator<Base>(html));
+ html << "</pre>";
+
+ ofstream fasta("LocalAssembly.fasta");
+ fasta << ">LocalAssembly " << sequence.size() << endl;
+ copy(sequence.begin(), sequence.end(), ostream_iterator<Base>(fasta));
+
+ getCompleteSequence(sequence);
+
+ html <<
+ "Assembled sequence including the first and last edge is " <<
+ sequence.size() << " bases long."
+ "<pre style='font-family:monospace'>\n";
+ copy(sequence.begin(), sequence.end(), ostream_iterator<Base>(html));
+ html << "</pre>";
+ }
+
+ break;
+ }
+
+}
+
+
+
+void LocalAssembly::checkAssumptions() const
+{
+ SHASTA_ASSERT(edgeIdA != edgeIdB);
+ SHASTA_ASSERT(assembler.assemblerInfo->assemblyMode == 3);
+ SHASTA_ASSERT(assembler.getReads().representation == 0);
+ SHASTA_ASSERT(not assembler.markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdA));
+ SHASTA_ASSERT(not assembler.markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdB));
+
+ const MarkerGraph& markerGraph = assembler.markerGraph;
+ const auto& markers = assembler.markers;
+
+ // edgeIdA and edgeIdB cannot have duplicate oriented reads.
+ if(markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdA)) {
+ throw runtime_error("Duplicated oriented read on edgeIdA.");
+ }
+ if(markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdB)) {
+ throw runtime_error("Duplicated oriented read on edgeIdB.");
+ }
+
+ // Neither can their source and target vertices.
+ if(markerGraph.vertexHasDuplicateOrientedReadIds(vertexIdA, markers)) {
+ throw runtime_error("Duplicated oriented read on target vertex of edgeIdA.");
+ }
+ if(markerGraph.vertexHasDuplicateOrientedReadIds(vertexIdB, markers)) {
+ throw runtime_error("Duplicated oriented read on source vertex of edgeIdB.");
+ }
+}
+
+
+
+void LocalAssembly::gatherOrientedReads(bool useA, bool useB)
+{
+ // Joint loop over marker intervals that appear in edgeIdA and/or edgeIdB.
+ const auto markerIntervalsA = assembler.markerGraph.edgeMarkerIntervals[edgeIdA];
+ const auto markerIntervalsB = assembler.markerGraph.edgeMarkerIntervals[edgeIdB];
+ const auto beginA = markerIntervalsA.begin();
+ const auto beginB = markerIntervalsB.begin();
+ const auto endA = markerIntervalsA.end();
+ const auto endB = markerIntervalsB.end();
+ auto itA = beginA;
+ auto itB = beginB;
+ while(true) {
+ if((itA == endA) and (itB == endB)) {
+ break;
+ }
+
+ // Oriented reads that appear only in edgeIdA.
+ if((itB == endB) or (itA != endA and itA->orientedReadId < itB->orientedReadId)) {
+
+ if(useA) {
+ const MarkerInterval& markerIntervalA = *itA;
+ const OrientedReadId orientedReadIdA = markerIntervalA.orientedReadId;
+ const uint32_t ordinalA = markerIntervalA.ordinals[1]; // Because vertexIdA is the target of edgeIdA
+
+ OrientedReadInfo info(orientedReadIdA);
+ info.ordinalA = ordinalA;
+ orientedReadInfos.push_back(info);
+ }
+
+ ++itA;
+ }
+
+
+
+ // Oriented reads that appear only in edgeIdB.
+ else if((itA == endA) or (itB != endB and itB->orientedReadId < itA->orientedReadId)) {
+
+ if(useB) {
+ const MarkerInterval& markerIntervalB = *itB;
+ const OrientedReadId orientedReadIdB = markerIntervalB.orientedReadId;
+ const uint32_t ordinalB = markerIntervalB.ordinals[0]; // Because vertexIdB is the source of edgeIdB
+
+ OrientedReadInfo info(orientedReadIdB);
+ info.ordinalB = ordinalB;
+ orientedReadInfos.push_back(info);
+ }
+
+ ++itB;
+ }
+
+
+
+ // Oriented reads that appear in both edgeIdA and edgeIdB.
+ // They are always used for assembly regardless of the settings of useA and useB.
+ else {
+ SHASTA_ASSERT(itA != endA);
+ SHASTA_ASSERT(itB != endB);
+
+ const MarkerInterval& markerIntervalA = *itA;
+ const OrientedReadId orientedReadIdA = markerIntervalA.orientedReadId;
+
+ const MarkerInterval& markerIntervalB = *itB;
+ const OrientedReadId orientedReadIdB = markerIntervalB.orientedReadId;
+
+ SHASTA_ASSERT(orientedReadIdA == orientedReadIdB);
+ const OrientedReadId orientedReadId = orientedReadIdA;
+
+ const uint32_t ordinalA = markerIntervalA.ordinals[1]; // Because vertexIdA is the target of edgeIdA
+ const uint32_t ordinalB = markerIntervalB.ordinals[0]; // Because vertexIdB is the source of edgeIdB
+
+ // Only use it if the ordinal offset is not negative.
+ if(ordinalB >= ordinalA) {
+
+ OrientedReadInfo info(orientedReadId);
+ info.ordinalA = ordinalA;
+ info.ordinalB = ordinalB;
+ orientedReadInfos.push_back(info);
+ }
+
+ ++itA;
+ ++itB;
+ }
+
+ }
+}
+
+
+
+// Write an html table with one row for each oriented read in orientedReadInfos,
+// followed by a summary table counting oriented reads that appear
+// in edgeIdA only, in edgeIdB only, or in both.
+// Does nothing unless html output was requested and options.showOrientedReads is set.
+void LocalAssembly::writeOrientedReads() const
+{
+    if(not html) {
+        return;
+    }
+    if(not options.showOrientedReads) {
+        return;
+    }
+
+    html <<
+        "<h2>Oriented reads</h2>"
+        "<table>"
+        "<tr>"
+        "<th>Index"
+        "<th>Oriented<br>read"
+        "<th>OrdinalA"
+        "<th>OrdinalB"
+        "<th>Ordinal<br>offset"
+        "<th>PositionA"
+        "<th>PositionB"
+        "<th>Position<br>offset"
+        "<th>First<br>ordinal"
+        "<th>Last<br>ordinal"
+        "<th>First<br>position"
+        "<th>Last<br>position"
+        ;
+
+    // One table row per oriented read.
+    for(uint64_t i=0; i<orientedReadInfos.size(); i++) {
+        const OrientedReadInfo& info = orientedReadInfos[i];
+
+        html <<
+            "<tr>"
+            "<td class=centered>" << i <<
+            "<td class=centered>" << info.orientedReadId;
+
+        // The A columns are only filled in for oriented reads that appear
+        // in edgeIdA, and similarly for the B columns.
+        html << "<td class=centered>";
+        if(info.isOnA()) {
+            html << info.ordinalA;
+        }
+
+        html << "<td class=centered>";
+        if(info.isOnB()) {
+            html << info.ordinalB;
+        }
+
+        // Ordinal offset, only defined for reads that appear in both edges.
+        html << "<td class=centered>";
+        if(info.isOnA() and info.isOnB()) {
+            html << info.ordinalOffset();
+        }
+
+        html << "<td class=centered>";
+        if(info.isOnA()) {
+            html << basePosition(info.orientedReadId, info.ordinalA);
+        }
+
+        html << "<td class=centered>";
+        if(info.isOnB()) {
+            html << basePosition(info.orientedReadId, info.ordinalB);
+        }
+
+        // Base offset between the A and B markers. Expected non-negative,
+        // presumably because reads with a negative ordinal offset are
+        // excluded when the oriented reads are gathered.
+        html << "<td class=centered>";
+        if(info.isOnA() and info.isOnB()) {
+            const int64_t baseOffset =
+                basePosition(info.orientedReadId, info.ordinalB) -
+                basePosition(info.orientedReadId, info.ordinalA);
+            SHASTA_ASSERT(baseOffset >= 0);
+            html << baseOffset;
+        }
+
+        // First/last marker actually stored for this oriented read.
+        SHASTA_ASSERT(not info.markerInfos.empty());
+        const MarkerInfo& firstMarkerInfo = info.markerInfos.front();
+        const MarkerInfo& lastMarkerInfo = info.markerInfos.back();
+        html <<
+            "<td class=centered>" << firstMarkerInfo.ordinal <<
+            "<td class=centered>" << lastMarkerInfo.ordinal <<
+            "<td class=centered>" << firstMarkerInfo.position <<
+            "<td class=centered>" << lastMarkerInfo.position;
+    }
+
+    html << "</table>";
+
+
+    // Count reads.
+    // Every oriented read must be on A, on B, or both.
+    uint64_t commonCount = 0;
+    uint64_t onlyACount = 0;
+    uint64_t onlyBCount = 0;
+    for(const OrientedReadInfo& info: orientedReadInfos) {
+        const bool isOnA = info.isOnA();
+        const bool isOnB = info.isOnB();
+        if(isOnA) {
+            if(isOnB) {
+                ++commonCount;
+            } else {
+                ++onlyACount;
+            }
+        } else {
+            if(isOnB) {
+                ++onlyBCount;
+            } else {
+                // A read on neither edge should never have been stored.
+                SHASTA_ASSERT(0);
+            }
+
+        }
+    }
+    html <<
+        "<p><table>"
+        "<tr><th class=left>Common<td class=centered>" << commonCount <<
+        "<tr><th class=left>On A only<td class=centered>" << onlyACount <<
+        "<tr><th class=left>On B only<td class=centered>" << onlyBCount <<
+        "<tr><th class=left>Total<td class=centered>" << orientedReadInfos.size() <<
+        "</table>";
+
+}
+
+
+
+// Get the base position of a marker in an oriented read
+// given the ordinal.
+int64_t LocalAssembly::basePosition(OrientedReadId orientedReadId, int64_t ordinal) const
+{
+    // Map (orientedReadId, ordinal) to a global marker id,
+    // then look up that marker's position in the read.
+    const MarkerId markerId = assembler.getMarkerId(orientedReadId, uint32_t(ordinal));
+    return int64_t(assembler.markers.begin()[markerId].position);
+}
+
+
+
+// Estimate the base offset between the A and B markers as the average of
+// (positionB - positionA) over the oriented reads that appear in both
+// edgeIdA and edgeIdB. Stores the result in estimatedABOffset.
+// If there are no common oriented reads, estimatedABOffset is set to
+// invalid<int64_t>.
+void LocalAssembly::estimateOffset()
+{
+    int64_t n = 0;      // Number of common oriented reads.
+    int64_t sum = 0;    // Sum of their base offsets.
+    for(const OrientedReadInfo& info: orientedReadInfos) {
+        if(info.isOnA() and info.isOnB()) {
+            const OrientedReadId orientedReadId = info.orientedReadId;
+            const int64_t positionA = basePosition(orientedReadId, info.ordinalA);
+            const int64_t positionB = basePosition(orientedReadId, info.ordinalB);
+            const int64_t baseOffset = positionB - positionA;
+            SHASTA_ASSERT(baseOffset >= 0);
+
+            sum += baseOffset;
+            ++n;
+        }
+    }
+    if(n == 0) {
+        estimatedABOffset = invalid<int64_t>;
+
+        if(html) {
+            html << "<br>The offset cannot be estimated because there are no common oriented reads between " <<
+                edgeIdA << " and " << edgeIdB;
+        }
+    } else {
+        // Average offset, rounded to the nearest integer.
+        estimatedABOffset = int64_t(std::round(double(sum) / double(n)));
+
+        if(html) {
+            html << "<br>Estimated position offset is " << estimatedABOffset << " bases.";
+        }
+    }
+}
+
+
+
+// Fill in the markerInfos vector of each read.
+// For a read on both edges, all markers between ordinalA and ordinalB are used.
+// For a read on only one edge, markers are added moving away from the known
+// side until the base distance from/to that side exceeds offsetThreshold,
+// which is computed from the estimated A->B offset.
+void LocalAssembly::gatherMarkers(double estimatedOffsetRatio)
+{
+    // Maximum base distance from the known side, for reads that appear
+    // on only one of the two edges.
+    const int64_t offsetThreshold = int64_t(estimatedOffsetRatio * double(estimatedABOffset));
+
+
+    // Loop over our oriented reads.
+    for(uint64_t i=0; i<orientedReadInfos.size(); i++) {
+        OrientedReadInfo& info = orientedReadInfos[i];
+        const OrientedReadId orientedReadId = info.orientedReadId;
+        info.markerInfos.clear();
+
+        // Oriented reads that appear on both edgeIdA and edgeIdB.
+        // Use all markers between ordinalA and ordinalB, inclusive.
+        if(info.isOnA() and info.isOnB()) {
+            for(int64_t ordinal=info.ordinalA;
+                ordinal<=info.ordinalB; ordinal++) {
+                addMarkerInfo(i, ordinal);
+            }
+        }
+
+        // Oriented reads that appear on edgeIdA but not on edgeIdB.
+        // Walk forward from ordinalA until the base position exceeds
+        // maxPosition or the read ends.
+        else if(info.isOnA() and not info.isOnB()) {
+            const int64_t maxPosition = basePosition(orientedReadId, info.ordinalA) + offsetThreshold;
+            const int64_t markerCount = int64_t(assembler.markers.size(orientedReadId.getValue()));
+
+            for(int64_t ordinal=info.ordinalA;
+                ordinal<markerCount; ordinal++) {
+                const int64_t position = basePosition(orientedReadId, ordinal);
+                if(position > maxPosition) {
+                    break;
+                }
+                addMarkerInfo(i, ordinal);
+            }
+        }
+
+        // Oriented reads that appear on edgeIdB but not on edgeIdA.
+        // Walk backward from ordinalB until the base position drops below
+        // minPosition or the read begins.
+        else if(info.isOnB() and not info.isOnA()) {
+            const int64_t minPosition = basePosition(orientedReadId, info.ordinalB) - offsetThreshold;
+
+            for(int64_t ordinal=info.ordinalB; ordinal>=0; ordinal--) {
+                const int64_t position = basePosition(orientedReadId, ordinal);
+                if(position < minPosition) {
+                    break;
+                }
+                addMarkerInfo(i, ordinal);
+            }
+
+            // We added the MarkerInfos in reverse order, so we have to reverse them.
+            reverse(info.markerInfos.begin(), info.markerInfos.end());
+        }
+
+        // Every oriented read must be on at least one of the two edges.
+        else {
+            SHASTA_ASSERT(0);
+        }
+    }
+
+}
+
+
+
+// Add the marker at given ordinal to the i-th oriented read.
+// Stores the ordinal, its base position, and the KmerId of the marker.
+void LocalAssembly::addMarkerInfo(uint64_t i, int64_t ordinal)
+{
+    OrientedReadInfo& info = orientedReadInfos[i];
+
+    MarkerInfo markerInfo;
+    markerInfo.ordinal = ordinal;
+    markerInfo.position = basePosition(info.orientedReadId, ordinal);
+    // Look up the k-mer id of this marker.
+    markerInfo.kmerId = getOrientedReadMarkerKmerId(
+        info.orientedReadId,
+        uint32_t(ordinal),
+        assembler.assemblerInfo->k,
+        assembler.getReads(),
+        assembler.markers);
+
+    info.markerInfos.push_back(markerInfo);
+}
+
+
+
+// Write an html table with one row per marker of each oriented read used
+// in this assembly step, including its disjoint set id and coverage.
+// Does nothing unless html output was requested and options.showMarkers is set.
+// Must be called after the disjoint sets have been computed.
+void LocalAssembly::writeMarkers()
+{
+    if(not (html and options.showMarkers)) {
+        return;
+    }
+
+    // Marker length, needed to write out each k-mer.
+    const uint64_t k = assembler.assemblerInfo->k;
+
+    html <<
+        "<h2>Markers used in this assembly step</h2>"
+        "<table>"
+        "<tr>"
+        "<th>Oriented<br>read<br>index"
+        "<th>Oriented<br>read"
+        "<th>Ordinal"
+        "<th>Ordinal<br>offset<br>from A"
+        "<th>Ordinal<br>offset<br>to B"
+        "<th>Position"
+        "<th>KmerId"
+        "<th>Kmer"
+        "<th>Vertex"
+        "<th>Coverage";
+
+    for(uint64_t i=0; i<orientedReadInfos.size(); i++) {
+        const OrientedReadInfo& info = orientedReadInfos[i];
+        for(const MarkerInfo& markerInfo: info.markerInfos) {
+            const Kmer kmer(markerInfo.kmerId, k);
+
+            html <<
+                "<tr>"
+                "<td class=centered>" << i <<
+                "<td class=centered>" << info.orientedReadId <<
+                "<td class=centered>" << markerInfo.ordinal;
+
+            // Ordinal offset from A.
+            // Only defined for reads on edgeIdA, whose first stored
+            // marker is the one at ordinalA.
+            html << "<td class=centered>";
+            if(info.isOnA()) {
+                html << markerInfo.ordinal - info.markerInfos.front().ordinal;
+            }
+
+            // Ordinal offset to B.
+            // Only defined for reads on edgeIdB, whose last stored
+            // marker is the one at ordinalB.
+            html << "<td class=centered>";
+            if(info.isOnB()) {
+                html << info.markerInfos.back().ordinal - markerInfo.ordinal;
+            }
+
+            html <<
+                "<td class=centered>" << markerInfo.position <<
+                "<td class=centered>" << markerInfo.kmerId <<
+                "<td class=centered style='font-family:monospace'>";
+            kmer.write(html, k);
+            // The disjoint set id identifies the vertex this marker
+            // contributes to; the set size is the vertex coverage.
+            html <<
+                "<td class=centered>" << markerInfo.disjointSetId <<
+                "<td class=centered>" << disjointSetsMap[markerInfo.disjointSetId].size();
+        }
+    }
+
+    html << "</table>";
+}
+
+
+
+// Compute alignments and use them to create the disjoint set data structure,
+// from which the marker graph will be created. Each resulting disjoint set
+// groups markers, from different oriented reads, that aligned to each other.
+// maxDrift is the maximum tolerated length drift of each read.
+// Used to compute the band for banded alignments.
+// maxSkipBases is currently unused. It was only needed by older code
+// that used general (non-banded) alignments.
+void LocalAssembly::alignAndDisjointSets(
+    uint64_t matchScore,
+    uint64_t mismatchScore,
+    uint64_t gapScore,
+    uint64_t maxSkipBases,
+    double maxDrift,
+    uint64_t minHalfBand,
+    double minScoreRatio
+    )
+{
+
+    // SeqAn types we need.
+    using TSequence = seqan::String<KmerId>;
+    using TStringSet = seqan::StringSet<TSequence>;
+    using TDepStringSet = seqan::StringSet< TSequence, seqan::Dependent<> >;
+    using TAlignGraph = seqan::Graph< seqan::Alignment<TDepStringSet> >;
+
+    // Optional detailed debug output of the alignment matches,
+    // in Graphviz (dot) and csv format.
+    const bool detailedDebugOutput = false;
+    ofstream dot;
+    ofstream csv;
+    if(detailedDebugOutput) {
+        dot.open("LocalAssembly-AlignmentDetails.dot");
+        dot << "graph LocalAssemblyAlignments {\n";
+        csv.open("LocalAssembly-AlignmentDetails.csv");
+        csv << "OrientedReadId0,Ordinal0,OrientedReadId1,Ordinal1\n";
+    }
+
+    // Assign consecutive ids to the markers of all oriented reads.
+    // These ids index the disjoint sets data structure.
+    uint64_t markerId = 0;
+    for(OrientedReadInfo& info: orientedReadInfos) {
+        for(MarkerInfo& markerInfo: info.markerInfos) {
+            markerInfo.id = markerId++;
+        }
+    }
+
+    // Initialize the disjoint sets data structure,
+    // with each marker initially in its own set.
+    // Use data() rather than &rank[0] so this is also well defined
+    // when there are no markers at all.
+    const uint64_t markerCount = markerId;
+    vector<uint64_t> rank(markerCount);
+    vector<uint64_t> parent(markerCount);
+    boost::disjoint_sets<uint64_t*, uint64_t*> disjointSets(rank.data(), parent.data());
+    for(uint64_t markerId=0; markerId<markerCount; markerId++) {
+        disjointSets.make_set(markerId);
+    }
+
+    // Construct a Seqan sequence containing the KmerIds for each oriented read.
+    // Add 100 to each KmerId because Seqan uses 45 to represent a gap.
+    vector<TSequence> seqanSequences(orientedReadInfos.size());
+    for(uint64_t i=0; i<orientedReadInfos.size(); i++) {
+        const OrientedReadInfo& info = orientedReadInfos[i];
+        TSequence& seqanSequence = seqanSequences[i];
+        for(const MarkerInfo& markerInfo: info.markerInfos) {
+            seqan::appendValue(seqanSequence, markerInfo.kmerId + 100);
+        }
+    }
+
+
+
+    // Loop over pairs of reads.
+    // The i0+1 form of the loop condition avoids unsigned underflow
+    // of size()-1 when orientedReadInfos is empty.
+    for(uint64_t i0=0; i0+1<orientedReadInfos.size(); i0++) {
+        const OrientedReadInfo& info0 = orientedReadInfos[i0];
+        const uint64_t length0 = info0.markerInfos.size();
+        const TSequence& seqanSequence0 = seqanSequences[i0];
+        for(uint64_t i1=i0+1; i1<orientedReadInfos.size(); i1++) {
+            const OrientedReadInfo& info1 = orientedReadInfos[i1];
+            const uint64_t length1 = info1.markerInfos.size();
+            const TSequence& seqanSequence1 = seqanSequences[i1];
+
+            // Figure the constraints for this alignment.
+            const bool constrainedA = info0.isOnA() and info1.isOnA();
+            const bool constrainedB = info0.isOnB() and info1.isOnB();
+
+            // If constrained on A, merge the first markers of the two reads,
+            // as the alignment does not guarantee that.
+            // If constrained on B, merge the last markers of the two reads,
+            // as the alignment does not guarantee that.
+            if(constrainedA) {
+                disjointSets.union_set(info0.markerInfos.front().id, info1.markerInfos.front().id);
+            }
+            if(constrainedB) {
+                disjointSets.union_set(info0.markerInfos.back().id, info1.markerInfos.back().id);
+            }
+
+            // Only do alignments that are constrained on at least one side.
+            if(not (constrainedA or constrainedB)) {
+                continue;
+            }
+
+            // Store the KmerId sequences of the two oriented reads
+            // in a SeqAn string set.
+            TStringSet sequences;
+            appendValue(sequences, seqanSequence0);
+            appendValue(sequences, seqanSequence1);
+
+            // Banded alignment, allowing for the specified maxDrift.
+            // This is necessary to prevent large cycles in the graph.
+            // It is also good for performance.
+            using namespace seqan;
+            TAlignGraph graph(sequences);
+            int score = 0;
+            if(constrainedA and constrainedB) {
+                // Constrained on both sides: band around both diagonals.
+                const int64_t diagonalA = 0;
+                const int64_t diagonalB = int64_t(length0) - int64_t(length1);
+                const int64_t totalDrift = int64_t(maxDrift * 0.5 * double(min(length0, length1)));
+                const int64_t halfBand = totalDrift + int64_t(minHalfBand);
+                const int64_t minBand = min(diagonalA, diagonalB) - halfBand;
+                const int64_t maxBand = max(diagonalA, diagonalB) + halfBand;
+                score = globalAlignment(
+                    graph,
+                    Score<int, Simple>(int(matchScore), int(mismatchScore), int(gapScore)),
+                    AlignConfig<false, false, false, false>(),
+                    int(minBand), int(maxBand),
+                    LinearGaps());
+            } else if(constrainedA and not constrainedB) {
+                // Constrained on A only: free end gaps at the end.
+                const int64_t diagonalA = 0;
+                const int64_t totalDrift = int64_t(maxDrift * double(min(length0, length1)));
+                const int64_t halfBand = totalDrift + int64_t(minHalfBand);
+                const int64_t minBand = diagonalA - halfBand;
+                const int64_t maxBand = diagonalA + halfBand;
+                score = globalAlignment(
+                    graph,
+                    Score<int, Simple>(int(matchScore), int(mismatchScore), int(gapScore)),
+                    AlignConfig<false, false, true, true>(),
+                    int(minBand), int(maxBand),
+                    LinearGaps());
+            } else if(constrainedB and not constrainedA) {
+                // Constrained on B only: free end gaps at the beginning.
+                const int64_t diagonalB = int64_t(length0) - int64_t(length1);
+                const int64_t totalDrift = int64_t(maxDrift * double(min(length0, length1)));
+                const int64_t halfBand = totalDrift + int64_t(minHalfBand);
+                const int64_t minBand = diagonalB - halfBand;
+                const int64_t maxBand = diagonalB + halfBand;
+                score = globalAlignment(
+                    graph,
+                    Score<int, Simple>(int(matchScore), int(mismatchScore), int(gapScore)),
+                    AlignConfig<true, true, false, false>(),
+                    int(minBand), int(maxBand),
+                    LinearGaps());
+            } else {
+                SHASTA_ASSERT(0);
+            }
+
+            // If SeqAn was not able to compute the banded alignment, ignore it.
+            if(score == MinValue<int>::VALUE) {
+                if(html and options.showDebugInformation) {
+                    html << "<br>Alignment between " << info0.orientedReadId <<
+                        " and " << info1.orientedReadId <<
+                        " ignored.";
+                }
+                continue;
+            }
+
+            // Check that the score is sufficiently good.
+            const uint64_t bestPossibleScore = matchScore * min(length0, length1);
+            const double scoreRatio = double(score) / double(bestPossibleScore);
+            if(scoreRatio < minScoreRatio) {
+                if(html and options.showDebugInformation) {
+                    html << "<br>Alignment between " << info0.orientedReadId <<
+                        " and " << info1.orientedReadId << ": lengths " << length0 << " " << length1 <<
+                        ", score " << score << "/" << bestPossibleScore << " " <<
+                        double(score) / double(bestPossibleScore) <<
+                        " discarded due to low score.";
+                }
+                continue;
+            }
+
+
+
+            // Extract the alignment from the graph.
+            // This creates a single sequence consisting of the two rows
+            // of the alignment, concatenated.
+            TSequence align;
+            convertAlignment(graph, align);
+            const uint64_t totalAlignmentLength = seqan::length(align);
+            SHASTA_ASSERT((totalAlignmentLength % 2) == 0);    // Because we are aligning two sequences.
+            const uint64_t alignmentLength = totalAlignmentLength / 2;
+            const uint64_t seqanGapValue = 45;
+
+            // Use the alignment to update the disjoint sets data structure.
+            uint64_t j0 = 0;
+            uint64_t j1 = 0;
+            for(uint64_t positionInAlignment=0; positionInAlignment<alignmentLength; positionInAlignment++) {
+                const KmerId kmerId0 = align[positionInAlignment];
+                const KmerId kmerId1 = align[positionInAlignment + alignmentLength];
+
+                if(kmerId0 == seqanGapValue) {
+                    if(kmerId1 == seqanGapValue) {
+                        // Both 0 and 1 are gaps.
+                        SHASTA_ASSERT(0);
+                    } else {
+                        // 0 is gap, 1 is not gap.
+                        ++j1;
+                    }
+                } else {
+                    if(kmerId1 == seqanGapValue) {
+                        // 0 is not gap, 1 is gap.
+                        ++j0;
+                    } else {
+                        // Neither 0 nor 1 is a gap.
+                        if(kmerId0 == kmerId1) {
+                            // If a match, merge the disjoint sets containing these two markers.
+                            disjointSets.union_set(info0.markerInfos[j0].id, info1.markerInfos[j1].id);
+                            if(detailedDebugOutput) {
+                                dot << "\"" << info0.orientedReadId << "-";
+                                dot << info0.markerInfos[j0].ordinal << "\"--\"";
+                                dot << info1.orientedReadId << "-";
+                                dot << info1.markerInfos[j1].ordinal << "\";\n";
+                                csv <<
+                                    info0.orientedReadId << "," <<
+                                    info0.markerInfos[j0].ordinal << "," <<
+                                    info1.orientedReadId << "," <<
+                                    info1.markerInfos[j1].ordinal << "\n";
+                            }
+                        }
+                        ++j0;
+                        ++j1;
+                    }
+
+                }
+            }
+            // At the end of the alignment both sequences must be fully consumed.
+            SHASTA_ASSERT(j0 == length0);
+            SHASTA_ASSERT(j1 == length1);
+        }
+    }
+
+    // Store in each MarkerInfo the id of the disjoint set it belongs to.
+    for(uint64_t i=0; i<orientedReadInfos.size(); i++) {
+        OrientedReadInfo& info = orientedReadInfos[i];
+        for(MarkerInfo& markerInfo: info.markerInfos) {
+            markerInfo.disjointSetId = disjointSets.find_set(markerInfo.id);
+        }
+    }
+
+    // Fill in the disjoint sets map.
+    // For each disjoint set id it stores the (read index, marker index)
+    // pairs of the markers in the set.
+    disjointSetsMap.clear();
+    for(uint64_t i=0; i<orientedReadInfos.size(); i++) {
+        const OrientedReadInfo& info = orientedReadInfos[i];
+        for(uint64_t j=0; j<info.markerInfos.size(); j++) {
+            const MarkerInfo& markerInfo = info.markerInfos[j];
+            disjointSetsMap[markerInfo.disjointSetId].push_back({i, j});
+        }
+    }
+
+    // Histogram disjoint sets sizes.
+    disjointSetsSizeHistogram.clear();
+    for(const auto& p: disjointSetsMap) {
+        const uint64_t disjointSetSize = p.second.size();
+        if(disjointSetSize >= disjointSetsSizeHistogram.size()) {
+            disjointSetsSizeHistogram.resize(disjointSetSize + 1, 0);
+        }
+        ++disjointSetsSizeHistogram[disjointSetSize];
+    }
+
+
+    // Write the histogram of disjoint sets sizes.
+    if(html and options.showDebugInformation) {
+
+        html <<
+            "<h2>Disjoint sets size histogram</h2>"
+            "<table>"
+            "<tr>"
+            "<th>Size"
+            "<th>Frequency"
+            "<th>Markers";
+
+        for(uint64_t disjointSetSize=0; disjointSetSize<disjointSetsSizeHistogram.size(); disjointSetSize++) {
+            const uint64_t frequency = disjointSetsSizeHistogram[disjointSetSize];
+            if(frequency) {
+                html <<
+                    "<tr>"
+                    "<td class=centered>" << disjointSetSize <<
+                    "<td class=centered>" << frequency <<
+                    "<td class=centered>" << frequency * disjointSetSize;
+            }
+        }
+
+        html << "</table>";
+    }
+
+    if(detailedDebugOutput) {
+        dot << "}\n";
+    }
+}
+
+
+
+// Create vertices. Each disjoint set with at least minVertexCoverage markers
+// generates a vertex. The disjoint sets of the A and B markers always
+// generate a vertex, regardless of coverage.
+// If minVertexCoverage is 0, a value is chosen automatically using
+// vertexSamplingRate and the estimated A->B offset.
+// Returns the value of minVertexCoverage actually used.
+uint64_t LocalAssembly::createVertices(
+    uint64_t minVertexCoverage,
+    double vertexSamplingRate) // Only used if minVertexCoverage is 0
+{
+    LocalAssembly& graph = *this;
+
+    // Remove all vertices and edges, just in case.
+    LocalAssemblyBaseClass::clear();
+    vertexMap.clear();
+
+    // Find the disjoint sets corresponding to vertexIdA and vertexIdB.
+    // Those will always generate a vertex regardless of coverage.
+    // All reads on A must agree on the disjoint set of their first marker
+    // (and similarly for B on their last marker), so assert if they don't.
+    disjointSetIdA = invalid<uint64_t>;
+    disjointSetIdB = invalid<uint64_t>;
+    for(const OrientedReadInfo& info: orientedReadInfos) {
+        if(info.isOnA()) {
+            const MarkerInfo& markerInfoA = info.markerInfos.front();
+            if(disjointSetIdA == invalid<uint64_t>) {
+                disjointSetIdA = markerInfoA.disjointSetId;
+            } else {
+                SHASTA_ASSERT(disjointSetIdA == markerInfoA.disjointSetId);
+            }
+        }
+        if(info.isOnB()) {
+            const MarkerInfo& markerInfoB = info.markerInfos.back();
+            if(disjointSetIdB == invalid<uint64_t>) {
+                disjointSetIdB = markerInfoB.disjointSetId;
+            } else {
+                SHASTA_ASSERT(disjointSetIdB == markerInfoB.disjointSetId);
+            }
+        }
+    }
+
+    if(html) {
+        html << "<br>Start vertex " << disjointSetIdA << ", end vertex " << disjointSetIdB;
+    }
+
+
+
+    // If minVertexCoverage is 0, select a value automatically.
+    // Select a value that gives a number of vertices approximately correct given
+    // the estimated offset.
+    if(minVertexCoverage == 0) {
+
+        // Estimate the desired number of vertices given the estimated offset.
+        const uint64_t totalBaseCount = assembler.assemblerInfo->baseCount * 2; // Both strands.
+        const uint64_t totalMarkerCount = assembler.markers.totalSize();
+        const double markerDensity = double(totalMarkerCount) / double(totalBaseCount);
+        const uint64_t desiredVertexCount = uint64_t(
+            vertexSamplingRate * markerDensity * double(estimatedABOffset));
+
+        // Use the disjointSetsSizeHistogram to choose a value of minVertexCoverage
+        // that will give us approximately this number of vertices.
+        // Never reduce minVertexCoverage below 2.
+        // NOTE(review): this assumes disjointSetsSizeHistogram is non-empty;
+        // size()-1 would wrap around if it were empty. Confirm callers only
+        // get here after alignAndDisjointSets filled in the histogram.
+        uint64_t cumulativeDisjointSetsCount = 0;
+        for(minVertexCoverage = disjointSetsSizeHistogram.size()-1; minVertexCoverage>2; --minVertexCoverage) {
+            cumulativeDisjointSetsCount += disjointSetsSizeHistogram[minVertexCoverage];
+#if 0
+            if(html and options.showDebugInformation) {
+                html << "<br>minVertexCoverage " << minVertexCoverage <<
+                    " would generate " << cumulativeDisjointSetsCount <<
+                    " vertices and we want " << desiredVertexCount;
+            }
+#endif
+            if(cumulativeDisjointSetsCount >= desiredVertexCount) {
+                break;
+            }
+        }
+
+        if(html and options.showDebugInformation) {
+            html << "<br>Set minVertexCoverage to " << minVertexCoverage <<
+                " based on marker density " << markerDensity <<
+                ", vertex sampling rate " << vertexSamplingRate <<
+                ", desired number of vertices " << desiredVertexCount;
+        }
+    }
+
+
+
+    // Loop over disjoint sets that are large enough.
+    // Also always include disjointSetIdA and disjointSetIdB.
+    for(const auto& p: disjointSetsMap) {
+        const uint64_t disjointSetId = p.first;
+        const auto& disjointSet = p.second;
+        if(disjointSet.size() >= minVertexCoverage or
+            disjointSetId==disjointSetIdA or
+            disjointSetId==disjointSetIdB) {
+
+            const vertex_descriptor v = add_vertex({disjointSetId}, graph);
+            vertexMap.insert(make_pair(disjointSetId, v));
+        }
+    }
+
+    if(html and options.showDebugInformation) {
+        html << "<br>The assembly graph has " << num_vertices(graph) << " vertices.";
+    }
+
+    return minVertexCoverage;
+}
+
+
+
+// Create edges by following the reads.
+// For each oriented read, walk its markers in order; each consecutive pair
+// of markers whose disjoint sets generated vertices creates (or reuses) an
+// edge, and the marker index pair is appended to the edge's markerIntervals.
+void LocalAssembly::createEdges()
+{
+    LocalAssembly& graph = *this;
+
+    removeAllEdges();
+
+    // Loop over all reads.
+    for(uint64_t i=0; i<orientedReadInfos.size(); i++) {
+        const OrientedReadInfo& info = orientedReadInfos[i];
+
+        // Follow this read, finding the vertices it reaches.
+        vertex_descriptor v0 = null_vertex();   // Previous vertex seen on this read, if any.
+        LocalAssemblyMarkerIndexes indexes0;    // Marker indexes corresponding to v0.
+        for(uint64_t j=0; j<info.markerInfos.size(); j++) {
+            const MarkerInfo& markerInfo = info.markerInfos[j];
+            const uint64_t disjointSetId = markerInfo.disjointSetId;
+            const auto it = vertexMap.find(disjointSetId);
+
+            // Markers whose disjoint set did not generate a vertex are skipped.
+            if(it != vertexMap.end()) {
+                const vertex_descriptor v1 = it->second;
+                const LocalAssemblyMarkerIndexes indexes1 = {i, j};
+                if(v0 != null_vertex()) {
+
+                    // Get the edge v0->v1, creating it if necessary.
+                    edge_descriptor e;
+                    bool edgeExists = false;
+                    tie(e, edgeExists) = edge(v0, v1, graph);
+                    if(not edgeExists) {
+                        bool edgeWasAdded = false;
+                        tie(e, edgeWasAdded) = add_edge(v0, v1, graph);
+                        SHASTA_ASSERT(edgeWasAdded);
+                    }
+                    LocalAssemblyEdge& edge = graph[e];
+
+                    edge.markerIntervals.push_back({indexes0, indexes1});
+                }
+
+                // v1 becomes the previous vertex.
+                v0 = v1;
+                indexes0 = indexes1;
+
+            }
+        }
+    }
+    if(html and options.showDebugInformation) {
+        html << "<br>The assembly graph has " << num_edges(graph) << " edges.";
+    }
+}
+
+
+
+// Remove all edges of the graph, keeping the vertices.
+void LocalAssembly::removeAllEdges()
+{
+    LocalAssembly& graph = *this;
+
+    // Clearing every vertex removes all of its incident edges.
+    const auto allVertices = vertices(graph);
+    for(auto it=allVertices.first; it!=allVertices.second; ++it) {
+        clear_vertex(*it, graph);
+    }
+}
+
+
+
+// Write the graph in Graphviz format to the file with the given name.
+// Delegates to the ostream overload.
+void LocalAssembly::writeGraphviz(const string& fileName) const
+{
+    ofstream file(fileName);
+    writeGraphviz(file);
+}
+
+
+
+// Write the graph in Graphviz format.
+// Vertices are labeled with their disjoint set id and coverage, with the
+// A and B vertices drawn specially. Edge color encodes coverage; edges not
+// on the assembly path (if one was computed) are drawn dashed.
+void LocalAssembly::writeGraphviz(ostream& s) const
+{
+    const LocalAssembly& graph = *this;
+
+    // S and V for edges HSV.
+    const double S = 0.7;
+    const double V = 1.;
+
+    // Gather assembly path edges, sorted so binary_search can be used below.
+    vector<edge_descriptor> sortedAssemblyPathEdges = assemblyPath;
+    sort(sortedAssemblyPathEdges.begin(), sortedAssemblyPathEdges.end());
+
+    s <<
+        "digraph LocalAssembly {\n"
+        "mclimit=0.01;\n" // For layout speed
+        "edge [penwidth=6];\n"
+        "node [fontname=\"Courier New\"];\n"
+        "edge [fontname=\"Courier New\"];\n";
+
+    // Global vertex appearance, controlled by the display options.
+    if(options.showVertices) {
+        if(options.showVertexLabels) {
+            s << "node [shape=rectangle style=filled color=black fillcolor=gray80];\n";
+        } else {
+            s << "node [shape=point width=0.2];\n";
+        }
+    } else {
+        s << "node [shape=point style=invis];\n";
+    }
+
+    // Vertices.
+    BGL_FORALL_VERTICES(v, graph, LocalAssembly) {
+        const uint64_t disjointSetId = graph[v].disjointSetId;
+        auto it = disjointSetsMap.find(disjointSetId);
+        SHASTA_ASSERT(it != disjointSetsMap.end());
+        // Vertex coverage is the number of markers in its disjoint set.
+        const uint64_t coverage = it->second.size();
+
+        const bool isA = (graph[v].disjointSetId == disjointSetIdA);
+        const bool isB = (graph[v].disjointSetId == disjointSetIdB);
+
+        s << disjointSetId << "[";
+
+        // Label.
+        s << "label=\"";
+        if(isA) {
+            s << "A\\n";
+        }
+        if(isB) {
+            s << "B\\n";
+        }
+        s << graph[v].disjointSetId << "\\n" << coverage;
+        s << "\" ";
+
+        // Special drawing of the begin/end vertices.
+        if(isA or isB) {
+            s << "shape=rectangle style=filled color=black fillcolor=cyan";
+        }
+
+        s << "];\n";
+    }
+
+    // Edges.
+    BGL_FORALL_EDGES(e, graph, LocalAssembly) {
+        const LocalAssemblyEdge& edge = graph[e];
+        const vertex_descriptor v0 = source(e, graph);
+        const vertex_descriptor v1 = target(e, graph);
+        const uint64_t coverage = edge.coverage();
+
+        // Compute the hue based on coverage: green (1/3) when coverage
+        // reaches the total number of oriented reads, shading towards
+        // red (0) for coverage 1.
+        double H;
+        if(coverage >= orientedReadInfos.size()) {
+            H = 1./3.;
+        } else {
+            H = (double(coverage - 1) / (3. * double(orientedReadInfos.size() - 1)));
+        }
+        const string colorString = "\"" + to_string(H) + " " + to_string(S) + " " + to_string(V) + "\"";
+
+        s <<
+            graph[v0].disjointSetId << "->" <<
+            graph[v1].disjointSetId << " [";
+
+        if(options.showEdgeLabels) {
+            s << "label=\"" << coverage << "\"";
+        }
+        s << " color=" << colorString;
+
+        // Tooltip.
+        s << " tooltip=\"";
+        s << "Coverage " << coverage << "\\n";
+        s << "\"";
+
+        // If we have an assembly path and this edge is not on the assembly path,
+        // draw it dashed.
+        if(not assemblyPath.empty()) {
+            if(not std::binary_search(sortedAssemblyPathEdges.begin(), sortedAssemblyPathEdges.end(), e)) {
+                s << " style=dashed";
+            }
+        }
+
+        s << "];\n";
+    }
+
+    s << "}\n";
+}
+
+
+
+// Write the graph to html under the given title, if graph output was requested.
+void LocalAssembly::writeGraph(const string& title)
+{
+    LocalAssembly& graph = *this;
+
+    // Nothing to do unless html output with graph display was requested.
+    if(not (html and options.showGraph)) {
+        return;
+    }
+
+    html << "<h2>" << title << "</h2>";
+    html << "<p>The assembly graph has " << num_vertices(graph) <<
+        " vertices and " << num_edges(graph) << " edges.";
+    writeGraph();
+}
+
+
+
+// Compute a Graphviz layout of the graph and inline the resulting svg
+// into the html output. Uses a uniquely named temporary .dot file, which
+// is always removed, including on the error and timeout paths.
+void LocalAssembly::writeGraph() const
+{
+    // Write out the graph in graphviz format.
+    const string uuid = to_string(boost::uuids::random_generator()());
+    const string dotFileName = tmpDirectory() + uuid + ".dot";
+    {
+        ofstream dotFile(dotFileName);
+        writeGraphviz(dotFile);
+    }
+
+    // Compute layout in svg format.
+    const string command = "dot -O -T svg " + dotFileName;
+    bool timeoutTriggered = false;
+    bool signalOccurred = false;
+    int returnCode = 0;
+    const double timeout = 600;
+    runCommandWithTimeout(command, timeout, timeoutTriggered, signalOccurred, returnCode);
+    if(returnCode!=0 or signalOccurred) {
+        // Don't leave the temporary .dot file behind on failure.
+        std::filesystem::remove(dotFileName);
+        throw runtime_error("An error occurred while running the following command: " + command);
+    }
+    if(timeoutTriggered) {
+        std::filesystem::remove(dotFileName);
+        throw runtime_error("Timeout during graph layout computation.");
+    }
+
+    // Remove the .dot file.
+    std::filesystem::remove(dotFileName);
+
+    // Copy the svg file to html.
+    const string svgFileName = dotFileName + ".svg";
+    ifstream svgFile(svgFileName);
+    html << "<p>" << svgFile.rdbuf();
+    svgFile.close();
+
+    // Remove the .svg file.
+    std::filesystem::remove(svgFileName);
+}
+
+
+
+// Remove all vertices that belong to non-trivial strongly connected
+// components, except for the A and B vertices, which are never removed.
+// Returns the number of vertices removed.
+uint64_t LocalAssembly::removeStrongComponents()
+{
+    LocalAssembly& graph = *this;
+    uint64_t removedCount = 0;
+
+    // Map the vertices to integers.
+    // Needed because boost::strong_components requires a vertex index map.
+    uint64_t vertexIndex = 0;
+    std::map<vertex_descriptor, uint64_t> vertexMap;
+    BGL_FORALL_VERTICES(v, graph, LocalAssembly) {
+        vertexMap.insert({v, vertexIndex++});
+    }
+
+    // Compute strong components.
+    std::map<vertex_descriptor, uint64_t> componentMap;
+    boost::strong_components(
+        graph,
+        boost::make_assoc_property_map(componentMap),
+        boost::vertex_index_map(boost::make_assoc_property_map(vertexMap)));
+
+    // Gather the vertices in each strong component.
+    std::map<uint64_t, vector<vertex_descriptor> > componentVertices;
+    for(const auto& p: componentMap) {
+        componentVertices[p.second].push_back(p.first);
+    }
+
+
+
+    // Keep the non-trivial ones.
+    // A non-trivial strong component has at least one internal edge.
+    // This means that it either has more than one vertex,
+    // or it consists of a single vertex with a self-edge.
+    for(const auto& p: componentVertices) {
+
+        // Figure out if it is non-trivial.
+        bool isNonTrivial;
+        if(p.second.size() > 1) {
+
+            // More than one vertex. Certainly non-trivial.
+            isNonTrivial = true;
+        } else if (p.second.size() == 1) {
+
+            // Only one vertex. Non-trivial if self-edge present.
+            const vertex_descriptor v = p.second.front();
+            bool selfEdgeExists = false;
+            tie(ignore, selfEdgeExists) = edge(v, v, graph);
+            isNonTrivial = selfEdgeExists;
+        } else {
+
+            // Empty. This should never happen.
+            SHASTA_ASSERT(0);
+        }
+
+        // If non-trivial, remove all of its vertices.
+        // But don't remove vertexIdA or vertexIdB.
+        if(isNonTrivial) {
+            for(const vertex_descriptor v: p.second) {
+                const LocalAssemblyVertex& vertex = graph[v];
+                if(vertex.disjointSetId == disjointSetIdA or vertex.disjointSetId == disjointSetIdB) {
+                    continue;
+                }
+                removeVertex(v);
+                ++removedCount;
+            }
+        }
+    }
+
+    if(html and options.showDebugInformation) {
+        html <<
+            "<br>Removed " << removedCount <<
+            " vertices in non-trivial strongly connected components."
+            "<br>The graph has now " << num_vertices(graph) <<
+            " vertices.";
+
+    }
+
+    return removedCount;
+}
+
+
+
+// Remove the given vertex from the graph and from the vertexMap,
+// together with all of its incident edges.
+void LocalAssembly::removeVertex(vertex_descriptor v)
+{
+    LocalAssembly& graph = *this;
+
+    // Forget the disjoint set -> vertex mapping for this vertex,
+    // then remove its edges and finally the vertex itself.
+    vertexMap.erase(graph[v].disjointSetId);
+    clear_vertex(v, graph);
+    remove_vertex(v, graph);
+}
+
+
+
+// Find a path from the A vertex to the B vertex by greedily following,
+// at each vertex, the out-edge with the most coverage.
+// The path is stored in assemblyPath.
+// Asserts if a vertex with no usable out-edge is reached before B.
+void LocalAssembly::findAssemblyPath()
+{
+    const LocalAssembly& graph = *this;
+    assemblyPath.clear();
+
+
+    // Find the first and last vertex of the path we are looking for.
+    vertex_descriptor vA = null_vertex();
+    vertex_descriptor vB = null_vertex();
+    BGL_FORALL_VERTICES(v, graph, LocalAssembly) {
+        const LocalAssemblyVertex& vertex = graph[v];
+        if(vertex.disjointSetId == disjointSetIdA) {
+            SHASTA_ASSERT(vA == null_vertex());
+            vA = v;
+        }
+        if(vertex.disjointSetId == disjointSetIdB) {
+            SHASTA_ASSERT(vB == null_vertex());
+            vB = v;
+        }
+    }
+    SHASTA_ASSERT(vA != null_vertex());
+    SHASTA_ASSERT(vB != null_vertex());
+
+
+    // Main iteration loop.
+    // Greedily follow the highest coverage out-edge until vB is reached.
+    vertex_descriptor v = vA;
+    while(v != vB) {
+
+        // Find the edge with the most coverage.
+        edge_descriptor eNext;
+        uint64_t bestCoverage = 0;
+        BGL_FORALL_OUTEDGES(v, e, graph, LocalAssembly) {
+            // Ignore a self-edge A->A.
+            // This can exist because we did not allow vertex A (and B)
+            // to be removed when removing strong components.
+            if(v == vA and target(e, graph) == vA) {
+                continue;
+            }
+            const uint64_t coverage = graph[e].coverage();
+            if(coverage > bestCoverage) {
+                eNext = e;
+                bestCoverage = coverage;
+            }
+        }
+        // Diagnostic message before the assert below fires.
+        if(bestCoverage == 0) {
+            cout << "LocalAssembly: at " << graph[v].disjointSetId <<
+                ": no out-edge found when filling path from " <<
+                edgeIdA << " to " << edgeIdB << endl;
+        }
+        SHASTA_ASSERT(bestCoverage > 0);
+
+        // Store this edge.
+        assemblyPath.push_back(eNext);
+        v = target(eNext, graph);
+    }
+
+    if(html and options.showDebugInformation) {
+        html << "<br>The assembly path has " << assemblyPath.size() << " edges.";
+    }
+}
+
+
+
+
+void LocalAssembly::assembleAssemblyPathEdges(
+ uint64_t maxMsaLength,
+ LongMsaPolicy longMsaPolicy)
+{
+ const LocalAssembly& graph = *this;
+
+ for(const edge_descriptor e: assemblyPath) {
+ assembleEdge(maxMsaLength, longMsaPolicy, e);
+ }
+
+
+
+ // Write a table containing a summary of edge sequences with coverage,
+ // and their position in assembled sequence.
+ if(html and options.showAssemblyDetails) {
+ html <<
+ "<br><table>"
+ "<tr>"
+ "<th>Source"
+ "<th>Target"
+ "<th>Begin"
+ "<th>End"
+ "<th>length"
+ "<th>Sequence"
+ ;
+
+ uint64_t position = 0;
+ for(const edge_descriptor e: assemblyPath) {
+ const LocalAssemblyEdge& edge = graph[e];
+ const vector<Base>& sequence = edge.consensusSequence;
+ const vector<uint64_t>& coverage = edge.consensusCoverage;
+ SHASTA_ASSERT(sequence.size() == coverage.size());
+
+ html <<
+ "<tr>"
+ "<td class=centered>" << graph[source(e, graph)].disjointSetId <<
+ "<td class=centered>" << graph[target(e, graph)].disjointSetId <<
+ "<td class=centered>" << position <<
+ "<td class=centered>" << position + sequence.size() <<
+ "<td class=centered>" << sequence.size() <<
+ "<td class=centered style='font-family:monospace'>";
+ copy(sequence.begin(), sequence.end(), ostream_iterator<Base>(html));
+ html << "<br>";
+ for(const uint64_t c: coverage) {
+ writeCoverageCharacterToHtml(c);
+ }
+
+ position += sequence.size();
+ }
+
+ html << "</table>";
+ }
+}
+
+
+
+void LocalAssembly::assembleEdge(
+ uint64_t maxMsaLength,
+ LongMsaPolicy longMsaPolicy,
+ edge_descriptor e)
+{
+ LocalAssembly& graph = *this;
+ LocalAssemblyEdge& edge = graph[e];
+
+ if(html and options.showAssemblyDetails) {
+ html << "<h2>Assembly details for edge " <<
+ graph[source(e, graph)].disjointSetId << "->" <<
+ graph[target(e, graph)].disjointSetId << "</h2>"
+ "<table>"
+ "<tr><th>Oriented<br>read<th>Sequence<br>length<th>Sequence";
+ }
+
+ const uint64_t k = assembler.assemblerInfo->k;
+ SHASTA_ASSERT((k % 2) == 0);
+ const uint64_t kHalf = k / 2;
+
+ // Gather the sequences of the contributing oriented reads.
+ // Each sequence is stored with the number of distinct oriented reads that
+ // have that sequence.
+ vector< pair<vector<Base>, uint64_t> > orientedReadSequences;
+
+ // Loop over marker intervals of this edge.
+ vector<Base> orientedReadSequence;
+ for(const auto& p: edge.markerIntervals) {
+
+ // Locate the two markers of this marker interval.
+ const LocalAssemblyMarkerIndexes indexes0 = p.first;
+ const LocalAssemblyMarkerIndexes indexes1 = p.second;
+ const uint64_t i0 = indexes0.i;
+ const uint64_t i1 = indexes1.i;
+ const uint64_t j0 = indexes0.j;
+ const uint64_t j1 = indexes1.j;
+
+ // They must belong to the same oriented read.
+ SHASTA_ASSERT(i0 == i1);
+ const uint64_t i = i0;
+ const OrientedReadInfo& info = orientedReadInfos[i];
+ const OrientedReadId orientedReadId = info.orientedReadId;
+
+ const MarkerInfo& markerInfo0 = info.markerInfos[j0];
+ const MarkerInfo& markerInfo1 = info.markerInfos[j1];
+
+ // Now we can get the contributing sequence.
+ const uint64_t position0 = markerInfo0.position + kHalf;
+ const uint64_t position1 = markerInfo1.position + kHalf;
+
+ // Now we can get the sequence contributed by this oriented read.
+ orientedReadSequence.clear();
+ for(uint64_t position=position0; position!=position1; position++) {
+ const Base base = assembler.getReads().getOrientedReadBase(orientedReadId, uint32_t(position));
+ orientedReadSequence.push_back(base);
+ }
+
+ if(html and options.showAssemblyDetails) {
+ html <<
+ "<tr><td class=centered>" << orientedReadId <<
+ "<td class=centered>" << orientedReadSequence.size() <<
+ "<td class=centered style='font-family:monospace'>";
+ copy(orientedReadSequence.begin(), orientedReadSequence.end(),
+ ostream_iterator<Base>(html));
+ }
+
+ // Store it.
+ bool found = false;
+ for(auto& p: orientedReadSequences) {
+ if(p.first == orientedReadSequence) {
+ ++p.second;
+ found = true;
+ break;
+ }
+ }
+ if(not found) {
+ orientedReadSequences.push_back(make_pair(orientedReadSequence, 1));
+ }
+
+ }
+
+ // Sort the sequences by decreasing number of supporting reads.
+ sort(orientedReadSequences.begin(), orientedReadSequences.end(),
+ OrderPairsBySecondOnlyGreater<vector<Base>, uint64_t>());
+
+ if(html and options.showAssemblyDetails) {
+ html << "</table>";
+
+ html << "<p><table>"
+ "<tr><th>Coverage<th>Sequence<br>length<th>Sequence";
+ for(const auto& p: orientedReadSequences) {
+ const vector<Base>& sequence = p.first;
+ const uint64_t coverage = p.second;
+ html <<
+ "<tr>"
+ "<td class=centered>" << coverage <<
+ "<td class=centered>" << sequence.size() <<
+ "<td class=centered style='font-family:monospace'>";
+ copy(sequence.begin(), sequence.end(), ostream_iterator<Base>(html));
+
+ }
+ html << "</table>";
+ }
+
+ // If there is only one distinct sequence (all reads agree),
+ // store that one sequence as the consensus.
+ // This is the most common case.
+ if(orientedReadSequences.size() == 1) {
+ const auto& p = orientedReadSequences.front();
+ const vector<Base>& sequence = p.first;
+ const uint64_t coverage = p.second;
+ edge.consensusSequence = sequence;
+ edge.consensusCoverage.clear();
+ edge.consensusCoverage.resize(sequence.size(), coverage);
+ return;
+ }
+
+
+ // If getting here, we have more than one sequence, and we must
+ // compute a consensus via multiple sequence alignment (MSA).
+
+ // If any of the sequences are too long, react according to longMsaPolicy.
+ // This can be problematic.
+ if(orientedReadSequences.size() > 1) {
+
+ // Find the length of the longest sequence.
+ uint64_t maxLength = 0;
+ for(const auto& p: orientedReadSequences) {
+ const vector<Base>& sequence = p.first;
+ maxLength = max(sequence.size(), maxMsaLength);
+ }
+
+ if(maxLength > maxMsaLength) {
+ if(html and options.showDebugInformation) {
+ html << "<br>MSA length " << maxLength << " at " <<
+ graph[source(e, graph)].disjointSetId << "->" <<
+ graph[target(e, graph)].disjointSetId;
+ }
+ if(longMsaPolicy == LongMsaPolicy::throwException) {
+ throw runtime_error("Long MSA.");
+ } else {
+ orientedReadSequences.resize(1);
+ if(html and options.showDebugInformation) {
+ html << "<br>Assembling this edge at coverage " << orientedReadSequences.front().second;
+ }
+ }
+ }
+ }
+
+ // Compute the MSA.
+ vector< vector<AlignedBase> > alignment;
+ globalMsaSpoa(orientedReadSequences, alignment);
+ SHASTA_ASSERT(alignment.size() == orientedReadSequences.size());
+
+ // Compute coverage at each alignment position for each of the 5 AlignedBases.
+ const uint64_t alignmentLength = alignment.front().size();
+ vector< array<uint64_t, 5> > coverage(alignmentLength, {0, 0, 0, 0, 0});
+ for(uint64_t i=0; i<orientedReadSequences.size(); i++) {
+ const vector<AlignedBase>& alignmentRow = alignment[i];
+ SHASTA_ASSERT(alignmentRow.size() == alignmentLength);
+ for(uint64_t position=0; position<alignmentLength; position++) {
+ const AlignedBase b = alignmentRow[position];
+ coverage[position][b.value] += orientedReadSequences[i].second;
+ }
+ }
+
+ // Compute coverage-based consensus at each alignment position.
+ vector<AlignedBase> alignedConsensus;
+ vector<uint64_t> alignmentConsensusCoverage;
+ for(const auto& c: coverage) {
+ const uint64_t iBase = std::max_element(c.begin(), c.end()) - c.begin();
+ alignedConsensus.push_back(AlignedBase::fromInteger(iBase));
+ alignmentConsensusCoverage.push_back(c[iBase]);
+ }
+ SHASTA_ASSERT(alignedConsensus.size() == alignmentLength);
+
+ // Store in the edge the consensus and its coverage, excluding the gaps.
+ edge.consensusSequence.clear();
+ edge.consensusCoverage.clear();
+ for(uint64_t position=0; position<alignedConsensus.size(); position++) {
+ const AlignedBase b = alignedConsensus[position];
+ if(not b.isGap()) {
+ edge.consensusSequence.push_back(Base(b));
+ edge.consensusCoverage.push_back(alignmentConsensusCoverage[position]);
+ }
+ }
+
+ if(html and options.showAssemblyDetails) {
+
+ html << "<p><table>"
+ "<tr><th>Coverage<th>Sequence<br>length<th>Aligned<br>sequence";
+
+ // Write one row for each distinct sequence.
+ for(uint64_t i=0; i<orientedReadSequences.size(); i++) {
+ const auto& p = orientedReadSequences[i];
+ const vector<Base>& sequence = p.first;
+ const uint64_t coverage = p.second;
+ const vector<AlignedBase>& alignedSequence = alignment[i];
+ html <<
+ "<tr>"
+ "<td class=centered>" << coverage <<
+ "<td class=centered>" << sequence.size() <<
+ "<td class=centered style='font-family:monospace'>";
+ for(uint64_t position=0; position<alignedSequence.size(); position++) {
+ const AlignedBase b = alignedSequence[position];
+ const bool isDiscordant = (b != alignedConsensus[position]);
+ if(isDiscordant) {
+ html << "<span style='background-color:LightCoral'>";
+ }
+ html << alignedSequence[position];
+ if(isDiscordant) {
+ html << "</span>";
+ }
+ }
+ }
+
+ // Write one row with aligned consensus.
+ html <<
+ "<tr>"
+ "<td class=centered colspan=2>Consensus"
+ "<td class=centered style='font-family:monospace'>";
+ copy(alignedConsensus.begin(), alignedConsensus.end(),
+ ostream_iterator<AlignedBase>(html));
+
+ // Write one row with aligned consensus coverage.
+ html <<
+ "<tr>"
+ "<td class=centered colspan=2>Consensus coverage"
+ "<td class=centered style='font-family:monospace'>";
+ for(uint64_t position=0; position<coverage.size(); position++) {
+ writeCoverageCharacterToHtml(alignmentConsensusCoverage[position]);
+ }
+
+ // Write one row with aligned discordant coverage.
+ html <<
+ "<tr>"
+ "<td class=centered colspan=2>Discordant coverage"
+ "<td class=centered style='font-family:monospace'>";
+ for(uint64_t position=0; position<coverage.size(); position++) {
+ writeCoverageCharacterToHtml(edge.coverage() - alignmentConsensusCoverage[position]);
+ }
+
+ // Write one row with coverage for each of the 5 AlignedBases.
+ for(uint64_t b=0; b<5; b++) {
+ html <<
+ "<tr><td colspan=2 class=centered>" << AlignedBase::fromInteger(b) << " coverage"
+ "<td class=centered style='font-family:monospace'>";
+ for(uint64_t position=0; position<coverage.size(); position++) {
+ writeCoverageCharacterToHtml(coverage[position][b]);
+ }
+ }
+ html << "</table>";
+
+ // Write another table with the final, ungapped consensus and its coverage.
+ html <<
+ "<p>Consensus length is " << edge.consensusSequence.size() <<
+ "<br><table>"
+ "<tr><th>Consensus<td class=centered style='font-family:monospace'>";
+ copy(edge.consensusSequence.begin(), edge.consensusSequence.end(),
+ ostream_iterator<Base>(html));
+ html << "<tr><th>Consensus coverage<td class=centered style='font-family:monospace'>";
+ for(const uint64_t coverage: edge.consensusCoverage) {
+ writeCoverageCharacterToHtml(coverage);
+ }
+ html << "<tr><th>Discordant coverage<td class=centered style='font-family:monospace'>";
+ for(const uint64_t coverage: edge.consensusCoverage) {
+ writeCoverageCharacterToHtml(edge.coverage() - coverage);
+ }
+ html << "</table>";
+ }
+
+}
+
+
+
+void LocalAssembly::writeCoverageCharacterToHtml(uint64_t coverage) const
+{
+ if(coverage == 0) {
+ html << "&nbsp;";
+ } else if(coverage < 10) {
+ html << coverage;
+ } else if(coverage < 36) {
+ html << char((coverage - 10) + 'A');
+ } else {
+ html << "*";
+ }
+
+}
+
+
+// Get the sequence between edgeIdA and edgeIdB.
+// This does not include the sequences of edgeIdA and edgeIdB themselves.
+void LocalAssembly::getSecondarySequence(
+ vector<Base>& sequence) const
+{
+ const LocalAssembly& graph = *this;
+
+ sequence.clear();
+ for(const edge_descriptor e: assemblyPath) {
+ const vector<Base>& edgeSequence = graph[e].consensusSequence;
+ copy(edgeSequence.begin(), edgeSequence.end(), back_inserter(sequence));
+ }
+
+}
+
+
+
+// Get the complete sequence, including the sequences of edgeIdA and edgeIdB.
+void LocalAssembly::getCompleteSequence(
+ vector<Base>& sequence) const
+{
+ const LocalAssembly& graph = *this;
+
+ sequence.clear();
+
+ const auto edgeASequence = assembler.markerGraph.edgeSequence[edgeIdA];
+ copy(edgeASequence.begin(), edgeASequence.end(), back_inserter(sequence));
+
+ for(const edge_descriptor e: assemblyPath) {
+ const vector<Base>& edgeSequence = graph[e].consensusSequence;
+ copy(edgeSequence.begin(), edgeSequence.end(), back_inserter(sequence));
+ }
+
+ const auto edgeBSequence = assembler.markerGraph.edgeSequence[edgeIdB];
+ copy(edgeBSequence.begin(), edgeBSequence.end(), back_inserter(sequence));
+
+
+}
+
+
+
+// Remove vertices that are not accessible from vertexIdA
+// or from which vertexIdB is not accessible.
+// Returns the number of vertices that were removed.
+uint64_t LocalAssembly::removeInaccessibleVertices()
+{
+ LocalAssembly& graph = *this;
+
+ // Find the vertices corresponding to vertexIdA and vertexIdB.
+ vertex_descriptor vA = null_vertex();
+ vertex_descriptor vB = null_vertex();
+ BGL_FORALL_VERTICES(v, graph, LocalAssembly) {
+ const LocalAssemblyVertex& vertex = graph[v];
+ if(vertex.disjointSetId == disjointSetIdA) {
+ SHASTA_ASSERT(vA == null_vertex());
+ vA = v;
+ }
+ if(vertex.disjointSetId == disjointSetIdB) {
+ SHASTA_ASSERT(vB == null_vertex());
+ vB = v;
+ }
+ }
+ SHASTA_ASSERT(vA != null_vertex());
+ SHASTA_ASSERT(vB != null_vertex());
+
+
+
+ // Use a forward BFS to find the vertices that are accessible from vertexIdA,
+ // moving forward. Those vertices get their isAccessibleA flag set.
+ {
+ std::queue<vertex_descriptor> q;
+ q.push(vA);
+ graph[vA].isAccessibleA = true;
+ while(not q.empty()) {
+ const vertex_descriptor v0 = q.front();
+ q.pop();
+
+ BGL_FORALL_OUTEDGES(v0, e, graph, LocalAssembly) {
+ const vertex_descriptor v1 = target(e, graph);
+ auto& vertex1 = graph[v1];
+ if(not vertex1.isAccessibleA) {
+ vertex1.isAccessibleA = true;
+ q.push(v1);
+ }
+ }
+ }
+ SHASTA_ASSERT(graph[vB].isAccessibleA);
+ }
+
+
+
+ // Use a backward BFS to find the vertices that are accessible from vertexIdB,
+ // moving backward. Those vertices get their isAccessibleB flag set.
+ {
+ std::queue<vertex_descriptor> q;
+ q.push(vB);
+ graph[vB].isAccessibleB = true;
+ while(not q.empty()) {
+ const vertex_descriptor v0 = q.front();
+ q.pop();
+
+ BGL_FORALL_INEDGES(v0, e, graph, LocalAssembly) {
+ const vertex_descriptor v1 = source(e, graph);
+ auto& vertex1 = graph[v1];
+ if(not vertex1.isAccessibleB) {
+ vertex1.isAccessibleB = true;
+ q.push(v1);
+ }
+ }
+ }
+ SHASTA_ASSERT(graph[vA].isAccessibleB);
+ }
+
+
+ // Gather the vertices to be removed.
+ vector<vertex_descriptor> verticesToBeRemoved;
+ BGL_FORALL_VERTICES(v, graph, LocalAssembly) {
+ const auto& vertex = graph[v];
+ if(not (vertex.isAccessibleA and vertex.isAccessibleB)) {
+ verticesToBeRemoved.push_back(v);
+ }
+ }
+
+ // Remove them.
+ for(const vertex_descriptor v: verticesToBeRemoved) {
+ removeVertex(v);
+ }
+
+ return verticesToBeRemoved.size();
+}
+
+
+
+// Remove all vertices and edges and clear the vertexMap and assemblyPath.
+// All other data are left alone.
+void LocalAssembly::clear()
+{
+ LocalAssemblyBaseClass::clear();
+ vertexMap.clear();
+ assemblyPath.clear();
+}
+
+
+
+void LocalAssembly::writeOrientedReadsSequences() const
+{
+ if(not html) {
+ return;
+ }
+ if(not options.showOrientedReads) {
+ return;
+ }
+
+ const uint64_t k = assembler.assemblerInfo->k;
+ SHASTA_ASSERT((k % 2) == 0);
+ const uint64_t kHalf = k / 2;
+
+ ofstream fasta("LocalAssembly-OrientedReadSequences.fasta");
+
+ for(const OrientedReadInfo& info: orientedReadInfos) {
+
+ SHASTA_ASSERT(not info.markerInfos.empty());
+ const uint64_t position0 = uint64_t(info.markerInfos.front().position) + kHalf;
+ const uint64_t position1 = uint64_t(info.markerInfos.back().position) + kHalf;
+
+ fasta <<
+ ">" << info.orientedReadId << " " <<
+ position0 << ":" << position1 <<
+ " length " << position1-position0 << "\n";
+ for(uint64_t position=position0; position!=position1; position++) {
+ const Base base = assembler.getReads().getOrientedReadBase(info.orientedReadId, uint32_t(position));
+ fasta << base;
+ }
+ fasta << "\n";
+ }
+}
diff --git a/src/mode3-LocalAssembly.hpp b/src/mode3-LocalAssembly.hpp
new file mode 100644
index 0000000..b52d207
--- /dev/null
+++ b/src/mode3-LocalAssembly.hpp
@@ -0,0 +1,345 @@
+#pragma once
+
+// LocalAssembly assembles the sequence between two primary marker graph edges.
+// It uses a local marker graph.
+
+// Shasta.
+#include "AssemblerOptions.hpp"
+#include "Base.hpp"
+#include "invalid.hpp"
+#include "ReadId.hpp"
+#include "shastaTypes.hpp"
+
+// Boost libraries.
+#include <boost/graph/adjacency_list.hpp>
+
+// Standard library.
+#include "utility.hpp"
+#include "vector.hpp"
+
+
+
+namespace shasta {
+ namespace mode3 {
+ class LocalAssemblyVertex;
+ class LocalAssemblyEdge;
+ class LocalAssembly;
+ using LocalAssemblyBaseClass = boost::adjacency_list<
+ boost::listS,
+ boost::listS,
+ boost::bidirectionalS,
+ LocalAssemblyVertex,
+ LocalAssemblyEdge
+ >;
+ class LocalAssemblyDisplayOptions;
+ class LocalAssemblyMarkerIndexes;
+ }
+ class Assembler;
+};
+
+
+
+class shasta::mode3::LocalAssemblyDisplayOptions {
+public:
+
+ // If this is not open, no output takes place.
+ ostream& html;
+
+ bool showGraph = false;
+ bool showOrientedReads = false;
+ bool showMarkers = false;
+ bool showVertices = false;
+ bool showVertexLabels = false;
+ bool showEdgeLabels = false;
+ bool showAssemblyDetails = false;
+ bool showDebugInformation = false;
+
+ LocalAssemblyDisplayOptions(ostream& html) : html(html) {}
+};
+
+
+
+// A way to identify a marker in LocalAssembly, besides its id.
+class shasta::mode3::LocalAssemblyMarkerIndexes {
+public:
+ uint64_t i; // Index in orientedReadInfos
+ uint64_t j; // Index in OrientedReadInfo::markerInfos;
+};
+
+
+
+class shasta::mode3::LocalAssemblyVertex {
+public:
+ uint64_t disjointSetId;
+ bool isAccessibleA = false;
+ bool isAccessibleB = false;
+};
+
+
+
+class shasta::mode3::LocalAssemblyEdge {
+public:
+
+ // Each marker interval is identified by the two markers.
+ vector< pair<LocalAssemblyMarkerIndexes, LocalAssemblyMarkerIndexes> > markerIntervals;
+
+ uint64_t coverage() const
+ {
+ return markerIntervals.size();
+ }
+
+ // Consensus of the sequences contributes by each marker interval.
+ vector<Base> consensusSequence;
+ vector<uint64_t> consensusCoverage;
+};
+
+
+
+class shasta::mode3::LocalAssembly : public LocalAssemblyBaseClass {
+public:
+
+ // Hide class Base defined in boost::adjacency_list.
+ using Base = shasta::Base;
+
+ // The oriented reads common between edgeIdA and edgeIdB are always
+ // used for assembly. The oriented reads that appear only
+ // on edgeIdA or edgeIdB are used for assembly under control
+ // of useA and useB.
+ // So, if useA and useB are both true (the default), the assembly uses the
+ // union of the oriented reads on edgeIdA and edgeIdB.
+ // If they are both false, the assembly uses the
+ // intersection of the oriented reads on edgeIdA and edgeIdB.
+ // If useA is true and useB is false, the assembly uses the
+ // oriented reads on edgeIdA, regardless of whether they appear on edgeIdB.
+ // If useA is false and useB is true, the assembly uses the
+ // oriented reads on edgeIdB, regardless of whether they appear on edgeIdA.
+ LocalAssembly(
+ const Assembler&,
+ MarkerGraphEdgeId edgeIdA,
+ MarkerGraphEdgeId edgeIdB,
+ uint64_t minVertexCoverage, // 0 = automatic
+ const LocalAssemblyDisplayOptions&,
+ const Mode3AssemblyOptions::LocalAssemblyOptions&,
+ bool useA = true,
+ bool useB = true);
+
+ // Get the sequence between edgeIdA and edgeIdB.
+ // This does not include the sequences of edgeIdA and edgeIdB themselves.
+ void getSecondarySequence(
+ vector<Base>&) const;
+
+ // Get the complete sequence, including the sequences of edgeIdA and edgeIdB.
+ void getCompleteSequence(
+ vector<Base>&) const;
+
+private:
+
+ // Store constructor arguments.
+ const Assembler& assembler;
+ MarkerGraphEdgeId edgeIdA;
+ MarkerGraphEdgeId edgeIdB;
+ const LocalAssemblyDisplayOptions& options;
+ ostream& html;
+
+ MarkerGraphVertexId vertexIdA; // The target vertex of marker graph edge edgeIdA.
+ MarkerGraphVertexId vertexIdB; // The target vertex of marker graph edge edgeIdA.
+
+ void checkAssumptions() const;
+
+
+
+ // A class used to store information about a marker of
+ // an oriented read used in this assembly.
+ // The ordinal and position are stored signed to facilitate manipulations
+ // that involve subtractions.
+ class MarkerInfo {
+ public:
+ int64_t ordinal;
+ int64_t position;
+ KmerId kmerId;
+
+ // An id for this marker, global to the LocalAssembly.
+ // This is the index of this marker in the disjoint sets data structure.
+ uint64_t id;
+
+ // The id of the disjoint set this MarkerInfo belongs to.
+ uint64_t disjointSetId;
+
+ };
+
+
+
+ // Information about the portion of an oriented read used in this assembly.
+ class OrientedReadInfo {
+ public:
+ OrientedReadId orientedReadId;
+ OrientedReadInfo(OrientedReadId orientedReadId) :
+ orientedReadId(orientedReadId)
+ {}
+
+ // The ordinal of vertexIdA in this oriented read.
+ // Only initialized for oriented reads that appear in edgeIdA.
+ int64_t ordinalA = invalid<int64_t>;
+ bool isOnA() const
+ {
+ return ordinalA != invalid<int64_t>;
+ }
+
+ // The ordinal of vertexIdB in this oriented read.
+ // Only initialized for oriented reads that appear in edgeIdB.
+ int64_t ordinalB = invalid<int64_t>;
+ bool isOnB() const
+ {
+ return ordinalB != invalid<int64_t>;
+ }
+
+ // Note we are assuming that each oriented read appears once on edgeIdA, edgeIdB,
+ // and their source and target vertices.
+
+ // Order OrientedReadInfos by OrientedReadId.
+ bool operator<(const OrientedReadInfo& that) const
+ {
+ return orientedReadId < that.orientedReadId;
+ }
+
+
+ // The ordinal offset between vertexIdA and vertexIdB.
+ int64_t ordinalOffset() const
+ {
+ SHASTA_ASSERT(isOnA() and isOnB());
+ return ordinalB - ordinalA;
+ }
+
+ // Information about the markers of this read we will use in this assembly.
+ // The first one is at ordinal firstOrdinal.
+ // The last one is a ordinal lastOrdinal.
+ vector<MarkerInfo> markerInfos;
+
+ // The first and last ordinals of this oriented read used for this assembly.
+ // For reads on edgeIdA, firstOrdinal equals ordinalA.
+ // For reads on edgeIdB, lastOrdinal equals ordinalB.
+ int64_t firstOrdinal()
+ {
+ SHASTA_ASSERT(not markerInfos.empty());
+ return markerInfos.front().ordinal;
+ }
+ int64_t lastOrdinal()
+ {
+ SHASTA_ASSERT(not markerInfos.empty());
+ return markerInfos.back().ordinal;
+ }
+
+ };
+
+ // Get the base position of a marker in an oriented read
+ // given the ordinal.
+ int64_t basePosition(OrientedReadId, int64_t ordinal) const;
+
+ // For assembly, we use the union of the oriented reads
+ // that appear in edgeIdA and edgeIdB, and that have positive ordinal offset.
+ // OrientedReadInfos are stored sorted by OrientedReadId.
+ vector<OrientedReadInfo> orientedReadInfos;
+ void gatherOrientedReads(bool useA, bool useB);
+ void writeOrientedReads() const;
+ void writeOrientedReadsSequences() const;
+
+ // Estimated offset in bases between vertexIdA and vertexIdB.
+ // The estimate is done using the oriented reads that appear
+ // both in edgeIdA and edgeIdB.
+ // If the offset cannot be estimated because there are no
+ // common oriented reads between egeIdA and edgeIdB,
+ // it is set to invalid<int64_t>.
+ // In that case, or if the offset is negative,
+ // the assembly fails, which results in empty secondary sequence.
+ int64_t estimatedABOffset;
+ void estimateOffset();
+
+ // Fill in the markerInfos vector of each read.
+ void gatherMarkers(double estimatedOffsetRatio);
+ void writeMarkers();
+
+ // Add the marker at given ordinal to the i-th oriented read.
+ void addMarkerInfo(uint64_t i, int64_t ordinal);
+
+ // Compute alignments and use them to create the disjoint set data structure,
+ // from which the marker graph will be created.
+ // maxDrift is the maximum tolerated length drift of each read.
+ // Used to compute the band for banded alignments.
+ void alignAndDisjointSets(
+ uint64_t matchScore,
+ uint64_t mismatchScore,
+ uint64_t gapScore,
+ uint64_t maxSkipBases,
+ double maxDrift,
+ uint64_t minHalfBand,
+ double minScoreRatio
+ );
+
+ // This stores the markers in each disjoint set.
+ // Each marker is stored as pair(i, j)
+ // where i is the index of the OrientedReadInfo in orientedReadInfos
+ // and j is the index of the MarkerInfo in orientedReadInfo.markerInfos.
+ // Keyed by the disjoint set id (the same also stored in each marker).
+ std::map<uint64_t, vector<LocalAssemblyMarkerIndexes> > disjointSetsMap;
+
+ vector<uint64_t> disjointSetsSizeHistogram;
+
+ // Create vertices. Each disjoint set with at least minVertexCoverage markers
+ // generates a vertex.
+ // If minVertexCoverage is 0, a suitable value is computed.
+ // This returns the value of minVertexCoverage actually used.
+ uint64_t createVertices(
+ uint64_t minVertexCoverage,
+ double vertexSamplingRate); // Only used if minVertexCoverage is 0;
+ void removeVertex(vertex_descriptor);
+
+ // The disjoint sets corresponding to vertexIdA and vertexIdB.
+ // Those will always generate a vertex regardless of coverage.
+ uint64_t disjointSetIdA = invalid<uint64_t>;
+ uint64_t disjointSetIdB = invalid<uint64_t>;
+
+ // Map that gives the vertex descriptor corresponding to a disjoint set id, if any.
+ std::map<uint64_t, vertex_descriptor> vertexMap;
+
+ // Create edges by following the reads.
+ void createEdges();
+ void removeAllEdges();
+
+ // Remove strongly connected components.
+ // Returns the number of vertices removed.
+ uint64_t removeStrongComponents();
+
+ // Remove vertices that are not accessible from vertexIdA
+ // or from which vertexIdB is not accessible.
+ // Returns the number of vertices that were removed.
+ uint64_t removeInaccessibleVertices();
+
+ // Possible courses of action when a long MSA is encountered.
+ enum class LongMsaPolicy {
+ throwException,
+ assembleAtLowCoverage
+ };
+
+ // The assembly path, beginning at vertexIdA and ending at vertexIdB.
+ // This means that the sequences of edgeIdA and edgeIdB are not included.
+ vector<edge_descriptor> assemblyPath;
+ void findAssemblyPath();
+ void assembleAssemblyPathEdges(uint64_t maxMsaLength, LongMsaPolicy);
+ void assembleEdge(
+ uint64_t maxMsaLength,
+ LongMsaPolicy,
+ edge_descriptor);
+
+ // Graphviz output.
+ void writeGraph() const;
+ void writeGraph(const string& title);
+ void writeGraphviz(const string& fileName) const;
+ void writeGraphviz(ostream&) const;
+
+ void writeCoverageCharacterToHtml(uint64_t coverage) const;
+
+ // Remove all vertices and edges and clear the vertexMap and assemblyPath.
+ // All other data are left alone.
+ void clear();
+};
+
diff --git a/src/mode3-LocalAssemblyGraph.cpp b/src/mode3-LocalAssemblyGraph.cpp
deleted file mode 100644
index 7798147..0000000
--- a/src/mode3-LocalAssemblyGraph.cpp
+++ /dev/null
@@ -1,1576 +0,0 @@
-// Shasta.
-#include "mode3-LocalAssemblyGraph.hpp"
-#include "mode3-AssemblyPath.hpp"
-#include "mode3-SegmentPairInformation.hpp"
-#include "computeLayout.hpp"
-#include "html.hpp"
-#include "HttpServer.hpp"
-#include "MarkerGraph.hpp"
-#include "MurmurHash2.hpp"
-#include "writeGraph.hpp"
-using namespace shasta;
-using namespace mode3;
-
-// Boost libraries.
-#include <boost/geometry/algorithms/make.hpp>
-#include <boost/geometry/algorithms/length.hpp>
-#include <boost/graph/adjacency_list.hpp>
-#include <boost/graph/iteration_macros.hpp>
-#include <boost/graph/fruchterman_reingold.hpp>
-#include <boost/graph/random_layout.hpp>
-#include <boost/graph/topology.hpp>
-
-// Standard library.
-#include <map>
-#include <queue>
-#include "tuple.hpp"
-
-
-
-// Create the LocalAssemblyGraph using a BFS
-// that starts at the specified vertex and moves away
-// (in both directions) up to the specified distance
-mode3::LocalAssemblyGraph::LocalAssemblyGraph(
- const MarkerGraph& markerGraph,
- const AssemblyGraph& assemblyGraph,
- uint64_t startSegmentId,
- uint64_t maxDistance) :
- markerGraph(markerGraph),
- assemblyGraph(assemblyGraph),
- maxDistance(maxDistance)
-{
- LocalAssemblyGraph& localAssemblyGraph= *this;
-
- // The BFS queue.
- std::queue<uint64_t> q;
-
- // Map segments in the AssemblyGraph to vertices in
- // the LocalAssemblyGraph.
- std::map<uint64_t, vertex_descriptor> segmentMap;
-
- // Initialize the BFS.
- if(maxDistance > 0) {
- q.push(startSegmentId);
- }
- const vertex_descriptor vStart = addVertex(startSegmentId, 0);
- segmentMap.insert(make_pair(startSegmentId, vStart));
-
-
-
- // BFS.
- while(not q.empty()) {
-
- // Dequeue a segment.
- const uint64_t segmentId0 = q.front();
- q.pop();
- const vertex_descriptor v0 = segmentMap[segmentId0];
- const uint64_t distance0 = localAssemblyGraph[v0].distance;
- const uint64_t distance1 = distance0 + 1;
-
- // Loop over children.
- for(const uint64_t linkId: assemblyGraph.linksBySource[segmentId0]) {
- const mode3::AssemblyGraph::Link& link = assemblyGraph.links[linkId];
- const uint64_t segmentId1 = link.segmentId1;
- if(segmentMap.find(segmentId1) != segmentMap.end()) {
- // We already encountered this segment.
- continue;
- }
- const vertex_descriptor v1 = addVertex(segmentId1, distance1);
- segmentMap.insert(make_pair(segmentId1, v1));
- if(distance1 < maxDistance) {
- q.push(segmentId1);
- }
- }
-
- // Loop over parents.
- for(const uint64_t linkId: assemblyGraph.linksByTarget[segmentId0]) {
- const mode3::AssemblyGraph::Link& link = assemblyGraph.links[linkId];
- const uint64_t segmentId1 = link.segmentId0;
- if(segmentMap.find(segmentId1) != segmentMap.end()) {
- // We already encountered this segment.
- continue;
- }
- const vertex_descriptor v1 = addVertex(segmentId1, distance1);
- segmentMap.insert(make_pair(segmentId1, v1));
- if(distance1 < maxDistance) {
- q.push(segmentId1);
- }
- }
- }
-
-
-
- // Add the edges.
- for(const auto& p: segmentMap) {
- const uint64_t segmentId0 = p.first;
- const vertex_descriptor v0 = p.second;
-
- for(const uint64_t linkId: assemblyGraph.linksBySource[segmentId0]) {
- const mode3::AssemblyGraph::Link& link = assemblyGraph.links[linkId];
- const uint64_t segmentId1 = link.segmentId1;
- const auto it1 = segmentMap.find(segmentId1);
- if(it1 == segmentMap.end()) {
- continue;
- }
- const vertex_descriptor v1 = it1->second;
- boost::add_edge(v0, v1, LocalAssemblyGraphEdge(linkId), localAssemblyGraph);
- }
- }
-
-}
-
-
-
-mode3::LocalAssemblyGraphVertex::LocalAssemblyGraphVertex(
- uint64_t segmentId,
- uint64_t distance) :
- segmentId(segmentId),
- distance(distance)
-{
-}
-
-
-
-mode3::LocalAssemblyGraphVertex::LocalAssemblyGraphVertex() :
- segmentId(0),
- distance(0)
-{
-}
-
-
-
-mode3::LocalAssemblyGraph::vertex_descriptor mode3::LocalAssemblyGraph::addVertex(
- uint64_t segmentId,
- uint64_t distance)
-{
- return add_vertex(LocalAssemblyGraphVertex(segmentId, distance), *this);
-}
-
-
-
-void mode3::LocalAssemblyGraph::writeHtml(ostream& html, const SvgOptions& options) const
-{
- // Write the svg object.
- html << "<div style='display: inline-block; vertical-align:top'>";
- vector<mode3::AssemblyGraph::AnalyzeSubgraphClasses::Cluster> clusters;
- writeSvg(html, options, clusters);
- html << "</div>";
- addSvgDragAndZoom(html);
-
- // Side panel.
- html << "<div style='display: inline-block'>";
-
-
-
- // Highlight a segment.
- html << R"stringDelimiter(
- <script>
- function highlightSegment()
- {
- // Get the segment id from the input field.
- inputField = document.getElementById("highlightInputField");
- segmentId = inputField.value;
- inputField.value = "";
-
- // Make it dashed and wider.
- var element = document.getElementById("Segment-" + segmentId);
- var thickness = element.getAttribute("stroke-width");
- element.style.strokeDasharray = 0.2 * thickness;
- element.setAttribute("stroke-width", 2. * thickness);
- }
- </script>
- Highlight segment
- <input id=highlightInputField type=text onchange="highlightSegment()" size=10>
- )stringDelimiter";
-
-
-
- // Zoom to a segment.
- html << R"stringDelimiter(
- <script>
- function zoomToSegment()
- {
- // Get the segment id from the input field.
- inputField = document.getElementById("zoomInputField");
- segmentId = inputField.value;
- inputField.value = "";
-
- zoomToGivenSegment(segmentId);
- }
-
- function zoomToGivenSegment(segmentId)
- {
-
- // Find the bounding box and its center.
- var element = document.getElementById("Segment-" + segmentId);
- var box = element.getBBox();
- var xCenter = box.x + 0.5 * box.width;
- var yCenter = box.y + 0.5 * box.height;
-
- // Change the viewbox of the svg to be a bit larger than a square
- // containing the bounding box.
- var enlargeFactor = 5.;
- var size = enlargeFactor * Math.max(box.width, box.height);
- width = size;
- height = size;
- x = xCenter - 0.5 * size;
- y = yCenter - 0.5 * size;
- var svg = document.querySelector('svg');
- svg.setAttribute('viewBox', `${x} ${y} ${size} ${size}`);
- ratio = size / svg.getBoundingClientRect().width;
-
- }
- </script>
- <p>Zoom to segment
- <input id=zoomInputField type=text onchange="zoomToSegment()" size=10>
- )stringDelimiter";
-
-
-
- // Initial zoom to segment of interest.
- if(options.segmentColoring == "path") {
- html << "\n<script>zoomToGivenSegment(" << options.pathStart << ");</script>\n";
- }
- if(
- options.segmentColoring == "byCommonReads" or
- options.segmentColoring == "byJaccard" or
- options.segmentColoring == "byRawJaccard" or
- options.segmentColoring == "byUnexplainedFractionOnReferenceSegment" or
- options.segmentColoring == "byUnexplainedFractionOnDisplayedSegment"
- ) {
- html << "\n<script>zoomToGivenSegment(" << options.referenceSegmentId << ");</script>\n";
- }
-
-
-
- // Tables that will be automatically updated when the mouse is on a segment.
- html << R"zzz(
-<p>
-Hover on a segment to populate the tables below.
-<p>
-<table style='font-size:9'>
-<tr><th class='left'>Segment id<td id='segmentIdCell' class=centered style='width:8em'>
-<tr><th class='left'>Distance from start segment<td id='distanceCell' class=centered style='width:8em'>
-<tr><th class='left'>Path length<td id='pathLengthCell' class=centered style='width:8em'>
-<tr><th class='left'>Average edge coverage<td id='coverageCell' class=centered style='width:8em'>
-<tr><th class='left'>Cluster id<td id='clusterIdCell' class=centered style='width:8em'>
-</table>
-<p>
-Comparison of read compositions
-<p>
-<table>
-
-<tr>
-<td>
-<th>Reference<br>segment
-<th>Displayed<br>segment
-
-<tr>
-<th class='left'>Total
-<th id='totalReferenceCell'>
-<th id='totalDisplayedCell'>
-
-<tr>
-<th class='left'>Common
-<th id='commonReferenceCell'>
-<th id='commonDisplayedCell'>
-
-<tr>
-<th class='left'>Short
-<th id='shortReferenceCell'>
-<th id='shortDisplayedCell'>
-
-<tr>
-<th class='left'>Jaccard
-<th id='jaccardReferenceCell'>
-<th id='jaccardDisplayedCell'>
-
-<tr>
-<th class='left'>Raw Jaccard
-<th id='rawJaccardReferenceCell'>
-<th id='rawJaccardDisplayedCell'>
-
-<tr>
-<th class='left'>Unexplained
-<th id='unexplainedReferenceCell'>
-<th id='unexplainedDisplayedCell'>
-
-<tr>
-<th class='left'>Unexplained fraction
-<th id='unexplainedFractionReferenceCell'>
-<th id='unexplainedFractionDisplayedCell'>
-
-</table>
-
-<script>
-function onMouseEnterSegment(id, distance, pathLength, coverage, clusterId,
- totalReference, totalDisplayed,
- shortReference, shortDisplayed,
- common,
- unexplainedReference, unexplainedDisplayed)
-{
- document.getElementById('segmentIdCell').innerHTML = id;
- document.getElementById('distanceCell').innerHTML = distance;
- document.getElementById('pathLengthCell').innerHTML = pathLength;
- document.getElementById('coverageCell').innerHTML = coverage;
- if(clusterId != 18446744073709551615) {
- document.getElementById('clusterIdCell').innerHTML = clusterId;
- }
-
- document.getElementById('totalReferenceCell').innerHTML = totalReference;
- document.getElementById('totalDisplayedCell').innerHTML = totalDisplayed;
- document.getElementById('commonReferenceCell').innerHTML = common;
- document.getElementById('commonDisplayedCell').innerHTML = common;
-
- if(common > 0) {
- document.getElementById('shortReferenceCell').innerHTML = shortReference;
- document.getElementById('shortDisplayedCell').innerHTML = shortDisplayed;
- jaccard = (common / (common + unexplainedReference + unexplainedDisplayed)).toFixed(2);
- rawJaccard = (common / (totalReference + totalDisplayed - common)).toFixed(2);
- document.getElementById('jaccardReferenceCell').innerHTML = jaccard;
- document.getElementById('jaccardDisplayedCell').innerHTML = jaccard;
- document.getElementById('rawJaccardReferenceCell').innerHTML = rawJaccard;
- document.getElementById('rawJaccardDisplayedCell').innerHTML = rawJaccard;
- document.getElementById('unexplainedReferenceCell').innerHTML = unexplainedReference;
- document.getElementById('unexplainedDisplayedCell').innerHTML = unexplainedDisplayed;
- document.getElementById('unexplainedFractionReferenceCell').innerHTML =
- (unexplainedReference / (common + unexplainedReference)).toFixed(2);
- document.getElementById('unexplainedFractionDisplayedCell').innerHTML =
- (unexplainedDisplayed / (common + unexplainedDisplayed)).toFixed(2);
- }
-}
-function onMouseExitSegment()
-{
- document.getElementById('segmentIdCell').innerHTML = '';
- document.getElementById('distanceCell').innerHTML = '';
- document.getElementById('pathLengthCell').innerHTML = '';
- document.getElementById('coverageCell').innerHTML = '';
- document.getElementById('clusterIdCell').innerHTML = '';
-
- document.getElementById('totalReferenceCell').innerHTML = '';
- document.getElementById('totalDisplayedCell').innerHTML = '';
- document.getElementById('shortReferenceCell').innerHTML = '';
- document.getElementById('shortDisplayedCell').innerHTML = '';
- document.getElementById('commonReferenceCell').innerHTML = '';
- document.getElementById('commonDisplayedCell').innerHTML = '';
- document.getElementById('jaccardReferenceCell').innerHTML = '';
- document.getElementById('jaccardDisplayedCell').innerHTML = '';
- document.getElementById('unexplainedReferenceCell').innerHTML = '';
- document.getElementById('unexplainedDisplayedCell').innerHTML = '';
- document.getElementById('unexplainedFractionReferenceCell').innerHTML = '';
- document.getElementById('unexplainedFractionDisplayedCell').innerHTML = '';
-}
-</script>
- )zzz";
-
-
-
- // Change segment thickness
- html << R"stringDelimiter(
- <p><table>
- <tr><th class=left>Segment thickness<td>
- <button type='button' onClick='segmentThickness(0.1)' style='width:3em'>---</button>
- <button type='button' onClick='segmentThickness(0.5)' style='width:3em'>--</button>
- <button type='button' onClick='segmentThickness(0.8)' style='width:3em'>-</button>
- <button type='button' onClick='segmentThickness(1.25)' style='width:3em'>+</button>
- <button type='button' onClick='segmentThickness(2.)' style='width:3em'>++</button>
- <button type='button' onClick='segmentThickness(10.)' style='width:3em'>+++</button>
- <script>
- function segmentThickness(factor)
- {
- const group = document.getElementById('LocalAssemblyGraph-segments');
- descendants = group.querySelectorAll("path");
- for (let i=0; i<descendants.length; i++) {
- path = descendants[i];
- path.setAttribute('stroke-width', factor * path.getAttribute('stroke-width'));
- }
- }
- </script>
- )stringDelimiter";
-
-
-
- // Change link thickness
- html << R"stringDelimiter(
- <tr><th class=left>Link thickness<td>
- <button type='button' onClick='linkThickness(0.1)' style='width:3em'>---</button>
- <button type='button' onClick='linkThickness(0.5)' style='width:3em'>--</button>
- <button type='button' onClick='linkThickness(0.8)' style='width:3em'>-</button>
- <button type='button' onClick='linkThickness(1.25)' style='width:3em'>+</button>
- <button type='button' onClick='linkThickness(2.)' style='width:3em'>++</button>
- <button type='button' onClick='linkThickness(10.)' style='width:3em'>+++</button>
- <script>
- function linkThickness(factor)
- {
- const group1 = document.getElementById('LocalAssemblyGraph-links');
- for (let i=0; i<group1.children.length; i++) {
- group2 = group1.children[i];
- if(group2.tagName == 'g') {
- for (let j=0; j<group2.children.length; j++) {
- path = group2.children[j];
- if(path.tagName == 'path') {
- path.setAttribute('stroke-width', factor * path.getAttribute('stroke-width'));
- }
- }
- }
- }
- }
- </script>
- )stringDelimiter";
-
-
-
- // Zoom buttons.
- html << R"stringDelimiter(
- <tr title='Or use the mouse wheel.'><th class=left>Zoom<td>
- <button type='button' onClick='zoomSvg(0.1)' style='width:3em'>---</button>
- <button type='button' onClick='zoomSvg(0.5)' style='width:3em'>--</button>
- <button type='button' onClick='zoomSvg(0.8)' style='width:3em'>-</button>
- <button type='button' onClick='zoomSvg(1.25)' style='width:3em'>+</button>
- <button type='button' onClick='zoomSvg(2.)' style='width:3em'>++</button>
- <button type='button' onClick='zoomSvg(10.)' style='width:3em'>+++</button>
- </table>
- )stringDelimiter";
-
-
- // Code to display one local cluster at a time, with a button
- // to cycle through them.
- if(options.segmentColoring == "byLocalCluster") {
- html <<
- "<br>Found " << clusters.size() << " clusters. "
- "Displaying cluster <span id='currentCluster'></span>"
- "<br><button onClick='previousCluster()'>Previous<br>cluster</button>"
- "<button onClick='nextCluster()'>Next<br>cluster</button>"
- "<script>\n"
- "var clusters = [";
- for(uint64_t i=0; i<clusters.size(); i++) {
- html << "[";
- const auto & cluster = clusters[i];
- for(uint64_t j=0; j<cluster.segments.size(); j++) {
- html << cluster.segments[j].first;
- if(j != cluster.segments.size() - 1) {
- html << ",";
- }
- }
- html << "]";
- if(i != clusters.size() -1) {
- html << ",";
- }
- }
- html << "];\n";
-
- html << R"stringDelimiter(
-
- function clusterColor(clusterId)
- {
- var ratio = clusterId / clusters.length;
- return 'hsl(' + Math.round(360*ratio) + ', 85%, 70%)';
- }
-
- function highlightCluster(clusterId, color)
- {
- for(i=0; i<clusters[clusterId].length; i++) {
- segmentId = clusters[clusterId][i];
- document.getElementById("Segment-" + segmentId).style.stroke = color;
- document.getElementById("marker" + segmentId).style.fill = color;
- }
- }
- var currentCluster = 0;
- highlightCluster(currentCluster, clusterColor(currentCluster));
- document.getElementById("currentCluster").innerHTML = currentCluster;
- function nextCluster()
- {
- highlightCluster(currentCluster, "Black");
- currentCluster = currentCluster + 1;
- if(currentCluster == clusters.length) {
- currentCluster = 0;
- }
- highlightCluster(currentCluster, clusterColor(currentCluster));
- document.getElementById("currentCluster").innerHTML = currentCluster;
- }
- function previousCluster()
- {
- highlightCluster(currentCluster, "Black");
- if(currentCluster == 0) {
- currentCluster = clusters.length;
- }
- currentCluster = currentCluster - 1;
- highlightCluster(currentCluster, clusterColor(currentCluster));
- document.getElementById("currentCluster").innerHTML = currentCluster;
- }
- </script>
-
- )stringDelimiter";
- }
-
- // End of side panel.
- html << "</div>";
-
-}
-
-
-
-void mode3::LocalAssemblyGraph::writeSvg(
- const string& fileName,
- const SvgOptions& options,
- vector<mode3::AssemblyGraph::AnalyzeSubgraphClasses::Cluster>& clusters) const
-{
- ofstream svg(fileName);
- writeSvg(svg, options, clusters);
-}
-void mode3::LocalAssemblyGraph::writeSvg(
- ostream& svg,
- const SvgOptions& options,
- vector<mode3::AssemblyGraph::AnalyzeSubgraphClasses::Cluster>& clusters
- ) const
-{
- const LocalAssemblyGraph& localAssemblyGraph = *this;
-
-
- // If necessary, compute a map containing a SegmentPairInformation object
- // containing pair information between the reference segment
- // and each segment in the local assembly graph.
- const bool doSegmentPairComputations = true;
- std::map<vertex_descriptor, SegmentPairInformation> segmentPairInformationTable;
- mode3::AssemblyGraph::SegmentOrientedReadInformation referenceSegmentInfo;
- if(doSegmentPairComputations) {
-
- // Find oriented reads in the reference segment.
- assemblyGraph.getOrientedReadsOnSegment(options.referenceSegmentId, referenceSegmentInfo);
-
- // Loop over segments in the localAssemblyGraph.
- BGL_FORALL_VERTICES(v, localAssemblyGraph, LocalAssemblyGraph){
- mode3::AssemblyGraph::SegmentOrientedReadInformation segmentInfo;
- assemblyGraph.getOrientedReadsOnSegment(
- localAssemblyGraph[v].segmentId, segmentInfo);
-
- SegmentPairInformation segmentPairInformation;
- assemblyGraph.analyzeSegmentPair(
- options.referenceSegmentId, localAssemblyGraph[v].segmentId,
- referenceSegmentInfo, segmentInfo,
- assemblyGraph.markers, segmentPairInformation);
-
- segmentPairInformationTable.insert(make_pair(v, segmentPairInformation));
- }
- }
-
-
- std::map<uint64_t, vector<pair<uint64_t, bool> > > pathSegments; // map(segmentId, (positionsInPath, is referenceSegment)).
- AssemblyPath path;
- if(options.segmentColoring == "path") {
- if(options.pathDirection=="forward" or options.pathDirection=="backward") {
- // Forward or backward.
- assemblyGraph.createAssemblyPath(options.pathStart,
- (options.pathDirection == "forward") ? 0 : 1, path);
- if(options.pathDirection == "backward") {
- reverse(path.segments.begin(), path.segments.end());
- }
- } else {
- // Bidirectional.
- AssemblyPath forwardPath;
- AssemblyPath backwardPath;
- assemblyGraph.createAssemblyPath(options.pathStart, 0, forwardPath);
- assemblyGraph.createAssemblyPath(options.pathStart, 1, backwardPath);
- // Stitch them together, making sure not to repeat the starting segment.
- path.segments.clear();
- copy(backwardPath.segments.rbegin(), backwardPath.segments.rend(), back_inserter(path.segments));
- copy(forwardPath.segments.begin() + 1, forwardPath.segments.end(), back_inserter(path.segments));
- }
- for(uint64_t position=0; position<path.segments.size(); position++) {
- const AssemblyPathSegment& segment = path.segments[position];
- const uint64_t segmentId = segment.id;
- pathSegments[segmentId].push_back(make_pair(position, segment.isPrimary));
- }
- svg << "\nPath of length " << path.segments.size() << " starting at segment " << path.segments.front().id <<
- " and ending at segment " << path.segments.back().id << "<br>";
-
- ofstream csv("Path.csv");
- csv << "Position,SegmentId,Reference\n";
- for(uint64_t position=0; position<path.segments.size(); position++) {
- const AssemblyPathSegment& segment = path.segments[position];
- csv << position << "," << segment.id << "," << int(segment.isPrimary) << "\n";
- }
-
- // If requested, assemble path sequence.
- if(options.assemblePathSequence) {
- path.assemble(assemblyGraph);
- }
- }
-
-
-
- // If coloring by local cluster, call mode3::AssemblyGraph::analyzeSubgraph,
- // passing as input all the segments in the LocalAssemblyGraph
- // except those at maximum distance.
- if(options.segmentColoring == "byLocalCluster") {
- vector<uint64_t> segmentIds;
- BGL_FORALL_VERTICES(v, localAssemblyGraph, LocalAssemblyGraph) {
- const LocalAssemblyGraphVertex& vertex = localAssemblyGraph[v];
- if(vertex.distance != maxDistance) {
- segmentIds.push_back(vertex.segmentId);
- }
- }
- assemblyGraph.analyzeSubgraph(segmentIds, clusters, true);
-
- }
-
-
-
- // If coloring by cluster id only some clusters, create a color map
- // for the clusters to be colored.
- std::map<uint64_t, string> clusterColorMap;
- if(options.segmentColoring == "byCluster") {
- const uint64_t clusterCount = options.clustersToBeColored.size();
- if(clusterCount > 0) {
- for(uint64_t i=0; i<clusterCount; i++) {
- const uint64_t hue = uint64_t(std::round(double(i) * 360. / double(clusterCount)));
- const string color = "hsl(" + to_string(hue) + ",100%, 50%)";
- const uint64_t clusterId = options.clustersToBeColored[i];
- clusterColorMap.insert(make_pair(clusterId, color));
- }
- }
- }
-
-
-
- using boost::geometry::add_point;
- using boost::geometry::expand;
- using boost::geometry::make_inverse;
- using boost::geometry::multiply_value;
- using boost::geometry::subtract_point;
- using Box = boost::geometry::model::box<Point>;
-
- // Compute the view box.
- Box box = make_inverse<Box>();
- BGL_FORALL_VERTICES(v, localAssemblyGraph, LocalAssemblyGraph) {
- const LocalAssemblyGraphVertex& vertex = localAssemblyGraph[v];
- SHASTA_ASSERT(vertex.position.size() >= 2);
- const Point& p1 = vertex.position.front();
- const Point& p2 = vertex.position.back();
-
- expand(box, p1);
- expand(box, p2);
- }
- Point minCorner = box.min_corner();
- Point maxCorner = box.max_corner();
-
- // Add a bit of extra space.
- Point delta = maxCorner;
- subtract_point(delta, minCorner);
- multiply_value(delta, 0.05);
- subtract_point(minCorner, delta);
- add_point(maxCorner, delta);
-
-
-
- // Figure out the required size of the viewbox.
- Point diagonal = maxCorner;
- subtract_point(diagonal, minCorner);
-
- // Begin the svg.
- const string svgId = "LocalAssemblyGraph";
- svg << "\n<svg id='" << svgId <<
- "' width='" << options.sizePixels <<
- "' height='" << options.sizePixels <<
- "' viewbox='" << minCorner.x() << " " << minCorner.y() << " " <<
- diagonal.x() << " " <<
- diagonal.y() << "'"
- " style='border-style:solid;border-color:Black;'"
- ">\n";
-
-
-
- // Write the links first, so they don't overwrite the segments.
- svg << "<g id='" << svgId << "-links'>\n";
- BGL_FORALL_EDGES(e, localAssemblyGraph, LocalAssemblyGraph) {
- const uint64_t linkId = localAssemblyGraph[e].linkId;
- const AssemblyGraph::Link& link = assemblyGraph.links[linkId];
-
- // Access the LocalAssemblyGraph vertices corresponding to
- // the two segments of this Link and extract some information
- // from them.
- const vertex_descriptor v1 = source(e, localAssemblyGraph);
- const vertex_descriptor v2 = target(e, localAssemblyGraph);
- const LocalAssemblyGraphVertex& vertex1 = localAssemblyGraph[v1];
- const LocalAssemblyGraphVertex& vertex2 = localAssemblyGraph[v2];
- const uint64_t segmentId1 = vertex1.segmentId;
- const uint64_t segmentId2 = vertex2.segmentId;
-
- // Get the positions of the ends of this link.
- SHASTA_ASSERT(vertex1.position.size() >= 2);
- SHASTA_ASSERT(vertex2.position.size() >= 2);
- const Point& p1 = vertex1.position.back();
- const Point& p2 = vertex2.position.front();
- const double length = boost::geometry::distance(p1, p2);
-
- // Get the tangents and compute the control points.
- const double controlPointDistance = 0.25 * length;
- const Point& t1 = vertex1.t2;
- const Point& t2 = vertex2.t1;
- Point q1 = t1;
- multiply_value(q1, controlPointDistance);
- add_point(q1, p1);
- Point q2 = t2;
- multiply_value(q2, controlPointDistance);
- add_point(q2, p2);
-
- const double linkThickness =
- options.minimumLinkThickness +
- options.additionalLinkThicknessPerRead * double(assemblyGraph.linkCoverage(linkId) - 1);
-
- const string dash =
- link.segmentsAreAdjacent ? "" :
- " stroke-dasharray='0 " + to_string(1.5 * linkThickness) + "'";
-
- // If the link participates in a path, color it consistently with the
- // segments is joins.
- string linkColor = options.linkColor;
- if(options.segmentColoring == "path") {
- const auto it1 = pathSegments.find(segmentId1);
- if(it1 != pathSegments.end()) {
- const auto positions1 = it1->second;
- SHASTA_ASSERT(not positions1.empty());
- const auto it2 = pathSegments.find(segmentId2);
- if(it2 != pathSegments.end()) {
- const auto positions2 = it2->second;
- SHASTA_ASSERT(not positions2.empty());
- if(positions1.size()==1 and positions2.size()==1) {
- const uint64_t position1 = positions1.front().first;
- const uint64_t position2 = positions2.front().first;
- if(position2 == position1 + 1) {
- const uint32_t hue = uint32_t(
- std::round(120. * double(position1 + position2) / double(path.segments.size())));
- linkColor = "hsl(" + to_string(hue) + ",100%, 20%)";
- }
- } else {
- linkColor = "Fuchsia";
- }
- }
- }
- }
-
- svg <<
- "<g>"
- // "<a href='exploreMode3AssemblyGraphLink?linkId=" << linkId << "'>"
- "<title>"
- "Link " << linkId <<
- " from segment " << segmentId1 <<
- " to segment " << segmentId2 <<
- ", coverage " << assemblyGraph.linkCoverage(linkId) <<
- ", separation " << link.separation <<
- "</title>"
- "<path d="
- "'M " << p1.x() << " " << p1.y() <<
- " C " << q1.x() << " " << q1.y() << ", "
- << q2.x() << " " << q2.y() << ","
- << p2.x() << " " << p2.y() << "'"
- " stroke='" << linkColor << "'" <<
- dash <<
- " stroke-width='" << linkThickness << "'"
- " stroke-linecap='round'"
- " fill='transparent'"
- // " vector-effect='non-scaling-stroke'"
- " onclick='if(event.ctrlKey) {location.href=\"exploreMode3AssemblyGraphLink?linkId=" << linkId << "\";}'"
- "/>"
- // "</a>"
- "</g>\n";
-
- }
- svg << "</g>\n";
-
-
-
- // Write the segments.
- svg << "<g id='" << svgId << "-segments'>\n";
- BGL_FORALL_VERTICES(v, localAssemblyGraph, LocalAssemblyGraph) {
- const LocalAssemblyGraphVertex& vertex = localAssemblyGraph[v];
- const uint64_t distance = localAssemblyGraph[v].distance;
-
- // Get the positions of the ends of this segment.
- SHASTA_ASSERT(vertex.position.size() >= 2);
- const Point& p1 = vertex.position.front();
- const Point& p2 = vertex.position.back();
- const double length = boost::geometry::distance(p1, p2);
-
- // Get the tangents and compute the control points.
- const double controlPointDistance = 0.25 * length;
- const Point& t1 = vertex.t1;
- const Point& t2 = vertex.t2;
- Point q1 = t1;
- multiply_value(q1, -controlPointDistance);
- add_point(q1, p1);
- Point q2 = t2;
- multiply_value(q2, -controlPointDistance);
- add_point(q2, p2);
-
- const uint64_t segmentId = localAssemblyGraph[v].segmentId;
-
-
-
- // Decide the color for this segment.
- string color;
- if(distance == maxDistance) {
- color = options.segmentAtMaxDistanceColor;
- } else {
- if(options.segmentColoring == "random") {
- color = randomSegmentColor(segmentId);
- } else if(options.segmentColoring == "uniform") {
- color = options.segmentColor;
- } else if(options.segmentColoring == "byCommonReads") {
- const uint64_t commonCount = segmentPairInformationTable[v].commonCount;
- double fraction;
- if(options.greenThreshold) {
- fraction = min(1., double(commonCount) / double(options.greenThreshold));
- } else {
- fraction = double(commonCount) / double(referenceSegmentInfo.infos.size());
- }
- const uint64_t hue = uint64_t(std::round(fraction * 120.));
- color = "hsl(" + to_string(hue) + ",100%, 50%)";
- } else if(options.segmentColoring == "byJaccard") {
- const auto& pairInfo = segmentPairInformationTable[v];
- if(pairInfo.commonCount > 0) {
- const double jaccard = pairInfo.jaccard();
- const uint64_t hue = uint64_t(std::round(jaccard * 120.));
- color = "hsl(" + to_string(hue) + ",100%, 50%)";
- } else {
- color = "blue";
- }
- } else if(options.segmentColoring == "byRawJaccard") {
- const auto& pairInfo = segmentPairInformationTable[v];
- if(pairInfo.commonCount > 0) {
- const double rawJaccard = pairInfo.rawJaccard();
- const uint64_t hue = uint64_t(std::round(rawJaccard * 120.));
- color = "hsl(" + to_string(hue) + ",100%, 50%)";
- } else {
- color = "blue";
- }
- } else if(options.segmentColoring == "byUnexplainedFractionOnReferenceSegment") {
- const auto& pairInfo = segmentPairInformationTable[v];
- if(pairInfo.commonCount > 0) {
- const double fraction = 1. - pairInfo.unexplainedFraction(0);
- const uint64_t hue = uint64_t(std::round(fraction * 120.));
- color = "hsl(" + to_string(hue) + ",100%, 50%)";
- } else {
- color = "blue";
- }
- } else if(options.segmentColoring == "byUnexplainedFractionOnDisplayedSegment") {
- const auto& pairInfo = segmentPairInformationTable[v];
- if(pairInfo.commonCount > 0) {
- const double fraction = 1. - pairInfo.unexplainedFraction(1);
- const uint64_t hue = uint64_t(std::round(fraction * 120.));
- color = "hsl(" + to_string(hue) + ",100%, 50%)";
- } else {
- color = "blue";
- }
- } else if(options.segmentColoring == "byCluster") {
- const uint64_t clusterId = assemblyGraph.clusterIds[segmentId];
- if(clusterId == std::numeric_limits<uint64_t>::max()) {
- color = "Gray";
- } else {
- if(options.clustersToBeColored.empty()) {
- // We are coloring all cluster. Use a hash function to decide the color.
- const uint32_t hashValue = MurmurHash2(&clusterId, sizeof(clusterId), uint32_t(options.hashSeed));
- const uint32_t hue = hashValue % 360;
- color = "hsl(" + to_string(hue) + ",100%, 50%)";
- } else {
- // We are only coloring some segments.
- auto it = clusterColorMap.find(clusterId);
- if(it == clusterColorMap.end()) {
- color = "Black";
- } else {
- color = it->second;
- }
- }
- }
- } else if(options.segmentColoring == "path") {
- auto it = pathSegments.find(segmentId);
- if(it == pathSegments.end()) {
- color = "Black";
- } else {
- const auto positions = it->second;
- SHASTA_ASSERT(not positions.empty());
- if(positions.size() == 1) {
- const auto& p = positions.front();
- const uint64_t positionInPath = p.first;
- const bool isReferenceSegment = p.second;
- const uint32_t hue = uint32_t(
- std::round(240. * double(positionInPath) / double(path.segments.size())));
- color = "hsl(" + to_string(hue) + ",100%, " + (isReferenceSegment ? "40%" : "70%") + ")";
- } else {
- // This segment appears more than once on the path.
- color = "Fuchsia";
- }
- }
- } else {
- color = "Black";
- }
- }
-
-
-
- // Get the oriented reads and average edge coverage.
- vector<OrientedReadId> orientedReadIds;
- const double averageEdgeCoverage = assemblyGraph.findOrientedReadsOnSegment(segmentId, orientedReadIds);
-
- // Create a marker to show the arrow for this segment.
- const string arrowMarkerName = "arrow" + to_string(segmentId);
- svg <<
- "<defs>\n"
- "<marker id='" << arrowMarkerName <<
- "' viewBox='0 0 0.6 1'\n"
- "refX='0.1' refY='0.5'\n"
- "markerUnits='strokeWidth'\n"
- "markerWidth='0.6' markerHeight='1'\n"
- "orient='auto'>\n"
- "<path id='marker" << segmentId << "' d='M 0 0 L 0.1 0 L 0.6 0.5 L 0.1 1 L 0 1 z' "
- "fill='" << color << "' "
- "/>\n"
- "</marker>\n"
- "</defs>\n";
-
- // Add this segment to the svg.
- const auto& segmentPairInfo = segmentPairInformationTable[v];
- const auto oldPrecision = svg.precision(1);
- const auto oldFlags = svg.setf(std::ios_base::fixed, std::ios_base::floatfield);
-
- if(options.segmentColoring == "path") {
- svg << "<g>";
- auto it = pathSegments.find(segmentId);
- if(it != pathSegments.end()) {
- const auto positions = it->second;
- SHASTA_ASSERT(not positions.empty());
- svg << "<title>";
- for(const auto& p: positions) {
- svg << p.first << " ";
- }
- svg << "</title>";
- }
- }
-
- /*
- svg <<
- "<g>"
- // "<a href='exploreMode3AssemblyGraphSegment?segmentId=" << segmentId << "'>"
- "<title>"
- "Segment " << segmentId <<
- ", distance from start segment " << distance <<
- ", path length " << assemblyGraph.paths.size(segmentId) <<
- ", average marker graph edge coverage " << averageEdgeCoverage <<
- ", number of distinct oriented reads " << orientedReadIds.size();
- if(doSegmentPairComputations) {
- svg << ", number of common oriented reads " << segmentPairInfo.commonOrientedReadCount <<
- " of " << referenceSegmentInfo.infos.size();
- }
- */
- svg <<
- // "</title>"
- "<path id='Segment-" << segmentId << "'"
- " onmouseenter='onMouseEnterSegment(" <<
- segmentId << "," <<
- distance << "," <<
- assemblyGraph.markerGraphPaths.size(segmentId) << "," <<
- averageEdgeCoverage << "," <<
- assemblyGraph.clusterIds[segmentId] << "," <<
- segmentPairInfo.totalCount[0] << "," <<
- segmentPairInfo.totalCount[1] << "," <<
- segmentPairInfo.shortCount[0] << "," <<
- segmentPairInfo.shortCount[1] << "," <<
- segmentPairInfo.commonCount << "," <<
- segmentPairInfo.unexplainedCount[0] << "," <<
- segmentPairInfo.unexplainedCount[1] << ")'" <<
- " onmouseleave='onMouseExitSegment()'" <<
-
-#if 0
- // Old code that displays the segment as a cubic spline.
- // This can create artifacts when the segment is very thick.
- "' d='M " <<
- p1.x() << " " << p1.y() << " C " <<
- q1.x() << " " << q1.y() << ", " <<
- q2.x() << " " << q2.y() << ", " <<
- p2.x() << " " << p2.y() << "'" <<
-#endif
-
- " d='M " <<
- p1.x() << " " << p1.y() << " L " <<
- p2.x() << " " << p2.y() << "'" <<
- " stroke='" << color << "'"
- " stroke-width='" <<
- options.minimumSegmentThickness + averageEdgeCoverage * options.additionalSegmentThicknessPerUnitCoverage << "'"
- " fill='none'"
- " marker-end='url(#" <<
- arrowMarkerName <<
- ")'"
- " onclick='if(event.ctrlKey) {"
- "location.href=\"exploreMode3AssemblyGraphSegment?segmentId=" << segmentId <<
- "&showSequence=on\";}'"
- "/>"
- // "</a>"
- // "</g>"
- "\n";
- svg.precision(oldPrecision);
- svg.flags(oldFlags);
- if(options.segmentColoring == "path") {
- svg << "</g>";
- }
- }
- svg << "</g>\n";
-
-
-
- // End the svg.
- svg << "</svg>\n";
-}
-
-
-
-void mode3::LocalAssemblyGraph::computeLayout(
- const SvgOptions& options,
- double timeout)
-{
- LocalAssemblyGraph& localAssemblyGraph = *this;
-
-
- // Create an auxiliary graph with two vertices for each segment.
- using G = boost::adjacency_list<boost::vecS, boost::vecS, boost::undirectedS>;
- G g;
- std::map<vertex_descriptor, array<G::vertex_descriptor, 2> > vertexMap;
- std::map<G::edge_descriptor, double> edgeLengthMap;
- BGL_FORALL_VERTICES(v, localAssemblyGraph, LocalAssemblyGraph) {
- const uint64_t segmentId = localAssemblyGraph[v].segmentId;
-
- const uint64_t pathLength = assemblyGraph.markerGraphPaths.size(segmentId);
- const double displayLength =
- options.minimumSegmentLength +
- double(pathLength - 1) * options.additionalSegmentLengthPerMarker;
-
- // Add the auxiliary vertices.
- array<G::vertex_descriptor, 2>& auxiliaryVertices = vertexMap[v];
- for(uint64_t i=0; i<2; i++) {
- auxiliaryVertices[i] = boost::add_vertex(g);
- }
-
- // Add the edge between these auxiliary vertices.
- G::edge_descriptor e;
- tie(e, ignore) = boost::add_edge(auxiliaryVertices[0], auxiliaryVertices[1], g);
- edgeLengthMap.insert(make_pair(e, displayLength));
- }
-
-
-
- // Add auxiliary graph edges between vertices corresponding to different
- // LocalAssemblyGraph vertices.
- BGL_FORALL_EDGES(e, localAssemblyGraph, LocalAssemblyGraph) {
- const vertex_descriptor v1 = source(e, localAssemblyGraph);
- const vertex_descriptor v2 = target(e, localAssemblyGraph);
- const LocalAssemblyGraphEdge& edge = localAssemblyGraph[e];
- const uint64_t linkId = edge.linkId;
- const AssemblyGraph::Link& link = assemblyGraph.links[linkId];
-
- double edgeLength;
- if(link.segmentsAreAdjacent) {
- edgeLength = options.minimumLinkLength;
- } else {
- const int32_t linkSeparation = max(link.separation, 0);
- edgeLength = 3. * options.minimumLinkLength + double(linkSeparation) * options.additionalLinkLengthPerMarker;
- }
- G::edge_descriptor eAuxiliary;
- tie(eAuxiliary, ignore) = add_edge(
- vertexMap[v1].back(),
- vertexMap[v2].front(),
- g);
- edgeLengthMap.insert(make_pair(eAuxiliary, edgeLength));
- }
-
-
-
- // Compute the layout of the auxiliary graph.
- std::map<G::vertex_descriptor, array<double, 2> > positionMap;
- ComputeLayoutReturnCode returnCode = ComputeLayoutReturnCode::Success;
- if(options.layoutMethod == "neato") {
- returnCode = shasta::computeLayoutGraphviz(g, "neato", timeout, positionMap, "", &edgeLengthMap);
- } else if(options.layoutMethod == "custom") {
- returnCode = shasta::computeLayoutCustom(g, edgeLengthMap, positionMap, timeout);
- } else {
- throw runtime_error("Invalid layout method specified: " + options.layoutMethod);
- }
- if(returnCode == ComputeLayoutReturnCode::Timeout) {
- throw runtime_error("Graph layout took too long. "
- "Increase the timeout or decrease the maximum distance.");
- }
- if(returnCode != ComputeLayoutReturnCode::Success) {
- throw runtime_error("Graph layout failed.");
- }
-
-
-
- // Store the layout in the vertices of the localAssemblyGraph.
- BGL_FORALL_VERTICES(v, localAssemblyGraph, LocalAssemblyGraph) {
- LocalAssemblyGraphVertex& vertex = localAssemblyGraph[v];
- vertex.position.clear();
-
- // Locate the auxiliary vertices corresponding to this segment.
- auto it = vertexMap.find(v);
- SHASTA_ASSERT(it != vertexMap.end());
- const array<G::vertex_descriptor, 2>& auxiliaryVertices = it->second;
-
- // Loop over the auxiliary vertices.
- for(const G::vertex_descriptor u: auxiliaryVertices) {
- auto jt = positionMap.find(u);
- SHASTA_ASSERT(jt != positionMap.end());
- const array<double, 2>& p = jt->second;
- vertex.position.push_back(Point(p[0], p[1]));
- }
- }
-}
-
-
-
-void LocalAssemblyGraph::computeSegmentTangents()
-{
- LocalAssemblyGraph& localAssemblyGraph = *this;
- BGL_FORALL_VERTICES(v, localAssemblyGraph, LocalAssemblyGraph) {
- computeSegmentTangents(v);
- }
-}
-
-
-
-
-void LocalAssemblyGraph::computeSegmentTangents(vertex_descriptor v0)
-{
- LocalAssemblyGraph& localAssemblyGraph = *this;
- LocalAssemblyGraphVertex& vertex0 = localAssemblyGraph[v0];
- SHASTA_ASSERT(vertex0.position.size() >= 2);
- const Point& vertex0Start = vertex0.position.front();
- const Point& vertex0End = vertex0.position.back();
-
- Point t = vertex0End;
- boost::geometry::subtract_point(t, vertex0Start);
- const double length = sqrt(t.x() * t.x() + t.y() * t.y());
- boost::geometry::multiply_value(t, 1. / length);
- vertex0.t2 = t;
- boost::geometry::multiply_value(t, -1.);
- vertex0.t1 = t;
-
-
-#if 0
- // This is used if we display segments as Bezier cubics.
-
-
- // To compute t1, average the unit vectors of the backward links.
- array<double, 2> direction = {0., 0.};
- uint64_t n = 0;
- BGL_FORALL_INEDGES(v0, e, localAssemblyGraph, LocalAssemblyGraph) {
- const vertex_descriptor v1 = source(e, localAssemblyGraph);
- LocalAssemblyGraphVertex& vertex1 = localAssemblyGraph[v1];
- SHASTA_ASSERT(vertex1.position.size() >= 2);
- const Point& vertex1Start = vertex1.position.front();
-
- const double dx = vertex1Start.x() - vertex0End.x();
- const double dy = vertex1Start.y() - vertex0End.y();
- const double d = sqrt(dx * dx + dy * dy);
- if(d == 0.) {
- continue;
- }
-
- // Accumulate the unit vector.
- ++n;
- direction[0] += dx / d;
- direction[1] += dy / d;
- }
- // Compute the average,normalized direction.
- double dLength = sqrt(direction[0] * direction[0] + direction[1] * direction[1]);
- if(dLength == 0.) {
- direction[0] = vertex0Start.x() - vertex0End.x();
- direction[1] = vertex0Start.y() - vertex0End.y();
- dLength = sqrt(direction[0] * direction[0] + direction[1] * direction[1]);
- }
- direction[0] /= dLength;
- direction[1] /= dLength;
-
- vertex0.t1.x(direction[0]);
- vertex0.t1.y(direction[1]);
-
-
-
- // To compute the second control point, q2,
- // average the unit vectors of the forward links.
- direction = {0., 0.};
- n = 0;
- BGL_FORALL_OUTEDGES(v0, e, localAssemblyGraph, LocalAssemblyGraph) {
- const vertex_descriptor v1 = target(e, localAssemblyGraph);
- LocalAssemblyGraphVertex& vertex1 = localAssemblyGraph[v1];
- SHASTA_ASSERT(vertex1.position.size() >= 2);
- const Point& vertex1Start = vertex1.position.front();
-
- const double dx = vertex1Start.x() - vertex0End.x();
- const double dy = vertex1Start.y() - vertex0End.y();
- const double d = sqrt(dx * dx + dy * dy);
- if(d == 0.) {
- continue;
- }
-
- // Accumulate the unit vector.
- ++n;
- direction[0] += dx / d;
- direction[1] += dy / d;
- }
- // Compute the average,normalized direction.
- dLength = sqrt(direction[0] * direction[0] + direction[1] * direction[1]);
- if(dLength == 0.) {
- direction[0] = vertex0End.x() - vertex0Start.x();
- direction[1] = vertex0End.y() - vertex0Start.y();
- dLength = sqrt(direction[0] * direction[0] + direction[1] * direction[1]);
- }
- direction[0] /= dLength;
- direction[1] /= dLength;
-
- vertex0.t2.x(direction[0]);
- vertex0.t2.y(direction[1]);
-#endif
-}
-
-
-
-// Return the svg color for a segment.
-string LocalAssemblyGraph::randomSegmentColor(uint64_t segmentId)
-{
- const uint32_t hue = MurmurHash2(&segmentId, sizeof(segmentId), 231) % 360;
- return "hsl(" + to_string(hue) + ",50%,50%)";
-}
-
-
-
-// Find out if the paths of two segments are consecutive.
-bool LocalAssemblyGraph::haveConsecutivePaths(
- vertex_descriptor v0,
- vertex_descriptor v1
-) const
-{
- const LocalAssemblyGraph& localAssemblyGraph = *this;
-
- const LocalAssemblyGraphVertex& vertex0 = localAssemblyGraph[v0];
- const LocalAssemblyGraphVertex& vertex1 = localAssemblyGraph[v1];
-
- const uint64_t segmentId0 = vertex0.segmentId;
- const uint64_t segmentId1 = vertex1.segmentId;
-
- const auto path0 = assemblyGraph.markerGraphPaths[segmentId0];
- const auto path1 = assemblyGraph.markerGraphPaths[segmentId1];
-
- const MarkerGraphEdgeId edgeId0 = path0.back();
- const MarkerGraphEdgeId edgeId1 = path1.front();
-
- const MarkerGraph::Edge& edge0 = markerGraph.edges[edgeId0];
- const MarkerGraph::Edge& edge1 = markerGraph.edges[edgeId1];
-
- return edge0.target == edge1.source;
-}
-
-
-
-// Return the average link separation for the Link
-// described by an edge.
-int32_t LocalAssemblyGraph::linkSeparation(edge_descriptor e) const
-{
- const LocalAssemblyGraph& localAssemblyGraph = *this;
- const uint64_t linkId = localAssemblyGraph[e].linkId;
- return assemblyGraph.links[linkId].separation;
-}
-
-
-
-// Construct the svg options from an html request.
-LocalAssemblyGraph::SvgOptions::SvgOptions(const vector<string>& request)
-{
- // The initial layout method if set to "custom" if
- // command "customLayout" is available, "neato" otherwise.
- static bool firstTime = true;
- static string layoutDefaultMethod = "neato";
- if(firstTime) {
- firstTime = false;
- const string command = "which customLayout";
- const int returnCode = system(command.c_str());
- if(returnCode == 0) {
- layoutDefaultMethod = "custom";
- }
- }
- layoutMethod = layoutDefaultMethod;
-
- HttpServer::getParameterValue(request, "sizePixels", sizePixels);
- HttpServer::getParameterValue(request, "layoutMethod", layoutMethod);
-
- // Segment length and thickness.
- HttpServer::getParameterValue(request, "minimumSegmentLength", minimumSegmentLength);
- HttpServer::getParameterValue(request, "additionalSegmentLengthPerMarker", additionalSegmentLengthPerMarker);
- HttpServer::getParameterValue(request, "minimumSegmentThickness", minimumSegmentThickness);
- HttpServer::getParameterValue(request, "additionalSegmentThicknessPerUnitCoverage", additionalSegmentThicknessPerUnitCoverage);
-
- // Segment coloring
- HttpServer::getParameterValue(request, "segmentColoring", segmentColoring);
- HttpServer::getParameterValue(request, "segmentColor", segmentColor);
- HttpServer::getParameterValue(request, "greenThreshold", greenThreshold);
- HttpServer::getParameterValue(request, "referenceSegmentId", referenceSegmentId);
- HttpServer::getParameterValue(request, "hashSeed", hashSeed);
- HttpServer::getParameterValue(request, "pathStart", pathStart);
- HttpServer::getParameterValue(request, "pathDirection", pathDirection);
-
- string clustersToBeColoredString;
- HttpServer::getParameterValue(request, "clustersToBeColored", clustersToBeColoredString);
- clustersToBeColored.clear();
- if(not clustersToBeColoredString.empty()) {
- vector<string> tokens;
- boost::algorithm::split(tokens, clustersToBeColoredString, boost::algorithm::is_any_of(","));
- for(const string& token: tokens) {
- try {
- const uint64_t clusterId =std::stoi(token);
- clustersToBeColored.push_back(clusterId);
- } catch(const std::exception&) {
- // Neglect it.
- }
- }
- }
-
- // Flag to turn on sequence assembly when coloring a path.
- string assemblePathSequenceString;
- assemblePathSequence = HttpServer::getParameterValue(request, "assemblePathSequence", assemblePathSequenceString);
-
- // Link length and thickness.
- HttpServer::getParameterValue(request, "minimumLinkLength", minimumLinkLength);
- HttpServer::getParameterValue(request, "additionalLinkLengthPerMarker", additionalLinkLengthPerMarker);
- HttpServer::getParameterValue(request, "minimumLinkThickness", minimumLinkThickness);
- HttpServer::getParameterValue(request, "additionalLinkThicknessPerRead", additionalLinkThicknessPerRead);
-}
-
-
-
-// Add rows to the html request form.
-void LocalAssemblyGraph::SvgOptions::addFormRows(ostream& html)
-{
- html <<
- "<tr>"
- "<td>Graphics size in pixels"
- "<td class=centered><input type=text name=sizePixels size=8 style='text-align:center'"
- " value='" << sizePixels <<
- "'>"
-
- "<tr>"
- "<td>Graph layout method"
- "<td class=left>"
- "<input type=radio name=layoutMethod value=neato"
- << (layoutMethod=="neato" ? " checked=checked" : "") <<
- ">Graphviz neato (slow for large graphs)<br>"
- "<input type=radio name=layoutMethod value=custom"
- << (layoutMethod=="custom" ? " checked=checked" : "") <<
- ">Custom (user-provided command <code>customLayout</code>)<br>"
-
- "<tr>"
- "<td>Segments"
- "<td class=centered>"
- "<table>"
- "<tr><td class=left>"
- "Minimum display length "
- "<td><input type=text name=minimumSegmentLength size=8 style='text-align:center'"
- " value='" << minimumSegmentLength << "'>"
- "<tr><td class=left>"
- "Additional display length per marker"
- "<td><input type=text name=additionalSegmentLengthPerMarker size=8 style='text-align:center'"
- " value='" << additionalSegmentLengthPerMarker << "'>"
- "<tr>"
- "<td class=left>Minimum thickness"
- "<td class=centered><input type=text name=minimumSegmentThickness size=8 style='text-align:center'"
- " value='" << minimumSegmentThickness <<
- "'>"
- "<tr>"
- "<td class=left>Additional thickness per unit coverage"
- "<td class=centered><input type=text name=additionalSegmentThicknessPerUnitCoverage size=8 style='text-align:center'"
- " value='" << additionalSegmentThicknessPerUnitCoverage <<
- "'>"
-
-
-
- // Segment coloring.
- "<tr>"
- "<td class = left>Color"
- "<td class=left>"
-
- // Random segment coloring.
- "<input type=radio name=segmentColoring value=random"
- << (segmentColoring=="random" ? " checked=checked" : "") <<
- ">Random<hr>"
-
- // Uniform segment coloring.
- "<input type=radio name=segmentColoring value=uniform"
- << (segmentColoring=="uniform" ? " checked=checked" : "") <<
- ">"
- "<input type=text name=segmentColor size=8 style='text-align:center'"
- " value='" << segmentColor << "'>"
- "<hr>"
-
- // Segment coloring by Jaccard similarity with the reference segment.
- "<input type=radio name=segmentColoring value=byJaccard"
- << (segmentColoring=="byJaccard" ? " checked=checked" : "") <<
- ">By Jaccard similarity with reference segment, without counting short reads"
- "<br>"
-
- // Segment coloring by raw Jaccard similarity with the reference segment.
- "<input type=radio name=segmentColoring value=byRawJaccard"
- << (segmentColoring=="byRawJaccard" ? " checked=checked" : "") <<
- ">By raw Jaccard similarity with reference segment (no special treatment of short reads)"
- "<br>"
-
- // Segment coloring by number of common reads with the reference segment.
- "<input type=radio name=segmentColoring value=byCommonReads"
- << (segmentColoring=="byCommonReads" ? " checked=checked" : "") <<
- ">By number of common supporting oriented reads with reference segment"
- "<div style='text-indent:3em'>"
- "Green if at least "
- "<input type=text name=greenThreshold size=4 style='text-align:center'"
- " value='" << greenThreshold <<
- "'>" " common reads (0 = automatic)"
- "</div>"
-
- // Segment coloring by unexplained fraction on the reference segment.
- "<input type=radio name=segmentColoring value=byUnexplainedFractionOnReferenceSegment"
- << (segmentColoring=="byUnexplainedFractionOnReferenceSegment" ? " checked=checked" : "") <<
- ">By unexplained fraction on the reference segment"
- "<br>"
-
- // Segment coloring by unexplained fraction on the displayed segment.
- "<input type=radio name=segmentColoring value=byUnexplainedFractionOnDisplayedSegment"
- << (segmentColoring=="byUnexplainedFractionOnDisplayedSegment" ? " checked=checked" : "") <<
- ">By unexplained fraction on the displayed segment"
- "<br>"
-
- "Reference segment&nbsp;<input type=text name=referenceSegmentId size=8 style='text-align:center'"
- " value='" << referenceSegmentId << "'><hr>"
-
- // Segment coloring by cluster id.
- "<input type=radio name=segmentColoring value=byCluster"
- << (segmentColoring=="byCluster" ? " checked=checked" : "") <<
- ">By cluster"
- "<br>"
- "Hash seed&nbsp;<input type=text name=hashSeed size=8 style='text-align:center'"
- " value='" << hashSeed << "'><br>"
- "Only color clusters&nbsp;<input type=text name=clustersToBeColored size=8 style='text-align:center'"
- " value='";
- for(const uint64_t clusterId: clustersToBeColored) {
- html << clusterId << ",";
- }
- html << "'><hr>"
-
- // Segment coloring by local cluster
- // (computed by analyzeSubgraph using as input only the segments at
- // distance less than maxDistance).
- "<input type=radio name=segmentColoring value=byLocalCluster"
- << (segmentColoring=="byLocalCluster" ? " checked=checked" : "") <<
- ">By local cluster"
- "<br>";
-
- // Segment coloring using a path.
- html <<
- "<hr>"
- "<input type=radio name=segmentColoring value=path"
- << (segmentColoring=="path" ? " checked=checked" : "") <<
- ">Color an assembly path"
- "<br>"
- "Start the path at segment &nbsp;<input type=text name=pathStart size=8 style='text-align:center'"
- " value='" << pathStart << "'>"
- "<br><input type=radio name=pathDirection value=forward" <<
- (pathDirection=="forward" ? " checked=checked" : "") << "> Forward"
- "<br><input type=radio name=pathDirection value=backward" <<
- (pathDirection=="backward" ? " checked=checked" : "") << "> Backward"
- "<br><input type=radio name=pathDirection value=bidirectional" <<
- (pathDirection=="bidirectional" ? " checked=checked" : "") << "> Both directions" <<
- "<br><input type=checkbox name=assemblePathSequence" <<
- (assemblePathSequence ? " checked=checked" : "") <<
- "> Assemble path sequence.";
-
-
- html << "</table>"
-
-
-
- "<tr>"
- "<td>Links"
- "<td class=centered>"
- "<table>"
- "<tr><td class=left>"
- "Minimum display length "
- "<td><input type=text name=minimumLinkLength size=8 style='text-align:center'"
- " value='" << minimumLinkLength << "'>"
- "<tr><td class=left>"
- "Additional display length per marker"
- "<td><input type=text name=additionalLinkLengthPerMarker size=8 style='text-align:center'"
- " value='" << additionalLinkLengthPerMarker << "'>"
- "<tr>"
- "<td class=left>Minimum thickness"
- "<td class=centered><input type=text name=minimumLinkThickness size=8 style='text-align:center'"
- " value='" << minimumLinkThickness <<
- "'>"
- "<tr>"
- "<td class=left>Additional thickness per read"
- "<td class=centered><input type=text name=additionalLinkThicknessPerRead size=8 style='text-align:center'"
- " value='" << additionalLinkThicknessPerRead <<
- "'>"
- "</table>"
-
- "</table>"
-
-
- ;
-
-}
-
-
-
-// Return true if there were no changes in the options
-// that affect graph layout changed, compared to another
-// SvgOptions object.
-bool LocalAssemblyGraph::SvgOptions::hasSameLayoutOptions(const SvgOptions& that) const
-{
- return
- (layoutMethod == that.layoutMethod) and
- (minimumSegmentLength == that.minimumSegmentLength) and
- (additionalSegmentLengthPerMarker == that.additionalSegmentLengthPerMarker) and
- (minimumLinkLength == that.minimumLinkLength) and
- (additionalLinkLengthPerMarker == that.additionalLinkLengthPerMarker)
- ;
-}
-
-
-
-// Write the local assembly graph in gfa format.
-void LocalAssemblyGraph::writeGfa(const string& fileName) const
-{
- ofstream gfa(fileName);
- writeGfa(gfa);
-}
-void LocalAssemblyGraph::writeGfa(ostream& gfa) const
-{
- const LocalAssemblyGraph& localAssemblyGraph = *this;
-
- // Write the header.
- gfa << "H\tVN:Z:1.0\n";
-
- // Write the segments.
- BGL_FORALL_VERTICES(v, localAssemblyGraph, LocalAssemblyGraph) {
- const uint64_t segmentId = localAssemblyGraph[v].segmentId;
- const auto path = assemblyGraph.markerGraphPaths[segmentId];
- gfa <<
- "S\t" << segmentId << "\t" <<
- "*\tLN:i:" << path.size() << "\n";
- }
-
-
- // Write the links.
- BGL_FORALL_EDGES(e, localAssemblyGraph, LocalAssemblyGraph) {
- const uint64_t linkId = localAssemblyGraph[e].linkId;
- const mode3::AssemblyGraph::Link& link = assemblyGraph.links[linkId];
- gfa << "L\t" <<
- link.segmentId0 << "\t+\t" <<
- link.segmentId1 << "\t+\t0M\n";
- }
-
-}
-
diff --git a/src/mode3-LocalAssemblyGraph.hpp b/src/mode3-LocalAssemblyGraph.hpp
deleted file mode 100644
index a42cc82..0000000
--- a/src/mode3-LocalAssemblyGraph.hpp
+++ /dev/null
@@ -1,186 +0,0 @@
-#ifndef SHASTA_MODE3_LOCAL_ASSEMBLY_GRAPH_HPP
-#define SHASTA_MODE3_LOCAL_ASSEMBLY_GRAPH_HPP
-
-// Shasta.
-#include "mode3.hpp"
-
-// Boost libraries.
-#include <boost/geometry/geometries/point_xy.hpp>
-#include <boost/geometry/algorithms/distance.hpp>
-#include <boost/geometry/arithmetic/arithmetic.hpp>
-#include <boost/graph/adjacency_list.hpp>
-
-
-
-namespace shasta {
- namespace mode3 {
-
- class LocalAssemblyGraph;
- class LocalAssemblyGraphEdge;
- class LocalAssemblyGraphVertex;
-
- using Point = boost::geometry::model::d2::point_xy<double>;
- }
-
-}
-
-
-// Classes used to display in the http server a local portion of the AssemblyGraph.
-class shasta::mode3::LocalAssemblyGraphVertex {
-public:
- uint64_t segmentId;
- uint64_t distance; // From the start vertex.
- LocalAssemblyGraphVertex(
- uint64_t segmentId,
- uint64_t distance);
- LocalAssemblyGraphVertex();
-
- // The positions of the auxiliary graph vertices corresponding
- // to this segment.
- vector<Point> position;
-
- // Unit vectors for the outward pointing tangents at the two ends of the segment.
- // The are computed as averages of the directions of the
- // incoming/outgoing links.
- // They are used to display the segment as a cubic spline.
- Point t1;
- Point t2;
-};
-
-
-
-class shasta::mode3::LocalAssemblyGraphEdge {
-public:
- uint64_t linkId;
- LocalAssemblyGraphEdge(uint64_t linkId=0) :
- linkId(linkId)
- {}
-};
-
-
-
-class shasta::mode3::LocalAssemblyGraph :
- public boost::adjacency_list<boost::listS, boost::listS, boost::bidirectionalS,
- LocalAssemblyGraphVertex, LocalAssemblyGraphEdge> {
-public:
-
- LocalAssemblyGraph(
- const MarkerGraph&,
- const AssemblyGraph&,
- uint64_t startSegmentId,
- uint64_t maxDistance);
-
- const MarkerGraph& markerGraph;
- const AssemblyGraph& assemblyGraph;
- uint64_t maxDistance;
-
- vertex_descriptor addVertex(
- uint64_t segmentId,
- uint64_t distance);
-
-
-
- class SvgOptions {
- public:
-
- double sizePixels = 600.;
- string layoutMethod;
-
-
-
- // Segment length and thickness.
-
- // The display length of a segment is computed as
- // minimumSegmentLength + (n-1) * additionalSegmentLengthPerMarker
- // where n is the path length of the segment, in markers.
- double minimumSegmentLength = 1.;
- double additionalSegmentLengthPerMarker = 0.2;
-
- // The thickness of a segment is computed as
- // minimumSegmentThickness + coverage * additionalSegmentThicknessPerUnitCoverage
- // where coverage is average marker graph edge coverage on the segment path.
- double minimumSegmentThickness = 0.3;
- double additionalSegmentThicknessPerUnitCoverage = 0.005;
-
- // Segment coloring
- string segmentColoring = "random";
- string segmentColor = "Green"; // Only used if segmentColoring is "uniform"
- uint64_t greenThreshold = 0; // Minimum number of common reads to color green (0=automatic).
- uint64_t referenceSegmentId = 0;// Only used if segmentColoring is "byCommonReads"
- uint64_t hashSeed = 0; // Only used if segmentCooring is "byClusterId"
- uint64_t pathStart = 0; // Only used is segmentColoring is "path"
- string pathDirection = "forward"; // Only used is segmentColoring is "path"
-
- // Clusters to be colored, if coloring by cluster id.
- // If empty, all clusters are colored.
- vector<uint64_t> clustersToBeColored;
-
- // Flag to turn on sequence assembly when coloring a path.
- bool assemblePathSequence = false;
-
- // Link length and thickness.
-
- // The display length of a link is computed as follows:
- // - For a link between segments that are consecutive in the marker graph:
- // linkLength = minimumLinkLength
- // - For a link between segments that are not consecutive in the marker graph:
- // linkLength = 3 * minimumLinkLength + linkSeparation * additionalLinkLengthPerMarker
- // (with the linkSeperation replaced with zero if it is negative).
- double minimumLinkLength = 1;
- double additionalLinkLengthPerMarker = 0.2;
-
- // The display thickness of a link is computed as
- // minimumLinkThickness + (n-1) * additionalSegmentLengthPerMarker
- // where n is the path length of the segment, in markers.
- double minimumLinkThickness = 0.05;
- double additionalLinkThicknessPerRead = 0.005;
-
-
-
- // Colors.
- string segmentAtMaxDistanceColor = "LightGray";
- string linkColor = "Black";
-
- // Construct the options from an html request.
- SvgOptions(const vector<string>& request);
-
- // Add rows to the html request form.
- void addFormRows(ostream& html);
-
- // Return true if there were no changes in the options
- // that affect graph layout changed, compared to another
- // SvgOptions object.
- bool hasSameLayoutOptions(const SvgOptions& that) const;
- };
- void writeHtml(ostream& html, const SvgOptions&) const;
- void writeSvg(
- const string& fileName,
- const SvgOptions&,
- vector<mode3::AssemblyGraph::AnalyzeSubgraphClasses::Cluster>&) const;
- void writeSvg(
- ostream&,
- const SvgOptions&,
- vector<mode3::AssemblyGraph::AnalyzeSubgraphClasses::Cluster>&) const;
- void computeLayout(const SvgOptions&, double timeout);
- void computeSegmentTangents();
- void computeSegmentTangents(vertex_descriptor);
-
- // Return the random svg color for a segment.
- static string randomSegmentColor(uint64_t segmentId);
-
-
-
- bool haveConsecutivePaths(
- vertex_descriptor v1,
- vertex_descriptor v2) const;
-
- // Return the average link separation for the Link
- // described by an edge.
- int32_t linkSeparation(edge_descriptor) const;
-
- // Write the local assembly graph in gfa format.
- void writeGfa(const string& fileName) const;
- void writeGfa(ostream&) const;
-};
-#endif
-
diff --git a/src/mode3-PathGraph.cpp b/src/mode3-PathGraph.cpp
deleted file mode 100644
index 200f44b..0000000
--- a/src/mode3-PathGraph.cpp
+++ /dev/null
@@ -1,1393 +0,0 @@
-// Shasta.
-#include "mode3-PathGraph.hpp"
-#include "findLinearChains.hpp"
-#include "MurmurHash2.hpp"
-#include "orderPairs.hpp"
-#include "transitiveReduction.hpp"
-using namespace shasta;
-using namespace mode3;
-
-// Boost libraries.
-#include <boost/graph/iteration_macros.hpp>
-#include <boost/graph/strong_components.hpp>
-#include <boost/icl/interval_set.hpp>
-
-// Standard library.
-#include <bitset>
-#include "fstream.hpp"
-#include "iostream.hpp"
-#include <queue>
-#include <stack>
-
-#include "MultithreadedObject.tpp"
-template class MultithreadedObject<mode3::PathGraph>;
-
-
-
-// Create the PathGraph from the AssemblyGraph.
-// Start with a single segment for each vertex
-// (that is, paths of length 1).
-PathGraph::PathGraph(const AssemblyGraph& assemblyGraph) :
- MultithreadedObject<PathGraph>(*this),
- assemblyGraph(assemblyGraph)
-{
- // HARDWIRED CONSTANTS TO BE EXPOSED WHEN CODE STABILIZES.
- const uint64_t minCoverage = 3;
- const uint64_t partitionMaxDistance = 10;
- const uint64_t minSubgraphSize = 8;
-
- // Create initial vertices from the AssemblyGraph.
- PathGraph& pathGraph = *this;
- createVertices();
-
- // Detangle iteration.
- // At the beginning of each iteration we only have vertices.
- for(uint64_t iteration=0; iteration<6; iteration++) {
-
- createEdges(minCoverage);
- cout << "The path graph at iteration " << iteration << " has " << num_vertices(pathGraph) <<
- " vertices and " << num_edges(pathGraph) << " edges." << endl;
-
- // Compute oriented read journeys.
- computeJourneys();
- // writeJourneys("PathGraphJourneys.csv");
-
- // Partition the PathGraph into subgraphs.
- partition(partitionMaxDistance, minSubgraphSize);
- writeGfa("PathGraph-" + to_string(iteration));
- writeCsvDetailed("PathGraphDetailed-" + to_string(iteration) + ".csv");
-
- // Interactive local detangling, without modifying the PathGraph.
- // Turn this on for debugging.
- while(false) {
- int64_t subgraphId;
- cout << "Enter a subgraph to detangle interactively, -1 to quit, or -2 to continue with detangle:" << endl;
- cin >> subgraphId;
- if(not cin) {
- return;
- }
- if(subgraphId == -1) {
- return;
- }
- if(subgraphId == -2) {
- break;
- }
- vector<PathGraphVertex> newVertices;
- detangleSubgraph(uint64_t(subgraphId), newVertices, true);
- cout << "Detangling subgraph " << subgraphId <<
- " generated " << newVertices.size() << " new vertices." << endl;
- }
-
- // Detangle.
- vector<PathGraphVertex> newVertices;
- detangle(newVertices);
-
- // Recreate the vertices.
- clear();
- createVertices(newVertices);
- }
-}
-
-
-
-// Initial creation of the vertices.
-// Start with a single segment for each vertex
-// (that is, paths of length 1).
-void PathGraph::createVertices() {
-
- PathGraph& pathGraph = *this;
-
-
- // Create a vertex for each segment in the AssemblyGraph.
- for(uint64_t segmentId=0; segmentId<assemblyGraph.markerGraphPaths.size(); segmentId++) {
-
- // Create the vertex.
- const vertex_descriptor v = add_vertex(pathGraph);
- PathGraphVertex& vertex = pathGraph[v];
- vertex.id = nextVertexId++;
-
- // Store the path.
- vertex.path.push_back(segmentId);
-
- // Store the AssemblyGraphJourneyInterval's.
- const span<const pair<OrientedReadId, uint64_t> > journeyInfos =
- assemblyGraph.assemblyGraphJourneyInfos[segmentId];
- for(const pair<OrientedReadId, uint64_t>& p: journeyInfos) {
- const OrientedReadId orientedReadId = p.first;
- const uint64_t position = p.second;
- AssemblyGraphJourneyInterval interval;
- interval.orientedReadId = orientedReadId;
- interval.first = position;
- interval.last = position;
- vertex.journeyIntervals.push_back(
- make_pair(interval, std::numeric_limits<uint64_t>::max()));
- }
- }
-
-}
-
-
-// Creation of vertices after a detangle iteration.
-void PathGraph::createVertices(const vector<PathGraphVertex>& newVertices)
-{
- PathGraph& pathGraph = *this;
-
- nextVertexId = 0;
- for(const PathGraphVertex& newVertex: newVertices) {
- const vertex_descriptor v = boost::add_vertex(newVertex, pathGraph);
- PathGraphVertex& vertex = pathGraph[v];
- vertex.id = nextVertexId++;
- }
-}
-
-
-
-// Recreate all edges from scratch, using only the
-// information stored in the vertices.
-void PathGraph::createEdges(uint64_t minCoverage)
-{
- PathGraph& pathGraph = *this;
-
- // Gather AssemblyGraphJourneyInterval's for all oriented reads.
- vector< vector<pair<AssemblyGraphJourneyInterval, vertex_descriptor> > >
- journeyIntervals(2 * assemblyGraph.readCount());
- BGL_FORALL_VERTICES(v, pathGraph, PathGraph) {
- for(const auto& p: pathGraph[v].journeyIntervals) {
- const AssemblyGraphJourneyInterval& interval = p.first;
- journeyIntervals[interval.orientedReadId.getValue()].push_back(
- make_pair(interval, v));
- }
- }
- for(auto& v: journeyIntervals) {
- sort(v.begin(), v.end(),
- OrderPairsByFirstOnly<AssemblyGraphJourneyInterval, vertex_descriptor>());
- }
-
- // Create the edges.
- for(const auto& orientedReadJourneyIntervals: journeyIntervals) {
-
- for(uint64_t i=1; i<orientedReadJourneyIntervals.size(); i++) {
- const vertex_descriptor v0 = orientedReadJourneyIntervals[i-1].second;
- const vertex_descriptor v1 = orientedReadJourneyIntervals[i ].second;
-
- if(v0 != v1) {
- edge_descriptor e;
- bool edgeExists = false;
- tie(e, edgeExists) = edge(v0, v1, pathGraph);
- if(not edgeExists) {
- tie(e, edgeExists) = add_edge(v0, v1, pathGraph);
- SHASTA_ASSERT(edgeExists);
- }
- ++pathGraph[e].coverage;
- }
- }
- }
-
-
-
- // Remove the low coverage edges.
- vector<edge_descriptor> edgesToBeRemoved;
- BGL_FORALL_EDGES(e, pathGraph, PathGraph) {
- if(pathGraph[e].coverage < minCoverage) {
- edgesToBeRemoved.push_back(e);
- }
- }
- for(const edge_descriptor e: edgesToBeRemoved) {
- boost::remove_edge(e, pathGraph);
- }
-}
-
-
-
-// Compute the journeys of all oriented reads in the PathGraph.
-// The journey of an oriented read in the PathGraph is
-// a sequence of vertex descriptors which is not necessarily a path.
-// Indexed by OrientedReadId::getValue();
-void PathGraph::computeJourneys()
-{
- PathGraph& pathGraph = *this;
- const ReadId readCount = ReadId(assemblyGraph.readCount());
-
- // First create, for each oriented read, a vector
- // of pairs (AssemblyGraphJourneyInterval, vertex_descriptor).
- vector< vector< pair<AssemblyGraphJourneyInterval, vertex_descriptor> > >
- journeyTable(2 * readCount);
- BGL_FORALL_VERTICES(v, pathGraph, PathGraph) {
- for(const auto& p: pathGraph[v].journeyIntervals) {
- const AssemblyGraphJourneyInterval& journeyInterval = p.first;
- journeyTable[journeyInterval.orientedReadId.getValue()].push_back(make_pair(journeyInterval, v));
- }
- }
-
- // Sort them and sanity check.
- for(vector< pair<AssemblyGraphJourneyInterval, vertex_descriptor> >& v: journeyTable) {
- sort(v.begin(), v.end());
-
- // Sanity check.
- if(v.size() > 1) {
- for(uint64_t i=1; i<v.size(); i++) {
- const AssemblyGraphJourneyInterval& previous = v[i-1].first;
- const AssemblyGraphJourneyInterval& current = v[i].first;
- SHASTA_ASSERT(previous.last < current.first);
- }
- }
- }
-
-
- // Store what we got.
- BGL_FORALL_VERTICES(v, pathGraph, PathGraph) {
- pathGraph[v].journeyIntervals.clear();
- }
- journeys.clear();
- journeys.resize(2 * readCount);
- for(ReadId readId=0; readId<readCount; readId++) {
- for(Strand strand=0; strand<2; strand++) {
- const OrientedReadId orientedReadId(readId, strand);
- const uint64_t index = orientedReadId.getValue();
- for(uint64_t position=0; position<journeyTable[index].size(); position++) {
- const auto& p = journeyTable[index][position];
- const AssemblyGraphJourneyInterval& interval = p.first;
- const vertex_descriptor v = p.second;
- journeys[index].push_back(v);
- pathGraph[v].journeyIntervals.push_back(make_pair(interval, position));
- }
- }
- }
-}
-
-
-
-void PathGraph::writeJourneys(const string& fileName) const
-{
- const PathGraph& pathGraph = *this;
- ofstream csv(fileName);
-
- // Loop over all oriented reads.
- const ReadId readCount = ReadId(assemblyGraph.readCount());
- for(ReadId readId=0; readId<readCount; readId++) {
- for(Strand strand=0; strand<2; strand++) {
- const OrientedReadId orientedReadId(readId, strand);
- csv << orientedReadId << ",";
-
- // Write the journey of this oriented read in the PathGraph.
- const auto journey = journeys[orientedReadId.getValue()];
- for(const vertex_descriptor v: journey) {
- csv << pathGraph[v].id << ",";
- }
- csv << "\n";
- }
- }
-}
-
-
-
-// Partition the PathGraph into subgraphs.
-void PathGraph::partition(
- uint64_t maxDistance,
- uint64_t minSubgraphSize)
-{
- PathGraph& pathGraph = *this;
-
- // Mark all vertices as not assigned to any partition.
- BGL_FORALL_VERTICES(v, pathGraph, PathGraph) {
- pathGraph[v].subgraphId = noSubgraph;
- }
-
- // Start at all vertices with zero in-degree,
- // plus the boundary vertices we find that way.
- vector<vertex_descriptor> boundaryVertices;
- std::stack<vertex_descriptor> s;
- BGL_FORALL_VERTICES(v, pathGraph, PathGraph) {
- if(in_degree(v, pathGraph) == 0) {
- s.push(v);
- }
- }
- uint64_t subgraphId = 0;
- while(not s.empty()) {
- const vertex_descriptor v = s.top();
- s.pop();
-
- if(pathGraph[v].subgraphId == noSubgraph) {
- partitionIteration(v, maxDistance, subgraphId++, boundaryVertices);
- for(const vertex_descriptor v: boundaryVertices) {
- s.push(v);
- }
- }
- }
-
-
-
- // In exceptional cases, the above procedure might not assign all
- // vertices to a subgraph.
- // This code takes care of that.
- BGL_FORALL_VERTICES(v, pathGraph, PathGraph) {
- if(pathGraph[v].subgraphId == noSubgraph) {
- partitionIteration(v, maxDistance, subgraphId++, boundaryVertices);
- }
- }
-
-
-
- // Combine small subgraphs with adjacent subgraphs, if possible.
- // This can leave subgraphs with size 0, but we don't worry about that.
- while(true) {
-
- // Gather the subgraphs based on the current settings of
- // the vertices subgraphId.
- gatherSubgraphs();
-
- // Find the small subgraphs.
- std::set<uint64_t> smallSubgraphs;
- for(uint64_t subgraphId=0; subgraphId<subgraphs.size(); subgraphId++) {
- const vector<vertex_descriptor>& subgraph = subgraphs[subgraphId];
- const uint64_t subgraphSize = subgraph.size();
- if((subgraphSize != 0) and (subgraph.size() < minSubgraphSize)) {
- smallSubgraphs.insert(subgraphId);
- }
- }
-
-
-
- // Try and merge small subgraphs with adjacent subgraphs.
-
- // Loop over small subgraphs.
- bool changesWereMade = false;
- for(uint64_t subgraphId0: smallSubgraphs) {
- const vector<vertex_descriptor>& subgraph0 = subgraphs[subgraphId0];
- const uint64_t subgraph0Size = subgraph0.size();
- SHASTA_ASSERT(subgraph0Size < minSubgraphSize);
-
- // Find adjacent subgraphs and their sizes.
- vector< pair<uint64_t, uint64_t> > adjacentSubgraphsTable; // (size, subgraphId) of adjacent.
- for(const vertex_descriptor v0: subgraph0) {
- BGL_FORALL_OUTEDGES(v0, e, pathGraph, PathGraph) {
- const vertex_descriptor v1 = target(e, pathGraph);
- const uint64_t subgraphId1 = pathGraph[v1].subgraphId;
- if(subgraphId1 != subgraphId0){
- adjacentSubgraphsTable.push_back(make_pair(subgraphs[subgraphId1].size(), subgraphId1));
- }
- }
- BGL_FORALL_INEDGES(v0, e, pathGraph, PathGraph) {
- const vertex_descriptor v1 = source(e, pathGraph);
- const uint64_t subgraphId1 = pathGraph[v1].subgraphId;
- if(subgraphId1 != subgraphId0){
- adjacentSubgraphsTable.push_back(make_pair(subgraphs[subgraphId1].size(), subgraphId1));
- }
- }
- }
- if(adjacentSubgraphsTable.empty()) {
- continue;
- }
- sort(adjacentSubgraphsTable.begin(), adjacentSubgraphsTable.end());
-
- // Merge it with the smallest adjacent subgraph.
- const uint64_t subgraphId1 = adjacentSubgraphsTable.front().second;
- smallSubgraphs.erase(subgraphId1);
- for(const vertex_descriptor v0: subgraph0) {
- pathGraph[v0].subgraphId = subgraphId1;
- }
- changesWereMade = true;
- }
-
- if(not changesWereMade) {
- break;
- }
- }
-
-
- // Sort the vertex descriptors in each subgraph.
- for(vector<vertex_descriptor>& subgraph: subgraphs) {
- sort(subgraph.begin(), subgraph.end(), PathGraphOrderVerticesById(pathGraph));
- }
-
-
-
- // Subgraph statistics.
- cout << "Partitioned the path graph into " << subgraphs.size() << " subgraphs." << endl;
- histogramSubgraphs();
-
- // Count the edges across subgraphs.
- uint64_t crossEdgeCount = 0;
- BGL_FORALL_EDGES(e, pathGraph, PathGraph) {
- const vertex_descriptor v0 = source(e, pathGraph);
- const vertex_descriptor v1 = target(e, pathGraph);
- if(pathGraph[v0].subgraphId != pathGraph[v1].subgraphId) {
- ++crossEdgeCount;
- }
- }
- cout << "Number of edges across subgraphs is " << crossEdgeCount << endl;
-}
-
-
-
-// A partition iteration does a single BFS starting at v.
-// It moves forward from v, avoiding vertices already
-// assigned to a subgraph, and up to maxDistance from v.
-// It also returns the boundaryVertices, that is the
-// vertices found in the process that are at distance maxDistance+1
-// from v and are not yet assigned to a subgraph.
-// These can then used as starting points new partition iterations.
-void PathGraph::partitionIteration(
- vertex_descriptor v,
- uint64_t maxDistance,
- uint64_t subgraphId,
- vector<vertex_descriptor>& boundaryVertices)
-{
- PathGraph& pathGraph = *this;
-
- boundaryVertices.clear();
-
- // Initialize the BFS.
- std::queue<vertex_descriptor> q;
- q.push(v);
- PathGraphVertex& vertex = pathGraph[v];
- SHASTA_ASSERT(vertex.subgraphId == noSubgraph);
- vertex.subgraphId = subgraphId;
- vertex.distance = 0;
-
- // BFS loop.
- while(not q.empty()) {
- const vertex_descriptor v0 = q.front();
- q.pop();
-
- const uint64_t distance0 = pathGraph[v0].distance;
- const uint64_t distance1 = distance0 + 1;
- SHASTA_ASSERT(distance0 <= maxDistance);
-
- // Loop over edges starting at v0.
- BGL_FORALL_OUTEDGES(v0, e01, pathGraph, PathGraph) {
- const vertex_descriptor v1 = target(e01, pathGraph);
- PathGraphVertex& vertex1 = pathGraph[v1];
-
- // If v1 is already in a subgraph, skip it.
- if(vertex1.subgraphId != noSubgraph) {
- continue;
- }
-
- // Assign v1 to this subgraph, if it is within maxDistance.
- if(distance1 <= maxDistance) {
- vertex1.subgraphId = subgraphId;
- vertex1.distance = distance1;
- }
-
- // Queue it or add it to the boundary vertices.
- if(distance1 <= maxDistance) {
- q.push(v1);
- } else {
- SHASTA_ASSERT(distance1 == maxDistance + 1);
- boundaryVertices.push_back(v1);
- }
-
- }
-
- }
-}
-
-
-
-// Gather subgraphs using the subgraphId stored in each vertex.
-void PathGraph::gatherSubgraphs()
-{
- PathGraph& pathGraph = *this;
-
- subgraphs.clear();
- BGL_FORALL_VERTICES(v, pathGraph, PathGraph) {
- const uint64_t subgraphId = pathGraph[v].subgraphId;
- SHASTA_ASSERT(subgraphId != noSubgraph);
-
- if(subgraphId >= subgraphs.size()) {
- subgraphs.resize(subgraphId + 1);
- }
-
- subgraphs[subgraphId].push_back(v);
- }
-}
-
-
-
-void PathGraph::histogramSubgraphs()
-{
- vector<uint64_t> histogram;
- for(const vector<vertex_descriptor>& subgraph: subgraphs) {
- const uint64_t subgraphSize = subgraph.size();
- if(subgraphSize >= histogram.size()) {
- histogram.resize(subgraphSize + 1, 0);
- }
- ++histogram[subgraphSize];
- }
-
- ofstream csv("PathGraphSubgraphHistogram.csv");
- csv << "Size,Frequency,Vertices\n";
- for(uint64_t subgraphSize=0; subgraphSize<histogram.size(); subgraphSize++) {
- const uint64_t frequency = histogram[subgraphSize];
- csv << subgraphSize << ",";
- csv << frequency << ",";
- csv << subgraphSize*frequency << "\n";
- }
-}
-
-
-
-
-void PathGraph::writeGfa(const string& baseName) const
-{
- const PathGraph& pathGraph = *this;
-
- // Open the gfa and write the header.
- ofstream gfa(baseName + ".gfa");
- gfa << "H\tVN:Z:1.0\n";
-
- // Open the csv and write the header.
- ofstream csv(baseName + ".csv");
- csv << "PathGraph-VertexId,Color,SubgraphId\n";
-
- // Write each vertex as a segment in the gfa.
- // Note these segments are different from assembly graph segments:
- // here each segment represents a vertex of the path graph.
- BGL_FORALL_VERTICES(v, pathGraph, PathGraph) {
- gfa <<
- "S\t" <<
- pathGraph[v].id << "\t" // Segment name
- "*" // Segment length
- "\n";
-
-
- // Color based on the subgraphId.
- const uint64_t subgraphId = pathGraph[v].subgraphId;
- string color = "LightGrey";
- if(subgraphId != noSubgraph) {
- const uint64_t r = MurmurHash2(&subgraphId, sizeof(subgraphId), 231) &255;
- const uint64_t g = MurmurHash2(&subgraphId, sizeof(subgraphId), 233) &255;
- const uint64_t b = MurmurHash2(&subgraphId, sizeof(subgraphId), 235) &255;
-
- std::ostringstream s;
- s.fill('0');
- s << "#";
- s << hex << std::setw(2) << r;
- s << hex << std::setw(2) << g;
- s << hex << std::setw(2) << b;
- color = s.str();
- }
-
- csv << pathGraph[v].id << "," << color << "," << subgraphId << "\n";
-
- }
-
- // Write each edge as a link.
- BGL_FORALL_EDGES(e, pathGraph, PathGraph) {
- const vertex_descriptor v0 = source(e, pathGraph);
- const vertex_descriptor v1 = target(e, pathGraph);
- gfa <<
- "L\t" <<
- pathGraph[v0].id << "\t+\t" <<
- pathGraph[v1].id << "\t+\t0M\n";
- }
-
-}
-
-
-
-void PathGraph::writeCsvDetailed(const string& fileName) const
-{
- const PathGraph& pathGraph = *this;
- ofstream csv(fileName);
- csv << "PathGraph-VertexId,SubgraphId,SegmentId\n";
-
- // Loop over vertices of the PathGraph.
- BGL_FORALL_VERTICES(v, pathGraph, PathGraph) {
- const PathGraphVertex& vertex = pathGraph[v];
-
- // Write the AssemblyGraph path corresponding to this vertex.
- for(const uint64_t segmentId: vertex.path) {
- csv << vertex.id << ",";
- if(vertex.subgraphId != invalid<uint64_t>) {
- csv << vertex.subgraphId;
- }
- csv << ",";
- csv << segmentId << "\n";
- }
- }
-}
-
-
-
-// Detangling of a subgraph.
-// Returns new vertices for the next detangle iteration.
-// The new vertices can only be used in a new PathGraph
-// created from scratch.
-// Only the path and journeyIntervals are filled in.
-void PathGraph::detangleSubgraph(
- uint64_t subgraphId,
- vector<PathGraphVertex>& newVertices,
- bool debug
-) const
-{
- const vector<vertex_descriptor>& subgraph = subgraphs[subgraphId];
-
- if(subgraph.empty()) {
- newVertices.clear();
- if(debug) {
- cout << "The subgraph to be detangled is empty." << endl;
- }
- return;
- }
-
- // Call the templated function appropriate for the
- // size of this subgraph. This way we use the shortest possible
- // bitmap (with size multiple of 64).
- if(subgraph.size() <= 64) {
- detangleSubgraphTemplate<64>(subgraphId, newVertices, debug);
- } else if(subgraph.size() <= 128) {
- detangleSubgraphTemplate<128>(subgraphId, newVertices, debug);
- } else if(subgraph.size() <= 192) {
- detangleSubgraphTemplate<192>(subgraphId, newVertices, debug);
- } else if(subgraph.size() <= 256) {
- detangleSubgraphTemplate<256>(subgraphId, newVertices, debug);
- } else if(subgraph.size() <= 320) {
- detangleSubgraphTemplate<320>(subgraphId, newVertices, debug);
- } else if(subgraph.size() <= 384) {
- detangleSubgraphTemplate<384>(subgraphId, newVertices, debug);
- } else if(subgraph.size() <= 448) {
- detangleSubgraphTemplate<448>(subgraphId, newVertices, debug);
- } else if(subgraph.size() <= 512) {
- detangleSubgraphTemplate<512>(subgraphId, newVertices, debug);
- } else {
- SHASTA_ASSERT(0);
- }
-}
-
-
-// This code is similar to mode3::AssemblyGraph::analyzeSubgraphTemplate
-// but it operates on a subgraph of the PathGraph, not of the AssemblyGraph.
-template<uint64_t N> void PathGraph::detangleSubgraphTemplate(
- uint64_t subgraphId,
- vector<PathGraphVertex>& newVertices,
- bool debug
-) const
-{
- // EXPOSE WHEN CODE STABILIZES.
- const double fractionThreshold = 0.05;
- const uint64_t minVertexCoverage = 6;
- const uint64_t minClusterCoverage = 6;
-
- const PathGraph& pathGraph = *this;
- const vector<vertex_descriptor>& subgraph = subgraphs[subgraphId];
-
- // The bitmap type used to store which vertices are visited
- // by each journey snippet.
- using BitVector = std::bitset<N>;
- SHASTA_ASSERT(subgraph.size() <= N);
-
- if(debug) {
- cout << "Detangling a PathGraph subgraph consisting of the following " <<
- subgraph.size() << " vertices:" << endl;
- for(const vertex_descriptor v: subgraph) {
- cout << pathGraph[v].id << " ";
- }
- cout << endl;
- }
-
- // Sanity check: we expect the vertices in the subgraph to be sorted by vertex id.
- SHASTA_ASSERT(std::is_sorted(subgraph.begin(), subgraph.end(),
- PathGraphOrderVerticesById(pathGraph)));
-
- // For vertices in the subgraph, gather triplets
- // (orientedReadId, position in path graph journey, vertex_descriptor).
- using Triplet = tuple<OrientedReadId, uint64_t, vertex_descriptor>;
- vector<Triplet> triplets;
- for(const vertex_descriptor v: subgraph) {
- const PathGraphVertex& vertex = pathGraph[v];
-
- // Loop over oriented reads that visit this vertex.
- for(const pair<AssemblyGraphJourneyInterval, uint64_t>& p: vertex.journeyIntervals) {
- const AssemblyGraphJourneyInterval& assemblyGraphJourneyInterval = p.first;
- const uint64_t position = p.second;
- const OrientedReadId orientedReadId = assemblyGraphJourneyInterval.orientedReadId;
- triplets.push_back(Triplet(orientedReadId, position, v));
- }
- }
- sort(triplets.begin(), triplets.end());
-
- // Write the triplets.
- if(debug) {
- ofstream csv("Triplets.csv");
- for(const Triplet& triplet: triplets) {
- csv << get<0>(triplet) << ",";
- csv << get<1>(triplet) << ",";
- csv << pathGraph[get<2>(triplet)].id << "\n";
- }
- }
-
-
-
- // Find streaks for the same OrientedReadId where the position
- // increases by 1 each time.
- // Each streak generates a PathGraphJourneySnippet.
- vector<PathGraphJourneySnippet> snippets;
- for(uint64_t i=0; i<triplets.size(); /* Increment later */) {
- const OrientedReadId orientedReadId = get<0>(triplets[i]);
-
- // Find this streak.
- uint64_t streakBegin = i;
- uint64_t streakEnd = streakBegin + 1;
- for(; streakEnd<triplets.size(); streakEnd++) {
- if(get<0>(triplets[streakEnd]) != orientedReadId) {
- break;
- }
- if(get<1>(triplets[streakEnd]) != get<1>(triplets[streakEnd-1]) + 1) {
- break;
- }
- }
-
- // Add a snippet.
- PathGraphJourneySnippet snippet;
- snippet.orientedReadId = orientedReadId;
- snippet.firstPosition = get<1>(triplets[streakBegin]);
- for(uint64_t j=streakBegin; j!=streakEnd; ++j) {
- snippet.vertices.push_back(get<2>(triplets[j]));
- }
- snippets.push_back(snippet);
-
- // Prepare to process the next streak.
- i = streakEnd;
- }
-
-
-
-
- // Write the snippets.
- if(debug) {
- ofstream csv("PathGraphJourneySnippets.csv");
- csv << "SnippetIndex,OrientedReadId,First position,LastPosition,Vertices\n";
- for(uint64_t snippetIndex=0; snippetIndex<snippets.size(); snippetIndex++) {
- const PathGraphJourneySnippet& snippet = snippets[snippetIndex];
- csv << snippetIndex << ",";
- csv << snippet.orientedReadId << ",";
- csv << snippet.firstPosition << ",";
- csv << snippet.lastPosition() << ",";
- for(const vertex_descriptor v: snippet.vertices) {
- csv << pathGraph[v].id << ",";
- }
- csv << "\n";
- }
- }
-
-
-
- // For each snippet, create a BitVector that describes the segments
- // the snippet visits.
- const uint64_t snippetCount = snippets.size();
- vector<BitVector> bitVectors(snippetCount);
- vector<uint64_t> bitVectorsPopCount(snippetCount); // The number of bits set in each of the bit vectors.
- for(uint64_t snippetIndex=0; snippetIndex<snippetCount; snippetIndex++) {
- const PathGraphJourneySnippet& snippet = snippets[snippetIndex];
- BitVector& bitVector = bitVectors[snippetIndex];
-
- for(const vertex_descriptor v: snippet.vertices) {
- auto it = lower_bound(subgraph.begin(), subgraph.end(), v, PathGraphOrderVerticesById(pathGraph));
- SHASTA_ASSERT(it != subgraph.end());
- SHASTA_ASSERT(*it == v);
- const uint64_t bitIndex = it - subgraph.begin();
- bitVector.set(bitIndex);
- }
- bitVectorsPopCount[snippetIndex] = bitVector.count();
- }
-
-
-
- if(debug) {
- ofstream csv("SnippetBitVector.csv");
- csv << "Snippet,OrientedReadId,";
- for(uint64_t i=0; i<subgraph.size(); i++) {
- const vertex_descriptor v = subgraph[i];
- csv << pathGraph[v].id << ",";
- }
- csv << "\n";
- for(uint64_t snippetIndex=0; snippetIndex<snippetCount; snippetIndex++) {
- const PathGraphJourneySnippet& snippet = snippets[snippetIndex];
- csv << snippetIndex << ",";
- csv << snippet.orientedReadId << ",";
- const BitVector& bitVector = bitVectors[snippetIndex];
- for(uint64_t i=0; i<subgraph.size(); i++) {
- csv << bitVector[i] << ",";
- }
- csv << "\n";
- }
- }
-
-
-
- // Create the SnippetGraph.
- // A vertex represents a set of snippets and stores
- // the corresponding snippet indexes.
- // An edge x->y is created if there is at least one snippet in y
- // that is an approximate subset of a snippet in x.
- // We express this condition as |y-x| < fractionThreshold * |y|
- // We start with one snippet per vertex.
- SnippetGraph snippetGraph;
- vector<SnippetGraph::vertex_descriptor> vertexTable;
- std::map<SnippetGraph::vertex_descriptor, uint64_t> vertexMap;
- for(uint64_t snippetIndex=0; snippetIndex<snippetCount; snippetIndex++) {
- const auto v = add_vertex(SnippetGraphVertex(snippetIndex), snippetGraph);
- vertexTable.push_back(v);
- vertexMap.insert(make_pair(v, snippetIndex));
- }
- for(uint64_t iy=0; iy<snippetCount; iy++) {
- const BitVector& y = bitVectors[iy];
- const uint64_t threshold = uint64_t(std::round(fractionThreshold * double(bitVectorsPopCount[iy])));
- const SnippetGraph::vertex_descriptor vy = vertexTable[iy];
- for(uint64_t ix=0; ix<snippetCount; ix++) {
- if(ix == iy) {
- continue;
- }
- const BitVector& x = bitVectors[ix];
-
- // Compute z = y-x.
- BitVector z = y;
- z &= (~x);
-
- if(z.count() <= threshold) {
- const SnippetGraph::vertex_descriptor vx = vertexTable[ix];
- add_edge(vx, vy, snippetGraph);
- }
- }
- }
- if(debug) {
- snippetGraph.writeGraphviz("SnippetGraph-Initial.dot");
- }
-
-
-
- // Compute strongly connected components of the SnippetGraph.
- std::map<SnippetGraph::vertex_descriptor, uint64_t> componentMap;
- const uint64_t componentCount = boost::strong_components(
- snippetGraph,
- boost::make_assoc_property_map(componentMap),
- boost::vertex_index_map(boost::make_assoc_property_map(vertexMap)));
- // cout << "Found " << componentCount << " strongly connected components." << endl;
-
- // Gather the vertices of each strongly connected component.
- vector< vector<SnippetGraph::vertex_descriptor> > components(componentCount);
- BGL_FORALL_VERTICES_T(v, snippetGraph, SnippetGraph) {
- const uint64_t componentId = componentMap[v];
- SHASTA_ASSERT(componentId < componentCount);
- components[componentId].push_back(v);
- }
- if(false) {
- cout << "Strongly connected components:\n";
- for(uint64_t componentId=0; componentId<componentCount; componentId++) {
- cout << componentId << ": ";
- for(const SnippetGraph::vertex_descriptor v: components[componentId]) {
- cout << vertexMap[v] << " ";
- }
- cout << "\n";
- }
- }
-
-
-
- // Condense the strongly connected components.
- // After this, the SnippetGraph is guaranteed to be acyclic.
- for(const vector<SnippetGraph::vertex_descriptor>& component: components) {
- if(component.size() == 1) {
- continue;
- }
-
- // Create a new vertex to represent this component.
- const auto vNew = add_vertex(snippetGraph);
- vector<uint64_t>& snippetsNew = snippetGraph[vNew].snippetIndexes;
- for(const vertex_descriptor v: component) {
- const vector<uint64_t>& snippets = snippetGraph[v].snippetIndexes;
- SHASTA_ASSERT(snippets.size() == 1);
- snippetsNew.push_back(snippets.front());
- }
-
- // Create the new edges.
- for(const vertex_descriptor v0: component) {
-
- // Out-edges.
- BGL_FORALL_OUTEDGES_T(v0, e01, snippetGraph, SnippetGraph) {
- const vertex_descriptor v1 = target(e01, snippetGraph);
- if(v1 != vNew) {
- add_edge(vNew, v1, snippetGraph);
- }
- }
-
- // In-edges.
- BGL_FORALL_INEDGES_T(v0, e10, snippetGraph, SnippetGraph) {
- const vertex_descriptor v1 = source(e10, snippetGraph);
- if(v1 != vNew) {
- add_edge(v1, vNew, snippetGraph);
- }
- }
- }
-
- // Remove the old vertices and their edges.
- for(const vertex_descriptor v: component) {
- clear_vertex(v, snippetGraph);
- remove_vertex(v, snippetGraph);
- }
- }
-
-
-
- // Compute which maximal vertices each vertex is a descendant of.
- std::map<SnippetGraph::vertex_descriptor, vector<SnippetGraph::vertex_descriptor> > ancestorMap;
- BGL_FORALL_VERTICES_T(v0, snippetGraph, SnippetGraph) {
- if(in_degree(v0, snippetGraph) != 0) {
- continue; // Not a maximal vertex.
- }
-
- // Find the descendants of this maximal vertex.
- vector<vertex_descriptor> descendants;
- snippetGraph.findDescendants(v0, descendants);
-
- // Update the ancestor map.
- for(const vertex_descriptor v1: descendants) {
- ancestorMap[v1].push_back(v0);
- }
- }
-
-
-
- // Each maximal vertex generates a cluster consisting of the vertices
- // that descend from it and from no other maximal vertex.
- // Gather the vertices in each cluster.
- std::map<SnippetGraph::vertex_descriptor, vector<SnippetGraph::vertex_descriptor> > clusterMap;
- uint64_t unclusterVertexCount = 0;
- BGL_FORALL_VERTICES_T(v1, snippetGraph, SnippetGraph) {
- const vector<SnippetGraph::vertex_descriptor>& ancestors = ancestorMap[v1];
- if(ancestors.size() == 1) {
- const vertex_descriptor v0 = ancestors.front();
- clusterMap[v0].push_back(v1);
- } else {
- ++unclusterVertexCount;
- }
- }
- if(debug or unclusterVertexCount>0) {
- cout << "Subgraph " << subgraphId << " has " << unclusterVertexCount <<
- " unclustered snippets out of " << snippetCount << " total." << endl;
- }
-
-
-
- // Gather the snippets in each cluster.
- vector<PathGraphJourneySnippetCluster> clusters;
- for(const auto& p: clusterMap) {
- const vector<SnippetGraph::vertex_descriptor>& clusterVertices = p.second;
- clusters.resize(clusters.size() + 1);
- PathGraphJourneySnippetCluster& cluster = clusters.back();
-
- vector<uint64_t> clusterSnippetIndexes; // Only used for debug output.
- for(const SnippetGraph::vertex_descriptor v: clusterVertices) {
- const vector<uint64_t>& snippetIndexes = snippetGraph[v].snippetIndexes;
- for(const uint64_t snippetIndex: snippetIndexes) {
- cluster.snippets.push_back(snippets[snippetIndex]);
- clusterSnippetIndexes.push_back(snippetIndex);
- }
- }
- cluster.constructVertices(pathGraph);
- cluster.cleanupVertices(minVertexCoverage);
- if(debug) {
- cout << "Found a cluster candidate with " <<
- clusterVertices.size() << " vertices and " <<
- cluster.snippets.size() << " snippets:" << endl;
- for(const uint64_t snippetIndex: clusterSnippetIndexes) {
- cout << snippetIndex << " ";
- }
- cout << endl;
- }
-
- // If coverage on this cluster is too low, discard it.
- if(cluster.coverage() < minClusterCoverage) {
- clusters.resize(clusters.size() - 1);
- if(debug) {
- cout << "This cluster candidate was discarded because of low coverage." << endl;
- }
- continue;
- }
-
- // This cluster will be stored and is assigned this clusterId;
- const uint64_t clusterId = clusters.size() - 1;
-
- if(debug) {
-
- cout << "This cluster was stored as cluster " << clusterId << endl;
- cout << "Vertex(coverage) for this cluster:\n";
- for(const auto& p: cluster.vertices) {
- cout << pathGraph[p.first].id << "(" << p.second << ") ";
- }
- cout << endl;
- }
-
- // Mark the vertices of this cluster.
- for(const SnippetGraph::vertex_descriptor v: clusterVertices) {
- snippetGraph[v].clusterId = clusterId;
- }
- }
- snippetGraph.clusterCount = clusters.size();
-
-
-
-
- // Write out the SnippetGraph.
- if(debug) {
- snippetGraph.writeGraphviz("SnippetGraph.dot");
- }
-
-
-
- // Find the paths of each cluster.
- // Each of these paths generates a new vertex for the next detangle iteration.
- newVertices.clear();
- if(debug) {
- cout << "Kept " << clusters.size() << " clusters." << endl;
- }
- for(uint64_t clusterId=0; clusterId<clusters.size(); clusterId++) {
- PathGraphJourneySnippetCluster& cluster = clusters[clusterId];
- vector< vector<vertex_descriptor> > paths;
- ofstream graphOut;
- if(debug) {
- graphOut.open("Cluster-" + to_string(clusterId) + ".dot");
- cout << "Finding paths generated by cluster " << clusterId << endl;
- }
- findClusterPaths(cluster, paths, debug ? &graphOut : 0, debug);
-
- // Construct the clusterSet for this cluster.
- // It is set of all pairs (orientedReadId, vertex) covered by this cluster.
- cluster.createClusterSet();
-
- // For each path, generate a new vertex for the next detangle iteration.
- for(const vector<vertex_descriptor>& path: paths) {
- newVertices.emplace_back();
- PathGraphVertex& newVertex = newVertices.back();
-
- // Construct the assembly graph path for the new vertex.
- for(const vertex_descriptor v: path) {
- const PathGraphVertex& vertex = pathGraph[v];
- copy(vertex.path.begin(), vertex.path.end(), back_inserter(newVertex.path));
- }
-
- // Intersect the clusterSet of this cluster with this path.
- std::set<vertex_descriptor> pathVertices;
- for(const vertex_descriptor v: path) {
- pathVertices.insert(v);
- }
- std::set< pair<OrientedReadId, vertex_descriptor> > pathSet;
- for(const auto& p: cluster.clusterSet) {
- if(pathVertices.contains(p.second)) {
- pathSet.insert(p);
- }
- }
-
- // Write out this pathSet.
- if(debug) {
- cout << "pathSet for this path:" << endl;
- for(const auto& p: pathSet) {
- const OrientedReadId orientedReadId = p.first;
- const vertex_descriptor v = p.second;
- const PathGraphVertex& vertex = pathGraph[v];
- for(const pair<AssemblyGraphJourneyInterval, uint64_t>& p: vertex.journeyIntervals) {
- const AssemblyGraphJourneyInterval& interval = p.first;
- if(interval.orientedReadId == orientedReadId) {
- cout << orientedReadId << " " << interval.first << " " << interval.last << endl;
- }
- }
- }
- }
-
- // Describe the pathSet as an interval map for each oriented read.
- std::map< OrientedReadId, boost::icl::interval_set<uint64_t> > pathSetMap;
- for(const auto& p: pathSet) {
- const OrientedReadId orientedReadId = p.first;
- const vertex_descriptor v = p.second;
- const PathGraphVertex& vertex = pathGraph[v];
- for(const pair<AssemblyGraphJourneyInterval, uint64_t>& p: vertex.journeyIntervals) {
- const AssemblyGraphJourneyInterval& assemblyGraphJourneyInterval = p.first;
- if(assemblyGraphJourneyInterval.orientedReadId == orientedReadId) {
- auto interval = boost::icl::interval<uint64_t>::right_open(
- assemblyGraphJourneyInterval.first,
- assemblyGraphJourneyInterval.last + 1);
- pathSetMap[orientedReadId].insert(interval);
- }
- }
- }
- if(debug) {
- cout << "pathSetMap:" << endl;
- for(const auto& p: pathSetMap) {
- const OrientedReadId orientedReadId = p.first;
- const boost::icl::interval_set<uint64_t>& intervals = p.second;
- for(const auto& interval: intervals) {
- cout << orientedReadId << " " << interval.lower() << " " << interval.upper() << endl;
- }
- }
- }
-
- // With this information we can construct the AssemblyGraphJourneyInterval's for the new vertex.
- for(const auto& p: pathSetMap) {
- const OrientedReadId orientedReadId = p.first;
- const boost::icl::interval_set<uint64_t>& intervals = p.second;
- for(const auto& interval: intervals) {
- AssemblyGraphJourneyInterval assemblyGraphJourneyInterval;
- assemblyGraphJourneyInterval.orientedReadId = orientedReadId;
- assemblyGraphJourneyInterval.first = interval.lower();
- assemblyGraphJourneyInterval.last = interval.upper() - 1;
- newVertex.journeyIntervals.push_back(make_pair(assemblyGraphJourneyInterval, invalid<uint64_t>));
- }
- }
- }
- }
-}
-
-
-
-// Detangle all the subgraphs.
-// This does not modify the PathGraph.
-// Instead, it creates vertices to be used for next detangle iteration.
-void PathGraph::detangle(vector<PathGraphVertex>& allNewVertices) const
-{
- allNewVertices.clear();
- vector<PathGraphVertex> newVertices;
- for(uint64_t subgraphId=0; subgraphId<subgraphs.size(); subgraphId++) {
- detangleSubgraph(subgraphId, newVertices, false);
- copy(newVertices.begin(), newVertices.end(), back_inserter(allNewVertices));
- }
-}
-
-
-
-// Construct a set of all pairs (orientedReadId, vertex) covered by this cluster.
-void PathGraphJourneySnippetCluster::createClusterSet()
-{
- clusterSet.clear();
- for(const PathGraphJourneySnippet& snippet: snippets) {
- for(const PathGraphBaseClass::vertex_descriptor v: snippet.vertices) {
- clusterSet.insert(make_pair(snippet.orientedReadId, v));
- }
- }
-}
-
-
-
-void SnippetGraph::findDescendants(
- const vertex_descriptor vStart,
- vector<vertex_descriptor>& descendants) const
-{
- const SnippetGraph& graph = *this;
-
- // Initialize the BFS.
- std::queue<vertex_descriptor> q;
- q.push(vStart);
- std::set<vertex_descriptor> descendantsSet;
- descendantsSet.insert(vStart);
-
- // BFS loop.
- while(not q.empty()) {
- const vertex_descriptor v0 = q.front();
- q.pop();
-
- BGL_FORALL_OUTEDGES(v0, e01, graph, SnippetGraph) {
- const vertex_descriptor v1 = target(e01, graph);
- if(descendantsSet.find(v1) == descendantsSet.end()) {
- q.push(v1);
- descendantsSet.insert(v1);
- }
- }
- }
-
- descendants.clear();
- copy(descendantsSet.begin(), descendantsSet.end(), back_inserter(descendants));
-}
-
-
-
-void SnippetGraph::writeGraphviz(
- const string& fileName) const
-{
- const SnippetGraph& graph = *this;
-
- ofstream dot(fileName);
- dot << "digraph SnippetGraph{\n"
- "node [shape=rectangle];\n";
- BGL_FORALL_VERTICES(v, graph, SnippetGraph) {
- dot << "\"" << v << "\" [label=\"";
- const vector<uint64_t>& snippetIndexes = graph[v].snippetIndexes;
- for(const uint64_t snippetIndex: snippetIndexes) {
- dot << snippetIndex;
- dot << "\\n";
- }
- dot << "\"";
- const uint64_t clusterId = graph[v].clusterId;
- if(clusterId != invalid<uint64_t>) {
- dot << " style=filled fillcolor=\"" <<
- float(clusterId)/float(clusterCount) <<
- ",0.3,1\"";
- }
- dot << "];\n";
- }
- BGL_FORALL_EDGES(e, graph, SnippetGraph) {
- const vertex_descriptor vx = source(e, graph);
- const vertex_descriptor vy = target(e, graph);
- dot << "\"" << vx << "\"->\"" << vy << "\";\n";
- }
- dot << "}\n";
-
-}
-
-
-
-vector<PathGraphBaseClass::vertex_descriptor> PathGraphJourneySnippetCluster::getVertices() const
-{
- vector<PathGraphBaseClass::vertex_descriptor> v;
- for(const auto& p: vertices) {
- v.push_back(p.first);
- }
- return v;
-}
-
-
-
-void PathGraphJourneySnippetCluster::cleanupVertices(uint64_t minVertexCoverage)
-{
- vector< pair<PathGraphBaseClass::vertex_descriptor, uint64_t > > newVertices;
- for(const auto& p: vertices) {
- if(p.second >= minVertexCoverage) {
- newVertices.push_back(p);
- }
- }
- vertices.swap(newVertices);
-}
-
-
-
-void PathGraphJourneySnippetCluster::constructVertices(const PathGraph& pathGraph)
-{
- // A map with Key=vertex_descriptor, value = coverage.
- auto vertexMap = std::map<PathGraphBaseClass::vertex_descriptor, uint64_t, PathGraphOrderVerticesById>(
- PathGraphOrderVerticesById(pathGraph));
-
- for(const PathGraphJourneySnippet& snippet: snippets) {
- for(const PathGraphBaseClass::vertex_descriptor v: snippet.vertices) {
- auto it = vertexMap.find(v);
- if(it == vertexMap.end()) {
- vertexMap.insert(make_pair(v, 1));
- } else {
- ++(it->second);
- }
- }
- }
-
- vertices.clear();
- copy(vertexMap.begin(), vertexMap.end(), back_inserter(vertices));
-}
-
-
-
-// Given a PathGraphJourneySnippetCluster, find a plausible
-// path for it in the PathGraph.
-void PathGraph::findClusterPaths(
- const PathGraphJourneySnippetCluster& cluster,
- vector< vector<vertex_descriptor> >& paths,
- ostream* graphOut,
- bool debug) const
-{
- const PathGraph& pathGraph = *this;
-
- // Map vertex descriptors to indexes in cluster.vertices.
- std::map<vertex_descriptor, uint64_t> vertexMap;
- for(uint64_t i=0; i<cluster.vertices.size(); i++) {
- const vertex_descriptor v = cluster.vertices[i].first;
- vertexMap.insert(make_pair(v, i));
- }
-
- // Construct the subgraph induced by the vertices of the cluster.
- using Subgraph = boost::adjacency_list<boost::listS, boost::vecS, boost::bidirectionalS>;
- Subgraph subgraph(vertexMap.size());
- for(const auto& p: vertexMap) {
- const vertex_descriptor v0 = p.first;
- const uint64_t i0 = p.second;
- BGL_FORALL_OUTEDGES(v0, e, pathGraph, PathGraph) {
- const vertex_descriptor v1 = target(e, pathGraph);
- const auto it = vertexMap.find(v1);
- if(it == vertexMap.end()) {
- continue;
- }
- const uint64_t i1 = it->second;
- add_edge(i0, i1, subgraph);
- }
- }
-
- // Compute strong connected components of this subgraph.
- const auto indexMap = get(boost::vertex_index, subgraph);
- vector<uint64_t> strongComponent(num_vertices(subgraph));
- boost::strong_components(
- subgraph,
- boost::make_iterator_property_map(strongComponent.begin(), indexMap));
-
- // Remove edges internal to strong components.
- vector<Subgraph::edge_descriptor> edgesToBeRemoved;
- BGL_FORALL_EDGES(e, subgraph, Subgraph) {
- const uint64_t i0 = source(e, subgraph);
- const uint64_t i1 = target(e, subgraph);
- if(strongComponent[i0] == strongComponent[i1]) {
- edgesToBeRemoved.push_back(e);
- }
- }
- for(const Subgraph::edge_descriptor e: edgesToBeRemoved) {
- boost::remove_edge(e, subgraph);
- }
-
- // Transitive reduction.
- transitiveReduction(subgraph);
-
-
- // Write it out.
- if(graphOut) {
- (*graphOut) << "digraph cluster {\n";
- for(uint64_t i=0; i<vertexMap.size(); i++) {
- const auto& p = cluster.vertices[i];
- const vertex_descriptor v = p.first;
- const uint64_t coverage = p.second;
- (*graphOut) << pathGraph[v].id;
- (*graphOut) << " [label=\"" << pathGraph[v].id << "\\n" << coverage << "\"]";
- (*graphOut) << ";\n";
- }
- BGL_FORALL_EDGES(e, subgraph, Subgraph) {
- const uint64_t i0 = source(e, subgraph);
- const uint64_t i1 = target(e, subgraph);
- const vertex_descriptor v0 = cluster.vertices[i0].first;
- const vertex_descriptor v1 = cluster.vertices[i1].first;
- (*graphOut) << pathGraph[v0].id << "->" << pathGraph[v1].id;
- (*graphOut) << ";\n";
- }
- (*graphOut) << "}\n";
-
- }
-
-
- // Find linear chains of vertices.
- vector< vector<Subgraph::vertex_descriptor> > chains;
- findLinearVertexChains(subgraph, chains);
- if(debug) {
- cout << "Found the following paths:" << endl;
- for(const vector<Subgraph::vertex_descriptor>& chain: chains) {
- for(const Subgraph::vertex_descriptor v: chain) {
- const PathGraph::vertex_descriptor u = cluster.vertices[v].first;
- cout << pathGraph[u].id << " ";
- }
- cout << endl;
- }
- }
-
- // Store a path for each chain.
- paths.clear();
- for(const vector<Subgraph::vertex_descriptor>& chain: chains) {
- vector<PathGraph::vertex_descriptor> path;
- for(const Subgraph::vertex_descriptor v: chain) {
- const PathGraph::vertex_descriptor u = cluster.vertices[v].first;
- path.push_back(u);
- }
- paths.push_back(path);
- }
-
-}
diff --git a/src/mode3-PathGraph.hpp b/src/mode3-PathGraph.hpp
deleted file mode 100644
index d5181f2..0000000
--- a/src/mode3-PathGraph.hpp
+++ /dev/null
@@ -1,286 +0,0 @@
-#ifndef SHASTA_MODE3_PATH_GRAPH_HPP
-#define SHASTA_MODE3_PATH_GRAPH_HPP
-
-/*******************************************************************************
-
-The mode3::PathGraph is a directed graph in which each vertex represents
-a path in the mode3::AssemblyGraph.
-
-*******************************************************************************/
-
-// Shasta.
-#include "mode3.hpp"
-#include "MultithreadedObject.hpp"
-
-// Boost libraries.
-#include <boost/graph/adjacency_list.hpp>
-
-// Standard libraries.
-#include <limits>
-#include "vector.hpp"
-
-namespace shasta {
- namespace mode3 {
- class PathGraph;
- class PathGraphVertex;
- class PathGraphEdge;
- class PathGraphOrderVerticesById;
- class PathGraphJourneySnippet;
- class PathGraphJourneySnippetCluster;
- class SnippetGraph;
- class SnippetGraphVertex;
-
- using PathGraphBaseClass = boost::adjacency_list<
- boost::listS,
- boost::listS,
- boost::bidirectionalS,
- PathGraphVertex, PathGraphEdge>;
- using SnippetGraphBaseClass =
- boost::adjacency_list<boost::setS, boost::listS, boost::bidirectionalS, SnippetGraphVertex>;
-
- }
-
- extern template class MultithreadedObject<mode3::PathGraph>;
-}
-
-
-
-// A PathGraphJourneySnippet describes a sequence of consecutive positions
-// of the path graph journey of an oriented read.
-// An OrientedReadId can have than more one PathGraphJourneySnippet in a given subgraph,
-// but this is not common. It can happen if the PathGraph contains a cycle.
-class shasta::mode3::PathGraphJourneySnippet {
-public:
-
- // The OrientedReadId this refers to.
- OrientedReadId orientedReadId;
-
- // The sequence of vertices encountered.
- vector<PathGraphBaseClass::vertex_descriptor> vertices;
-
- // The first and last position of this snippet
- // in the path graph journey of this OrientedReadId.
- uint64_t firstPosition;
- uint64_t lastPosition() const
- {
- return firstPosition + vertices.size() - 1;
- }
-};
-
-
-
-class shasta::mode3::PathGraphJourneySnippetCluster {
-public:
-
- // The snippets in this cluster.
- vector<PathGraphJourneySnippet> snippets;
- uint64_t coverage() const
- {
- return snippets.size();
- }
-
- // The PathGraph vertices visited by the snippets of this cluster,
- // each stored with its coverage (number of snippets);
- vector< pair<PathGraphBaseClass::vertex_descriptor, uint64_t > > vertices;
- vector<PathGraphBaseClass::vertex_descriptor> getVertices() const;
-
- // Remove vertices with coverage less than the specified value.
- void cleanupVertices(uint64_t minClusterCoverage);
-
- // Construct the vertices given the snippets.
- void constructVertices(const PathGraph&);
-
- // Construct a set of all pairs (orientedReadId, vertex) covered by this cluster.
- std::set< pair<OrientedReadId, PathGraphBaseClass::vertex_descriptor> > clusterSet;
- void createClusterSet();
-};
-
-
-
-// The SnippetGraph is used by PathGraph::detangleSubgraph.
-// A vertex represents a set of snippets and stores
-// the corresponding snippet indexes.
-// An edge x->y is created if there is at least one snippet in y
-// that is an approximate subset of a snippet in x.
-// Strongly connected components are condensed, so after that
-// the graph is guaranteed to have no cycles.
-class shasta::mode3::SnippetGraphVertex {
- public:
- vector<uint64_t> snippetIndexes;
- uint64_t clusterId = std::numeric_limits<uint64_t>::max();
- SnippetGraphVertex() {}
- SnippetGraphVertex(uint64_t snippetIndex) :
- snippetIndexes(1, snippetIndex) {}
- };
-class shasta::mode3::SnippetGraph : public SnippetGraphBaseClass {
-public:
- uint64_t clusterCount = 0;
- void findDescendants(const vertex_descriptor, vector<vertex_descriptor>&) const;
- void writeGraphviz(const string& fileName) const;
-};
-
-
-
-// Each vertex of the PathGraph describes a path
-// in the mode3::AssemblyGraph.
-class shasta::mode3::PathGraphVertex {
-public:
-
- // The segment ids of the mode3::AssemblyGraph path
- // that this vertex describes.
- vector<uint64_t> path;
-
- // We also store the assembly graph journey intervals
- // for the oriented reads that are believed to follow this path.
- // Note that an oriented read can have more than one journey interval
- // (e. g. if it goes around in a cycle).
- // The second item in the pair is the ordinal
- // of this vertex in the path graph journey of the oriented read.
- // It is filled in by computeJourneys.
- vector<pair<AssemblyGraphJourneyInterval, uint64_t> > journeyIntervals;
-
- // The vertex id is only used to help keep track of vertices
- // for testing and debugging.
- uint64_t id;
-
- // The partition this vertex was assigned to.
- uint64_t subgraphId = invalid<uint64_t>;
-
- // Distance from the start vertex of the BFS.
- // Only used during the BFS.
- uint64_t distance = 0;
-};
-
-
-
-class shasta::mode3::PathGraphEdge {
-public:
- uint64_t coverage = 0;
-};
-
-
-
-class shasta::mode3::PathGraph :
- public PathGraphBaseClass,
- public MultithreadedObject<PathGraph> {
-public:
-
- // Create the PathGraph from the AssemblyGraph.
- PathGraph(const AssemblyGraph&);
-
- // This writes a GFA representation of the PathGraph,
- // with one GFA segment per vertex.
- // It also writes an accompanying csv file that can be loaded in Bandage.
- void writeGfa(const string& baseName) const;
-
- // This writes a detailed csv file containing the path corresponding
- // to each vertex.
- void writeCsvDetailed(const string& fileName) const;
-
-private:
-
- // The AssemblyGraph this PathGraph refers to.
- const AssemblyGraph& assemblyGraph;
-
- // Initial creation of the vertices.
- // Start with a single segment for each vertex
- // (that is, paths of length 1).
- void createVertices();
-
- // Creation of vertices after a detangle iteration.
- void createVertices(const vector<PathGraphVertex>&);
-
- // Recreate all edges from scratch, using only the
- // information stored in the vertices.
- void createEdges(uint64_t minCoverage);
-
- // The id of the next vertex to be added.
- // Vertex ids are only used to help keep track of vertices
- // for testing and debugging.
- uint64_t nextVertexId = 0;
-
- // The journeys of all oriented reads in the PathGraph.
- // The journey of an oriented read in the PathGraph is
- // a sequence of vertex descriptors which is not necessarily a path.
- // Indexed by OrientedReadId::getValue();
- vector< vector<vertex_descriptor> > journeys;
- void computeJourneys();
- void writeJourneys(const string& fileName) const;
-
- // Partition the PathGraph into subgraphs.
- void partition(
- uint64_t maxDistance,
- uint64_t minSubgraphSize);
- static const uint64_t noSubgraph = std::numeric_limits<uint64_t>::max();
-
- // Gather subgraphs using the subgraphId stored in each vertex.
- // A subgraph can have size 0, and in that case it should be ignored.
- void gatherSubgraphs();
- void histogramSubgraphs();
- vector< vector<vertex_descriptor> > subgraphs;
-
- // A partition iteration does a single BFS starting at v.
- // It moves forward from v, avoiding vertices already
- // assigned to a subgraph, and up to maxDistance from v.
- // It also returns the boundaryVertices, that is the
- // vertices found in the process that are at distance maxDistance+1
- // from v and are nto yet assigned to a subgraph.
- // These can then used as starting points new partition iterations.
- void partitionIteration(
- vertex_descriptor v,
- uint64_t maxDistance,
- uint64_t subgraphId,
- vector<vertex_descriptor>& boundaryVertices);
-
-
-
- // Detangling of a subgraph.
- // Returns new vertices for the next detangle iteration.
- // The new vertices can only be used in a new PathGraph
- // created from scratch.
- void detangleSubgraph(
- uint64_t subgraphId,
- vector<PathGraphVertex>& newVertices,
- bool debug
- ) const;
- template<uint64_t N> void detangleSubgraphTemplate(
- uint64_t subgraphId,
- vector<PathGraphVertex>& newVertices,
- bool debug
- ) const;
-
- // Detangle all the subgraphs.
- // This does not modify the PathGraph.
- // Instead, it creates vertices to be used for next detangle iteration.
- void detangle(vector<PathGraphVertex>& newVertices) const;
-
- // Given a PathGraphJourneySnippetCluster, find plausible
- // paths for it in the PathGraph.
- void findClusterPaths(
- const PathGraphJourneySnippetCluster&,
- vector< vector<vertex_descriptor> >& path,
- ostream*,
- bool debug) const;
-};
-
-
-
-// Class used to order/sort PathGraph vertex descriptors
-// by increasing vertex id.
-class shasta::mode3::PathGraphOrderVerticesById {
-public:
- PathGraphOrderVerticesById(const PathGraph& pathGraph) :
- pathGraph(pathGraph) {}
- const PathGraph& pathGraph;
-
- bool operator()(
- PathGraph::vertex_descriptor v0,
- PathGraph::vertex_descriptor v1) const
- {
- return pathGraph[v0].id < pathGraph[v1].id;
- }
-};
-
-
-
-#endif
diff --git a/src/mode3-PhasedComponent.cpp b/src/mode3-PhasedComponent.cpp
new file mode 100644
index 0000000..92698cd
--- /dev/null
+++ b/src/mode3-PhasedComponent.cpp
@@ -0,0 +1,31 @@
+#include "mode3-PhasedComponent.hpp"
+#include "orderPairs.hpp"
+#include "SHASTA_ASSERT.hpp"
+using namespace shasta;
+using namespace mode3;
+
+#include "algorithm.hpp"
+#include "limits"
+
+
+
+void PhasedComponent::sort()
+{
+ SHASTA_ASSERT(size() > 1);
+ std::sort(begin(), end(), OrderPairsByFirstOnly<uint64_t, int64_t>());
+ minPositionInBubbleChain = front().first;
+ maxPositionInBubbleChain = back().first;
+}
+
+
+
+void PhasedComponent::computePositionRange()
+{
+ minPositionInBubbleChain = std::numeric_limits<uint64_t>::max();
+ maxPositionInBubbleChain = 0;
+ for(const auto& p: *this) {
+ const uint64_t positionInBubbleChain = p.first;
+ minPositionInBubbleChain = min(minPositionInBubbleChain, positionInBubbleChain);
+ maxPositionInBubbleChain = max(maxPositionInBubbleChain, positionInBubbleChain);
+ }
+}
diff --git a/src/mode3-PhasedComponent.hpp b/src/mode3-PhasedComponent.hpp
new file mode 100644
index 0000000..f4b0e87
--- /dev/null
+++ b/src/mode3-PhasedComponent.hpp
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "cstdint.hpp"
+#include "utility.hpp"
+#include "vector.hpp"
+
+namespace shasta {
+ namespace mode3 {
+ class PhasedComponent;
+ }
+}
+
+
+
+// A PhasedComponent is a set of phased diploid bubbles
+// in a BubbleChain.
+// It is a vector of (bubble position in bubble chain, phase),
+// sorted by bubble position in bubble chain.
+// The phase can be -1 or +1.
+// PhasedComponents are created in such a way that their position ranges
+// in the bubble chain are not overlapping.
+class shasta::mode3::PhasedComponent : public vector< pair<uint64_t, int64_t> > {
+public:
+ uint64_t minPositionInBubbleChain;
+ uint64_t maxPositionInBubbleChain;
+ void sort();
+ void computePositionRange();
+};
+
diff --git a/src/mode3-PhasingTable.cpp b/src/mode3-PhasingTable.cpp
new file mode 100644
index 0000000..fd1c937
--- /dev/null
+++ b/src/mode3-PhasingTable.cpp
@@ -0,0 +1,1258 @@
+// Shasta.
+
+#include "Assembler.hpp"
+#include "bits/stdint-uintn.h"
+#include "mode3-AssemblyGraph.hpp"
+#include "mode3-PhasingTable.hpp"
+#include "MarkerGraph.hpp"
+#include "MarkerInterval.hpp"
+#include "orderPairs.hpp"
+#include "PngImage.hpp"
+#include "shastaTypes.hpp"
+#include "SHASTA_ASSERT.hpp"
+using namespace shasta;
+using namespace mode3;
+
+// Boost libraries
+#include <boost/graph/iteration_macros.hpp>
+#include <boost/graph/graph_traits.hpp>
+#include <boost/multi_index/detail/bidir_node_iterator.hpp>
+#include <boost/multi_index/detail/ord_index_impl.hpp>
+#include <boost/operators.hpp>
+
+// Standard library.
+#include "algorithm.hpp"
+#include "filesystem.hpp"
+#include "iostream.hpp"
+#include <limits>
+#include <set>
+#include "stdexcept.hpp"
+#include "string.hpp"
+#include "tuple.hpp"
+#include "utility.hpp"
+#include "vector.hpp"
+
+
+
+void AssemblyGraph::writeBubbleChainsPhasingTables(
+ const string& fileNamePrefix,
+ double phaseErrorThreshold) const
+{
+ const AssemblyGraph& cGraph = *this;
+
+ const string directoryName = fileNamePrefix + "-PhasingTables";
+ if(not std::filesystem::create_directory(directoryName)) {
+ throw runtime_error("Could not create directory " + directoryName);
+ }
+
+
+ // Loop over all BubbleChains.
+ BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+ const AssemblyGraphEdge& edge = cGraph[ce];
+ const BubbleChain& bubbleChain = edge;
+
+ // Create the phasing table for this bubble chain.
+ PhasingTable phasingTable(bubbleChain, assembler.markerGraph, phaseErrorThreshold);
+
+ if(phasingTable.empty()) {
+ continue;
+ }
+ if(phasingTable.bubbleCount() < 2) {
+ continue;
+ }
+
+ cout << "Phasing table for " << bubbleChainStringId(ce) <<
+ " has " << phasingTable.entryCount() <<
+ " entries (of which " << phasingTable.ambiguousEntryCount() <<
+ " ambiguous) for " <<
+ phasingTable.bubbleCount() << " bubbles and " <<
+ phasingTable.orientedReadCount() << " oriented reads." << endl;
+
+ const string fileNamePrefix = directoryName + "/" + bubbleChainStringId(ce);
+ phasingTable.writeCsv(fileNamePrefix);
+ phasingTable.writePng(fileNamePrefix + "-RelativePhase.png",
+ PhasingTable::ColoringMethod::byRelativePhase);
+ phasingTable.writePng(fileNamePrefix + "-DiscreteRelativePhase.png",
+ PhasingTable::ColoringMethod::byDiscreteRelativePhase);
+
+ phasingTable.greedyPhasing();
+ phasingTable.writePng(fileNamePrefix + "-Consistency.png",
+ PhasingTable::ColoringMethod::byConsistency);
+
+#if 0
+ for(uint64_t i=0; i<6; i++) {
+ cout << "Discordant count before sweep " << i << " = " << phasingTable.discordantCount() << endl;
+ phasingTable.flipSweep();
+ }
+ cout << "Final discordant count = " << phasingTable.discordantCount() << endl;
+ phasingTable.writePng(directoryName + "/" + bubbleChainStringId(ce) + "-sweep.png", false);
+ phasingTable.writePng(directoryName + "/" + bubbleChainStringId(ce) + "-sweep-byType.png", true);
+#endif
+ }
+}
+
+
+PhasingTable::PhasingTable(
+ const BubbleChain& bubbleChain,
+ const MarkerGraph& markerGraph,
+ double phaseErrorThreshold)
+{
+ fill(bubbleChain, markerGraph, phaseErrorThreshold);
+ gatherOrientedReads();
+ gatherBubbles();
+ fillIndexes();
+}
+
+
+
+void PhasingTable::fill(
+ const BubbleChain& bubbleChain,
+ const MarkerGraph& markerGraph,
+ double phaseErrorThreshold)
+{
+ clear();
+
+ // Loop over the bubbles in this bubble chain.
+ for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) {
+ const mode3::Bubble& bubble = bubbleChain[positionInBubbleChain];
+
+ // If this bubble is not diploid, skip it.
+ if(not bubble.isDiploid()) {
+ continue;
+ }
+
+ // Loop over the two chains of this diploid bubble.
+ for(uint64_t chainIndexInBubble=0; chainIndexInBubble<bubble.size(); chainIndexInBubble++) {
+ SHASTA_ASSERT(chainIndexInBubble < 2);
+ const Chain& chain = bubble[chainIndexInBubble];
+
+
+ // Loop over marker graph edges of this chain, excluding the terminal ones.
+ SHASTA_ASSERT(chain.size() >= 2);
+ for(uint64_t i=1; i<chain.size()-1; i++) {
+ const MarkerGraphEdgeId markerGraphEdgeId = chain[i];
+
+ // Loop over MarkerIntervals of this marker graph edge.
+ const span<const MarkerInterval> markerIntervals = markerGraph.edgeMarkerIntervals[markerGraphEdgeId];
+ for(const MarkerInterval& markerInterval: markerIntervals) {
+ const OrientedReadId orientedReadId = markerInterval.orientedReadId;
+
+ // Access the PhasingTableEntry for this OrientedReadId and
+ // position in the bubble chain, creating it if necessary.
+ auto it = indexByBoth().find(make_tuple(orientedReadId, positionInBubbleChain));
+ if(it == indexByBoth().end()) {
+ tie(it, ignore) = insert(PhasingTableEntry(orientedReadId, positionInBubbleChain));
+ }
+ // Access it as non-const so we can update the frequency array.
+ // We can do a const_cast because we only update the frequency,
+ // which does not participate in any field used to index the PhasingTable.
+ PhasingTableEntry& entry = const_cast<PhasingTableEntry&>(*it);
+
+ // Increment the PhasingTableEntry for this OrientedReadId and positionInBubbleChain.
+ ++entry.frequency[chainIndexInBubble];
+ }
+ }
+ }
+ }
+
+ // Compute the relative phase of all PhasingTableEntries.
+ for(const PhasingTableEntry& phasingTableEntry: indexByBoth()) {
+ PhasingTableEntry& nonConstPhasingTableEntry = const_cast<PhasingTableEntry&>(phasingTableEntry);
+ nonConstPhasingTableEntry.storeRelativePhase(phaseErrorThreshold);
+ }
+}
+
+
+
+void PhasingTable::gatherOrientedReads()
+{
+
+ // Gather the distinct OrientedReadIds that appear in this PhasingTable.
+ std::set<OrientedReadId> orientedReadIds;
+ for(const PhasingTableEntry& phasingTableEntry: indexByBoth()) {
+ orientedReadIds.insert(phasingTableEntry.orientedReadId);
+ }
+
+ // Store them in the orientedReads vector.
+ orientedReads.clear();
+ for(const OrientedReadId orientedReadId: orientedReadIds) {
+ OrientedRead orientedRead;
+ orientedRead.id = orientedReadId;
+ orientedReads.push_back(orientedRead);
+ }
+
+ // Fill in the min/max positions in the bubble chain.
+ for(OrientedRead& orientedRead: orientedReads) {
+ orientedRead.minPositionInBubbleChain = std::numeric_limits<uint64_t>::max();
+ orientedRead.maxPositionInBubbleChain = 0;
+ for(auto it=indexByOrientedReadId().find(orientedRead.id);
+ it!=indexByOrientedReadId().end() and it->orientedReadId == orientedRead.id; ++it) {
+ const uint64_t positionInBubbleChain = it->positionInBubbleChain;
+ orientedRead.minPositionInBubbleChain = min(orientedRead.minPositionInBubbleChain, positionInBubbleChain);
+ orientedRead.maxPositionInBubbleChain = max(orientedRead.maxPositionInBubbleChain, positionInBubbleChain);
+ }
+ }
+
+ // Sort the orientedReads vector by average position.
+ vector< pair<uint64_t, uint64_t> > orientedReadsTable; // (index, minPosition + maxPosition)
+ for(uint64_t i=0; i<orientedReads.size(); i++) {
+ const OrientedRead& orientedRead = orientedReads[i];
+ orientedReadsTable.push_back({i, orientedRead.minPositionInBubbleChain + orientedRead.maxPositionInBubbleChain});
+ }
+ sort(orientedReadsTable.begin(), orientedReadsTable.end(),
+ OrderPairsBySecondOnly<uint64_t, uint64_t>());
+ vector<OrientedRead> sortedOrientedReads;
+ for(const auto& p: orientedReadsTable) {
+ sortedOrientedReads.push_back(orientedReads[p.first]);
+ }
+ orientedReads.swap(sortedOrientedReads);
+
+ // Fill in the orientedReadIdsMap map.
+ orientedReadsMap.clear();
+ for(uint64_t i=0; i<orientedReads.size(); i++) {
+ orientedReadsMap.insert({orientedReads[i].id, i});
+ }
+}
+
+
+
+void PhasingTable::gatherBubbles()
+{
+
+ // Gather the positions in the bubble chains of the diploid bubbles
+ // that the oriented reads appear in.
+ std::set<uint64_t> positionsInBubbleChain;
+ for(const PhasingTableEntry& phasingTableEntry: indexByBoth()) {
+ positionsInBubbleChain.insert(phasingTableEntry.positionInBubbleChain);
+ }
+
+ // Store them in the bubbles vector.
+ bubbles.clear();
+ for(const uint64_t positionInBubbleChain: positionsInBubbleChain) {
+ bubbles.push_back({positionInBubbleChain});
+ }
+
+ // Check that the bubbles are sorted by position.
+ for(uint64_t i1=1; i1<bubbles.size(); i1++) {
+ const uint64_t i0 = i1 - 1;
+ const Bubble& bubble0 = bubbles[i0];
+ const Bubble& bubble1 = bubbles[i1];
+ SHASTA_ASSERT(bubble0.positionInBubbleChain < bubble1.positionInBubbleChain);
+ }
+
+ // Fill in the bubble map.
+ bubblesMap.clear();
+ for(uint64_t i=0; i<bubbles.size(); i++) {
+ bubblesMap.insert({bubbles[i].positionInBubbleChain, i});
+ }
+
+}
+
+
+
+// Fill the orientedReadIndex and bubbleIndex in all PhasingTableEntries.
+// This can only be done after gatherOrientedReads and gatherBubbles
+// have been called.
+void PhasingTable::fillIndexes()
+{
+ for(const PhasingTableEntry& phasingTableEntry: indexByBoth()) {
+
+ // Access the PhasingTableEntry as a non-const reference.
+ // This is ok because we will not modify the fields that participate
+ // in the PhasingTable indexes.
+ PhasingTableEntry& entry = const_cast<PhasingTableEntry&>(phasingTableEntry);
+
+ entry.orientedReadIndex = orientedReadsMap[entry.orientedReadId];
+ entry.bubbleIndex = bubblesMap[entry.positionInBubbleChain];
+ }
+
+}
+
+#if 0
+
+void PhasingTable::write(const string& fileNamePrefix) const
+{
+ writeCsv(fileNamePrefix);
+ writeHtml(fileNamePrefix);
+ writePng(fileNamePrefix + ".png", true);
+}
+#endif
+
+
+
+void PhasingTable::writeCsv(const string& fileNamePrefix) const
+{
+ writeOrientedReadsCsv(fileNamePrefix);
+ writeBubblesCsv(fileNamePrefix, true);
+ writeDetailsCsv(fileNamePrefix);
+}
+
+
+
+// Write one csv row per oriented read: its id, its position range in the
+// bubble chain, its dense index, and the bubble indexes of the range endpoints.
+void PhasingTable::writeOrientedReadsCsv(const string& fileNamePrefix) const
+{
+ ofstream csv(fileNamePrefix + "-OrientedReads.csv");
+ csv << "OrientedReadId,Min position in bubble chain,Max position in bubble chain,"
+ "Oriented read index,Min bubble index,Max bubble Index,\n";
+
+ for(uint64_t i=0; i<orientedReads.size(); i++) {
+ const OrientedRead& orientedRead = orientedReads[i];
+ csv << orientedRead.id << ",";
+ csv << orientedRead.minPositionInBubbleChain << ",";
+ csv << orientedRead.maxPositionInBubbleChain << ",";
+ csv << i << ",";
+ // NOTE(review): find() result is dereferenced without an end() check;
+ // assumes min/max positions are always present in bubblesMap - confirm.
+ csv << bubblesMap.find(orientedRead.minPositionInBubbleChain)->second << ",";
+ csv << bubblesMap.find(orientedRead.maxPositionInBubbleChain)->second << ",";
+ csv << "\n";
+ }
+}
+
+
+
+// Write one csv row per bubble: position, index, ambiguous/unambiguous entry
+// counts, and (optionally) consistency counts and the resulting error rate.
+void PhasingTable::writeBubblesCsv(
+ const string& fileNamePrefix,
+ bool writePhasingInformation) const
+{
+ ofstream csv(fileNamePrefix + "-Bubbles.csv");
+ csv << "Position in bubble chain,Bubble index,Unambiguous,Ambiguous,";
+ if(writePhasingInformation) {
+ csv << "Consistent,Inconsistent,Error rate,";
+ }
+ csv << "\n";
+
+ for(uint64_t i=0; i<bubbles.size(); i++) {
+ csv << bubbles[i].positionInBubbleChain << ",";
+ csv << i << ",";
+
+ uint64_t unambiguous;
+ uint64_t ambiguous;
+ tie(unambiguous, ambiguous) = countEntriesForBubble(bubbles[i].positionInBubbleChain);
+ csv << unambiguous << ",";
+ csv << ambiguous << ",";
+
+ if(writePhasingInformation) {
+ uint64_t consistent;
+ uint64_t inconsistent;
+ tie(consistent, inconsistent) = countConsistentEntriesForBubble(bubbles[i].positionInBubbleChain);
+ csv << consistent << ",";
+ csv << inconsistent << ",";
+ // NOTE(review): when consistent+inconsistent is 0 this writes NaN
+ // (0/0) to the csv - confirm that downstream consumers tolerate it.
+ csv << double(inconsistent) / double(consistent + inconsistent) << ",";
+ }
+
+ csv << "\n";
+ }
+}
+
+
+
+// Write one csv row for every PhasingTableEntry, grouped by oriented read
+// (rows for one oriented read are contiguous, in index order).
+void PhasingTable::writeDetailsCsv(const string& fileNamePrefix) const
+{
+ ofstream csv(fileNamePrefix + "-Details.csv");
+
+ csv << "Position in bubble chain,OrientedReadId,Bubble index,Oriented read index,Frequency0,Frequency1,"
+ "Relative phase,DiscreteRelative phase\n";
+
+ for(const OrientedRead& orientedRead: orientedReads) {
+ const OrientedReadId orientedReadId = orientedRead.id;
+ // The index is ordered, so all entries for this oriented read are
+ // contiguous starting at find().
+ for(auto it=indexByOrientedReadId().find(orientedReadId);
+ it!=indexByOrientedReadId().end() and it->orientedReadId == orientedReadId; ++it) {
+ const PhasingTableEntry& phasingTableEntry = *it;
+ phasingTableEntry.writeCsv(csv);
+ csv << "\n";
+ }
+ }
+}
+
+
+
+// Write the fields of this PhasingTableEntry as one csv row
+// (no trailing newline; the caller adds it).
+// Field order must match the header written by PhasingTable::writeDetailsCsv.
+void PhasingTableEntry::writeCsv(ostream& csv) const
+{
+ csv << positionInBubbleChain << ",";
+ csv << orientedReadId << ",";
+ csv << bubbleIndex << ",";
+ csv << orientedReadIndex << ",";
+ csv << frequency[0] << ",";
+ csv << frequency[1] << ",";
+ csv << relativePhase << ",";
+ csv << discreteRelativePhase << ",";
+}
+
+
+
+// Write the PhasingTable as a png image with one pixel per
+// (bubble, oriented read) pair: x = bubble index, y = oriented read index.
+// Pixels with no PhasingTableEntry are white; the color of the others
+// depends on the requested ColoringMethod.
+void PhasingTable::writePng(const string& fileName, ColoringMethod coloringMethod) const
+{
+ // Start from an all-white image.
+ PngImage image{int(bubbleCount()), int(orientedReadCount())};
+ for(uint64_t x=0; x<bubbleCount(); x++) {
+ for(uint64_t y=0; y<orientedReadCount(); y++) {
+ image.setPixel(int(x), int(y), 255, 255, 255);
+ }
+ }
+
+ for(const PhasingTableEntry& entry: indexByBoth()) {
+
+ int r, g, b;
+ if(coloringMethod == ColoringMethod::byDiscreteRelativePhase) {
+ switch(entry.discreteRelativePhase) {
+ case 0:
+ // Ambiguous: black
+ r = 0;
+ g = 0;
+ b = 0;
+ break;
+ case +1:
+ // In-phase: red.
+ r = 255;
+ g = 0;
+ b = 0;
+ break;
+ case -1:
+ // Out-of-phase: blue.
+ r = 0;
+ g = 0;
+ b = 255;
+ break;
+ default:
+ SHASTA_ASSERT(0);
+ }
+
+ } else if(coloringMethod == ColoringMethod::byRelativePhase) {
+
+ // Compute (r, g, b) values that give:
+ // - Red if relativePhase is 1 (in-phase).
+ // - Blue if relativePhase is -1 (out-of-phase).
+ // Intermediate values fade through magenta at relativePhase 0.
+ if(entry.relativePhase >= 0.) {
+ r = 255;
+ g = 0;
+ b = int(std::round((1. - entry.relativePhase) * 255.));
+ } else {
+ r = int(std::round((1. + entry.relativePhase) * 255.));
+ g = 0;
+ b = 255;
+ }
+ } else if(coloringMethod == ColoringMethod::byConsistency) {
+ // Green = consistent, red = inconsistent, yellow = unassigned/ambiguous.
+ const int64_t state = consistencyState(entry);
+ switch(state) {
+ case +1:
+ r = 0;
+ g = 255;
+ b = 0;
+ break;
+ case -1:
+ r = 255;
+ g = 0;
+ b = 0;
+ break;
+ case 0:
+ r = 255;
+ g = 255;
+ b = 0;
+ break;
+ default:
+ SHASTA_ASSERT(0);
+ }
+
+ } else {
+ SHASTA_ASSERT(0);
+ }
+
+ image.setPixel(int(entry.bubbleIndex), int(entry.orientedReadIndex), r, g, b);
+ }
+
+ image.write(fileName);
+}
+
+
+
+// Return the number of PhasingTableEntries with a non-zero
+// discreteRelativePhase (that is, unambiguously in-phase or out-of-phase).
+uint64_t PhasingTable::unambiguousEntryCount() const
+{
+ const auto& indexByBoth = get<0>();
+
+ uint64_t n = 0;
+ for(const PhasingTableEntry& entry: indexByBoth) {
+ if(entry.discreteRelativePhase != 0) {
+ ++n;
+ }
+ }
+ return n;
+}
+
+
+
+// Return the number of PhasingTableEntries with discreteRelativePhase 0
+// (ambiguous). Complement of unambiguousEntryCount.
+uint64_t PhasingTable::ambiguousEntryCount() const
+{
+ const auto& indexByBoth = get<0>();
+
+ uint64_t n = 0;
+ for(const PhasingTableEntry& entry: indexByBoth) {
+ if(entry.discreteRelativePhase == 0) {
+ ++n;
+ }
+ }
+ return n;
+}
+
+
+
+// Compute the consistency state of a PhasingTableEntry relative
+// to the current phases of its oriented read and bubble.
+// It can be +1 (consistent), -1 (inconsistent), or 0 (unassigned or ambiguous).
+int64_t PhasingTable::consistencyState(const PhasingTableEntry& entry) const
+{
+ // An ambiguous entry has no consistency state.
+ if(entry.discreteRelativePhase == 0) {
+ return 0;
+ }
+
+ // Same if the oriented read or the bubble has not been assigned a phase.
+ const int64_t orientedReadPhase = orientedReads[entry.orientedReadIndex].phase;
+ if(orientedReadPhase == 0) {
+ return 0;
+ }
+
+ const int64_t bubblePhase = bubbles[entry.bubbleIndex].phase;
+ if(bubblePhase == 0) {
+ return 0;
+ }
+
+ // The entry is consistent when the product of the two assigned phases
+ // equals its discreteRelativePhase.
+ if(entry.discreteRelativePhase == 1) {
+ if(orientedReadPhase == bubblePhase) {
+ return +1;
+ } else {
+ return -1;
+ }
+ } else {
+ if(orientedReadPhase == bubblePhase) {
+ return -1;
+ } else {
+ return +1;
+ }
+ }
+}
+
+
+
+// Count the number of (consistent,inconsistent) PhasingTableEntries
+// for an oriented read based on the phases currently assigned
+// to bubbles and oriented reads.
+// Ambiguous/unassigned entries (state 0) are counted in neither bucket.
+pair<uint64_t, uint64_t> PhasingTable::countConsistentEntriesForOrientedRead(
+ OrientedReadId orientedReadId) const
+{
+ uint64_t consistentCount = 0;
+ uint64_t inconsistentCount = 0;
+
+ // All entries for this oriented read are contiguous in this ordered index.
+ for(auto it=indexByOrientedReadId().find(orientedReadId);
+ it!=indexByOrientedReadId().end() and it->orientedReadId == orientedReadId; ++it) {
+ const PhasingTableEntry& entry = *it;
+
+ const int64_t s = consistencyState(entry);
+ switch(s) {
+ case +1:
+ ++consistentCount;
+ break;
+ case -1:
+ ++inconsistentCount;
+ break;
+ case 0:
+ break;
+ default:
+ SHASTA_ASSERT(0);
+ }
+ }
+
+ return {consistentCount, inconsistentCount};
+}
+
+
+
+// Count the number of (consistent,inconsistent) PhasingTableEntries
+// for the bubble at a given bubble chain position based on the phases currently assigned
+// to bubbles and oriented reads.
+// Ambiguous/unassigned entries (state 0) are counted in neither bucket.
+pair<uint64_t, uint64_t> PhasingTable::countConsistentEntriesForBubble(uint64_t positionInBubbleChain) const
+{
+ uint64_t consistentCount = 0;
+ uint64_t inconsistentCount = 0;
+
+ // All entries for this bubble are contiguous in this ordered index.
+ for(auto it=indexByPositionInBubbleChain().find(positionInBubbleChain);
+ it!=indexByPositionInBubbleChain().end() and it->positionInBubbleChain == positionInBubbleChain; ++it) {
+ const PhasingTableEntry& entry = *it;
+
+ const int64_t s = consistencyState(entry);
+ switch(s) {
+ case +1:
+ ++consistentCount;
+ break;
+ case -1:
+ ++inconsistentCount;
+ break;
+ case 0:
+ break;
+ default:
+ SHASTA_ASSERT(0);
+ }
+ }
+
+ return {consistentCount, inconsistentCount};
+
+}
+
+
+
+// Count the (unambiguous, ambiguous) PhasingTableEntries for the bubble
+// at the given bubble chain position, classified by discreteRelativePhase
+// (0 means ambiguous). Does not depend on any assigned phases.
+pair<uint64_t, uint64_t> PhasingTable::countEntriesForBubble(uint64_t positionInBubbleChain) const
+{
+ uint64_t unambiguous = 0;
+ uint64_t ambiguous = 0;
+
+ for(auto it=indexByPositionInBubbleChain().find(positionInBubbleChain);
+ it!=indexByPositionInBubbleChain().end() and it->positionInBubbleChain == positionInBubbleChain; ++it) {
+ const PhasingTableEntry& entry = *it;
+
+ if(entry.discreteRelativePhase == 0) {
+ ++ambiguous;
+ } else {
+ ++unambiguous;
+ }
+ }
+
+ return {unambiguous, ambiguous};
+
+}
+
+
+// Count the number of (consistent,inconsistent) PhasingTableEntries
+// based on the phases currently assigned
+// to bubbles and oriented reads.
+// Whole-table version of countConsistentEntriesForOrientedRead/Bubble.
+pair<uint64_t, uint64_t> PhasingTable::countConsistentEntries() const
+{
+ uint64_t consistentCount = 0;
+ uint64_t inconsistentCount = 0;
+
+ for(const PhasingTableEntry& entry: indexByBoth()) {
+
+ const int64_t s = consistencyState(entry);
+ switch(s) {
+ case +1:
+ ++consistentCount;
+ break;
+ case -1:
+ ++inconsistentCount;
+ break;
+ case 0:
+ break;
+ default:
+ SHASTA_ASSERT(0);
+ }
+ }
+
+ return {consistentCount, inconsistentCount};
+
+}
+
+
+
+// Iteratively optimize the phases of the oriented reads and of the bubbles.
+// Experimental. Do not use.
+// Alternates between re-phasing oriented reads (majority rule) and bubbles
+// (flip unless >20% consistent), until the inconsistent count stops decreasing.
+void PhasingTable::simpleIterativePhasing1()
+{
+ // Start with the phases of all oriented reads and bubbles set to +1.
+ for(OrientedRead& orientedRead: orientedReads) {
+ orientedRead.phase = +1;
+ }
+ for(Bubble& bubble: bubbles) {
+ bubble.phase = +1;
+ }
+
+
+ // Iteration loop.
+ uint64_t consistentCount;
+ uint64_t inconsistentCount;
+ tie(consistentCount, inconsistentCount) = countConsistentEntries();
+ const uint64_t unassignedCount = size() - (consistentCount + inconsistentCount);
+ uint64_t oldInconsistentCount = inconsistentCount;
+ cout << "Initial consistency statistics: consistent " << consistentCount <<
+ ", inconsistent " << inconsistentCount <<
+ ", unassigned " << unassignedCount << endl;
+ for(uint64_t iteration=0; ; iteration++) {
+
+ // Set the oriented read phases based on the current bubble phases.
+ for(OrientedRead& orientedRead: orientedReads) {
+
+ // Count the number of consistent/inconsistent PhasingTableEntries
+ // for this oriented read.
+ tie(consistentCount, inconsistentCount) =
+ countConsistentEntriesForOrientedRead(orientedRead.id);
+
+ // Set the phase of this oriented read accordingly (majority rule).
+ if(consistentCount >= inconsistentCount) {
+ // Do nothing.
+ } else {
+ // Flip it.
+ orientedRead.phase = - orientedRead.phase;
+ }
+ }
+
+ // Set the bubble phases based on the current oriented read phases.
+ for(Bubble& bubble: bubbles) {
+
+ // Count the number of consistent/inconsistent PhasingTableEntries
+ // for this bubble.
+ tie(consistentCount, inconsistentCount) =
+ countConsistentEntriesForBubble(bubble.positionInBubbleChain);
+
+ // NOTE(review): if both counts are 0 this is 0/0 (NaN) and the
+ // comparison below is false, so the bubble is flipped - confirm intended.
+ const double consistentFraction = double(consistentCount) / double(consistentCount + inconsistentCount);
+
+ // Set the phase of this bubble accordingly.
+ // NOTE(review): bubbles keep their phase when more than 20% of their
+ // entries are consistent - asymmetric vs. the majority rule used for
+ // oriented reads above; presumably intentional in this experimental code.
+ if(consistentFraction > 0.2) {
+ // Do nothing.
+ } else {
+ // Flip it.
+ bubble.phase = - bubble.phase;
+ }
+ }
+
+ // Stop when the inconsistent count stops improving.
+ tie(consistentCount, inconsistentCount) = countConsistentEntries();
+ const uint64_t unassignedCount = size() - (consistentCount + inconsistentCount);
+ cout << "Consistency statistics after phasing iteration " << iteration <<
+ ": consistent " << consistentCount <<
+ ", inconsistent " << inconsistentCount <<
+ ", unassigned " << unassignedCount << endl;
+ SHASTA_ASSERT(inconsistentCount <= oldInconsistentCount);
+ if(inconsistentCount == oldInconsistentCount) {
+ break;
+ }
+ oldInconsistentCount = inconsistentCount;
+ }
+}
+
+
+
+// Iteratively optimize the phases of the oriented reads and of the bubbles.
+// Experimental. Do not use.
+// For each oriented read in turn, flips whichever of its consistent or
+// inconsistent bubble sets is smaller; runs a fixed number of iterations.
+void PhasingTable::simpleIterativePhasing2()
+{
+ // Start with the phases of all oriented reads and bubbles set to +1.
+ for(OrientedRead& orientedRead: orientedReads) {
+ orientedRead.phase = +1;
+ }
+ for(Bubble& bubble: bubbles) {
+ bubble.phase = +1;
+ }
+
+
+ // Iteration loop.
+ uint64_t consistentCount;
+ uint64_t inconsistentCount;
+ tie(consistentCount, inconsistentCount) = countConsistentEntries();
+ const uint64_t unassignedCount = size() - (consistentCount + inconsistentCount);
+ cout << "Initial consistency statistics: consistent " << consistentCount <<
+ ", inconsistent " << inconsistentCount <<
+ ", unassigned " << unassignedCount << endl;
+ vector<uint64_t> consistentBubbles;
+ vector<uint64_t> inconsistentBubbles;
+ for(uint64_t iteration=0; iteration<6; iteration++) {
+
+ // Loop over oriented reads.
+ for(OrientedRead& orientedRead: orientedReads) {
+
+ // Gather the bubbles that have a consistent/inconsistent
+ // PhasingTableEntry with this oriented read.
+ consistentBubbles.clear();
+ inconsistentBubbles.clear();
+ for(auto it=indexByOrientedReadId().find(orientedRead.id);
+ it!=indexByOrientedReadId().end() and it->orientedReadId == orientedRead.id; ++it) {
+ const PhasingTableEntry& phasingTableEntry = *it;
+ const int64_t s = consistencyState(phasingTableEntry);
+
+ if(s == +1) {
+ consistentBubbles.push_back(phasingTableEntry.bubbleIndex);
+ } else if(s == -1) {
+ inconsistentBubbles.push_back(phasingTableEntry.bubbleIndex);
+ }
+ }
+
+ // If there are more consistentBubbles than inconsistentBubbles, flip the inconsistentBubbles.
+ // If there are more inconsistentBubbles than consistentBubbles, flip the consistentBubbles
+ // and also flip the phase of this oriented read.
+ if(consistentBubbles.size() == inconsistentBubbles.size()) {
+ continue;
+ }
+ const vector<uint64_t>& bubblesToFlip =
+ (consistentBubbles.size() > inconsistentBubbles.size()) ? inconsistentBubbles : consistentBubbles;
+ for(const uint64_t bubbleIndex: bubblesToFlip) {
+ Bubble& bubble = bubbles[bubbleIndex];
+ bubble.phase = -bubble.phase;
+ }
+ if(inconsistentBubbles.size() > consistentBubbles.size()) {
+ orientedRead.phase = - orientedRead.phase;
+ }
+ }
+
+ tie(consistentCount, inconsistentCount) = countConsistentEntries();
+ const uint64_t unassignedCount = size() - (consistentCount + inconsistentCount);
+ cout << "Consistency statistics after phasing iteration " << iteration <<
+ ": consistent " << consistentCount <<
+ ", inconsistent " << inconsistentCount <<
+ ", unassigned " << unassignedCount << endl;
+ }
+}
+
+
+
+// Greedy phasing of oriented reads and bubbles.
+// Each phasing component is seeded with the unphased oriented read that has
+// the most unambiguous bubbles, then grown by repeatedly adding the oriented
+// read with the most already-phased unambiguous bubbles, phasing it by
+// majority vote and propagating a consistent phase to its unphased bubbles.
+void PhasingTable::greedyPhasing()
+{
+ const bool debug = false;
+
+ // Per-oriented-read bookkeeping stored in the OrientedReadTable below.
+ class OrientedReadInfo {
+ public:
+
+ // Index of this oriented read in the orientedReads vector.
+ uint64_t orientedReadIndex;
+
+ // The total number of unambiguous PhasingTableEntries for this oriented read.
+ uint64_t unambiguousBubbleCount = 0;
+
+ // The number of bubbles that have already been phased and that have an
+ // unambiguous PhasingTableEntry with this oriented read.
+ uint64_t phasedUnambiguousBubbleCount = 0;
+
+ OrientedReadInfo(uint64_t orientedReadIndex) :
+ orientedReadIndex(orientedReadIndex) {}
+ };
+
+ // The OrientedReadTable is a container of OrientedReadInfo
+ // used to keep track of unphased oriented reads by various criteria.
+ class OrientedReadTable : public boost::multi_index_container<OrientedReadInfo,
+ boost::multi_index::indexed_by <
+
+ // Index by orientedReadIndex (unique).
+ boost::multi_index::ordered_unique<boost::multi_index::member<
+ OrientedReadInfo,
+ uint64_t,
+ &OrientedReadInfo::orientedReadIndex> >,
+
+ // Index by unambiguousBubbleCount (non-unique, largest first).
+ boost::multi_index::ordered_non_unique<boost::multi_index::member<
+ OrientedReadInfo,
+ uint64_t,
+ &OrientedReadInfo::unambiguousBubbleCount>,
+ std::greater<uint64_t> >,
+
+ // Index by phasedUnambiguousBubbleCount (non-unique, largest first).
+ boost::multi_index::ordered_non_unique<boost::multi_index::member<
+ OrientedReadInfo,
+ uint64_t,
+ &OrientedReadInfo::phasedUnambiguousBubbleCount>,
+ std::greater<uint64_t> >
+ > > {
+ };
+ OrientedReadTable orientedReadTable;
+
+
+
+ // Initialize the OrientedReadTable: one entry per oriented read,
+ // counting its unambiguous PhasingTableEntries.
+ for(uint64_t orientedReadIndex=0; orientedReadIndex<orientedReadCount(); orientedReadIndex++) {
+ const OrientedReadId orientedReadId = orientedReads[orientedReadIndex].id;
+
+ OrientedReadInfo orientedReadInfo(orientedReadIndex);
+ for(auto it=indexByOrientedReadId().find(orientedReadId);
+ it!=indexByOrientedReadId().end() and it->orientedReadId == orientedReadId; ++it) {
+ const PhasingTableEntry& phasingTableEntry = *it;
+ if(phasingTableEntry.discreteRelativePhase != 0) {
+ ++orientedReadInfo.unambiguousBubbleCount;
+ }
+ }
+ orientedReadTable.insert(orientedReadInfo);
+ }
+
+
+ // Initialize the phases and phasing components of all oriented reads and bubbles.
+ // Phase 0 means "not yet phased".
+ for(OrientedRead& orientedRead: orientedReads) {
+ orientedRead.phase = 0;
+ orientedRead.phasingComponent = invalid<uint64_t>;
+ }
+ for(Bubble& bubble: bubbles) {
+ bubble.phase = 0;
+ bubble.phasingComponent = invalid<uint64_t>;
+ }
+
+
+
+ // Outer loop is over phasing components.
+ for(uint64_t phasingComponent=0; ; phasingComponent++) {
+ if(orientedReadTable.empty()) {
+ break;
+ }
+
+ // Find the starting oriented read for this phasing component:
+ // the one with the most unambiguous bubbles (index 1 is sorted largest first).
+ const auto it = orientedReadTable.get<1>().begin();
+ const OrientedReadInfo& orientedReadInfo = *it;
+ OrientedRead& orientedRead = orientedReads[orientedReadInfo.orientedReadIndex];
+
+ const uint64_t minPositionInBubbleChain = orientedRead.minPositionInBubbleChain;
+ const uint64_t maxPositionInBubbleChain = orientedRead.maxPositionInBubbleChain;
+ const uint64_t minBubbleIndex = bubblesMap[minPositionInBubbleChain];
+ const uint64_t maxBubbleIndex = bubblesMap[maxPositionInBubbleChain];
+
+ if(debug) {
+ cout << "Begin phasing component " << phasingComponent << endl;
+ cout << "Phasing group begins at " << orientedRead.id <<
+ ", index " << orientedReadInfo.orientedReadIndex <<
+ " with " << orientedReadInfo.unambiguousBubbleCount << " unambiguous bubbles." << endl;
+ cout << "Bubble index range for this oriented read is [" <<
+ minBubbleIndex << "," << maxBubbleIndex << "]." << endl;
+ }
+
+ // If the best remaining read has no unambiguous bubbles, we are done.
+ if(orientedReadInfo.unambiguousBubbleCount == 0) {
+ break;
+ }
+
+ // Assign phase +1 in this phasing group to this starting read for this phasing component.
+ SHASTA_ASSERT(orientedRead.phase == 0);
+ SHASTA_ASSERT(orientedRead.phasingComponent == invalid<uint64_t>);
+ orientedRead.phase = +1;
+ orientedRead.phasingComponent = phasingComponent;
+
+ // Assign to all unambiguous bubbles of this oriented read a phase consistent with it.
+ for(auto it=indexByOrientedReadId().find(orientedRead.id);
+ it!=indexByOrientedReadId().end() and it->orientedReadId == orientedRead.id; ++it) {
+ const PhasingTableEntry& phasingTableEntry = *it;
+ Bubble& bubble = bubbles[phasingTableEntry.bubbleIndex];
+ SHASTA_ASSERT(bubble.phase == 0);
+ SHASTA_ASSERT(bubble.phasingComponent == invalid<uint64_t>);
+
+ // Skip it if it is ambiguous.
+ if(phasingTableEntry.discreteRelativePhase == 0) {
+ continue;
+ }
+
+ // Set the phase of this bubble to a phase consistent with the +1 phase
+ // we assigned to the starting oriented read.
+ bubble.phase = phasingTableEntry.discreteRelativePhase;
+ bubble.phasingComponent = phasingComponent;
+
+ // Update the OrientedReadTable to reflect the fact that this bubble was just phased.
+ // (The loop variables below intentionally shadow the outer it/phasingTableEntry.)
+ for(auto it=indexByPositionInBubbleChain().find(bubble.positionInBubbleChain);
+ it!=indexByPositionInBubbleChain().end() and it->positionInBubbleChain == bubble.positionInBubbleChain; ++it) {
+ const PhasingTableEntry& phasingTableEntry = *it;
+ if(phasingTableEntry.discreteRelativePhase == 0) {
+ continue;
+ }
+
+ // multi_index elements are const; use replace() to update the count.
+ auto jt = orientedReadTable.get<0>().find(phasingTableEntry.orientedReadIndex);
+ SHASTA_ASSERT(jt != orientedReadTable.get<0>().end());
+ OrientedReadInfo info = *jt;
+ info.phasedUnambiguousBubbleCount++;
+ orientedReadTable.get<0>().replace(jt, info);
+ }
+ }
+
+ // Remove the starting oriented read from the orientedReadTable.
+ orientedReadTable.get<1>().erase(it);
+
+
+
+ // The inner loop phases one oriented read at a time, adding it to the current
+ // phasing component.
+ while(not orientedReadTable.empty()) {
+
+ // Find the oriented read with the most phased bubbles.
+ const auto it = orientedReadTable.get<2>().begin();
+ const OrientedReadInfo& orientedReadInfo = *it;
+ OrientedRead& orientedRead = orientedReads[orientedReadInfo.orientedReadIndex];
+
+ const uint64_t minPositionInBubbleChain = orientedRead.minPositionInBubbleChain;
+ const uint64_t maxPositionInBubbleChain = orientedRead.maxPositionInBubbleChain;
+ const uint64_t minBubbleIndex = bubblesMap[minPositionInBubbleChain];
+ const uint64_t maxBubbleIndex = bubblesMap[maxPositionInBubbleChain];
+
+ if(orientedReadInfo.phasedUnambiguousBubbleCount == 0) {
+ // Finish this phasing component.
+ break;
+ }
+
+ if(debug) {
+ cout << "Adding to phasing group " << orientedRead.id <<
+ ", index " << orientedReadInfo.orientedReadIndex <<
+ " with " << orientedReadInfo.unambiguousBubbleCount << " unambiguous bubbles," <<
+ " of which " << orientedReadInfo.phasedUnambiguousBubbleCount << " already phased ." << endl;
+ cout << "Bubble index range for this oriented read is [" <<
+ minBubbleIndex << "," << maxBubbleIndex << "]." << endl;
+ }
+
+ // Use the bubbles that are already phased to assign a phase to this oriented read.
+ uint64_t plusCount = 0;
+ uint64_t minusCount = 0;
+ for(auto it=indexByOrientedReadId().find(orientedRead.id);
+ it!=indexByOrientedReadId().end() and it->orientedReadId == orientedRead.id; ++it) {
+ const PhasingTableEntry& phasingTableEntry = *it;
+ if(phasingTableEntry.discreteRelativePhase == 0) {
+ continue;
+ }
+ Bubble& bubble = bubbles[phasingTableEntry.bubbleIndex];
+ if(bubble.phase == 0) {
+ continue;
+ }
+ int64_t phase;
+ if(phasingTableEntry.discreteRelativePhase == +1) {
+ phase = bubble.phase;
+ } else {
+ phase = - bubble.phase;
+ }
+ if(phase == +1) {
+ ++plusCount;
+ } else if(phase == -1) {
+ ++minusCount;
+ }
+ }
+
+ SHASTA_ASSERT(plusCount + minusCount == orientedReadInfo.phasedUnambiguousBubbleCount);
+
+ // Phase this oriented read in this phasing component (majority vote).
+ SHASTA_ASSERT(orientedRead.phase == 0);
+ SHASTA_ASSERT(orientedRead.phasingComponent == invalid<uint64_t>);
+ orientedRead.phase = (plusCount >= minusCount) ? +1 : -1;
+ orientedRead.phasingComponent = phasingComponent;
+
+ // Assign to all unambiguous bubbles of this oriented read
+ // that are not already phased a phase consistent with it.
+ for(auto it=indexByOrientedReadId().find(orientedRead.id);
+ it!=indexByOrientedReadId().end() and it->orientedReadId == orientedRead.id; ++it) {
+ const PhasingTableEntry& phasingTableEntry = *it;
+
+ // Skip it if it is ambiguous.
+ if(phasingTableEntry.discreteRelativePhase == 0) {
+ continue;
+ }
+ Bubble& bubble = bubbles[phasingTableEntry.bubbleIndex];
+
+ // If already phased, skip it.
+ if(bubble.phase != 0) {
+ continue;
+ }
+
+ // Phase this bubble to a phase consistent with this oriented read.
+ bubble.phase = (phasingTableEntry.discreteRelativePhase == +1) ? orientedRead.phase : -orientedRead.phase;
+ bubble.phasingComponent = phasingComponent;
+
+ // Update the OrientedReadTable to reflect the fact that this bubble was just phased.
+ for(auto it=indexByPositionInBubbleChain().find(bubble.positionInBubbleChain);
+ it!=indexByPositionInBubbleChain().end() and it->positionInBubbleChain == bubble.positionInBubbleChain; ++it) {
+ const PhasingTableEntry& phasingTableEntry = *it;
+ if(phasingTableEntry.discreteRelativePhase == 0) {
+ continue;
+ }
+
+ // Unlike the seeding loop above, the oriented read may already have
+ // been phased and removed from the table; skip it in that case.
+ auto jt = orientedReadTable.get<0>().find(phasingTableEntry.orientedReadIndex);
+ if(jt == orientedReadTable.get<0>().end()) {
+ continue;
+ }
+ OrientedReadInfo info = *jt;
+ info.phasedUnambiguousBubbleCount++;
+ orientedReadTable.get<0>().replace(jt, info);
+ }
+ }
+
+ // Remove the oriented read from the orientedReadTable.
+ orientedReadTable.get<2>().erase(it);
+ }
+ }
+}
+
+
+
+// Return the phasing error rate of the diploid bubble at the given
+// bubble chain position: inconsistent / (consistent + inconsistent).
+// An unphased bubble is reported as error rate 1.
+double PhasingTable::bubbleErrorRate(uint64_t positionInBubbleChain) const
+{
+ // Must be called for a diploid bubble
+ auto it = bubblesMap.find(positionInBubbleChain);
+ SHASTA_ASSERT(it != bubblesMap.end());
+ const Bubble& bubble = bubbles[it->second];
+
+ if(bubble.phase == 0) {
+ return 1.;
+ }
+
+ // This bubble is diploid and phased.
+ // NOTE(review): if both counts are 0 this returns NaN (0/0) - confirm
+ // that callers tolerate it.
+ uint64_t consistent;
+ uint64_t inconsistent;
+ tie(consistent, inconsistent) = countConsistentEntriesForBubble(positionInBubbleChain);
+ return double(inconsistent) / double(consistent+ inconsistent);
+}
+
+
+
+// Use the phases stored in the Bubbles to construct the PhasedComponents.
+// The PhasedComponents must be non-overlapping and sorted by position.
+// Overlaps are resolved by giving priority to larger components and removing
+// offending bubbles from the smaller ones; empty components are then dropped.
+void PhasingTable::constructPhasedComponents(bool debug)
+{
+ phasedComponents.clear();
+
+ // Create an initial version of PhasedComponents without
+ // worrying about ordering by position and about overlap between PhasedComponents.
+ for(const Bubble& bubble: bubbles) {
+ if(bubble.phase == 0) {
+ continue;
+ }
+ const uint64_t phasedComponentId = bubble.phasingComponent;
+ // Grow the vector as needed so phasedComponentId is a valid index.
+ if(phasedComponentId >= phasedComponents.size()) {
+ for(uint64_t i=phasedComponents.size(); i<=phasedComponentId; i++) {
+ phasedComponents.push_back(make_shared<PhasedComponent>());
+ }
+ }
+ phasedComponents[phasedComponentId]->push_back({bubble.positionInBubbleChain, bubble.phase});
+ }
+
+ if(debug) {
+ uint64_t totalPhasedBubbleCount = 0;
+ for(const auto& phasedComponent: phasedComponents) {
+ totalPhasedBubbleCount += phasedComponent->size();
+ }
+ cout << "Created " << phasedComponents.size() << " initial phased components "
+ "with a total " << totalPhasedBubbleCount << " phased diploid bubbles." << endl;
+ }
+
+
+
+ // If there is more than one PhasedComponent, we have to eliminate overlaps.
+ // We do this by removing bubbles from overlapping PhasedComponents, giving
+ // priority to larger PhasedComponents.
+ if(phasedComponents.size() > 1) {
+
+ if(debug) {
+ cout << "More than one phased components found. Removing overlaps." << endl;
+ }
+
+ // Sort the phased components by decreasing size.
+ class SortHelper {
+ public:
+ bool operator()(
+ const shared_ptr<PhasedComponent>& p0,
+ const shared_ptr<PhasedComponent>& p1
+ ) const
+ {
+ return p0->size() > p1->size();
+ }
+ };
+ sort(phasedComponents.begin(), phasedComponents.end(), SortHelper());
+
+ for(const auto& phasedComponent: phasedComponents) {
+ phasedComponent->computePositionRange();
+ }
+
+ // Process the PhasedComponents in order of decreasing size.
+ // Each accepted component reserves its position range; later (smaller)
+ // components must not intrude into any reserved range.
+ vector< pair<uint64_t, uint64_t> > forbiddenRanges; // (min, max)
+ for(auto& phasedComponent: phasedComponents) {
+
+ // See if it overlaps any of the forbidden ranges.
+ bool overlaps = false;
+ for(const auto& forbiddenRange: forbiddenRanges) {
+ const bool disjointLeft = phasedComponent->maxPositionInBubbleChain < forbiddenRange.first;
+ const bool disjointRight = phasedComponent->minPositionInBubbleChain > forbiddenRange.second;
+ if(not(disjointLeft or disjointRight)) {
+ overlaps = true;
+ break;
+ }
+ }
+
+ if(debug) {
+ cout << "Phased component at " << phasedComponent->minPositionInBubbleChain << " " <<
+ phasedComponent->maxPositionInBubbleChain;
+ if(overlaps) {
+ cout << " overlaps a previous phased component." << endl;
+ } else {
+ cout << " has no overlaps with previous phased components." << endl;
+ }
+ }
+
+ // No overlap: accept the component as is and reserve its range.
+ if(not overlaps) {
+ forbiddenRanges.push_back(
+ {phasedComponent->minPositionInBubbleChain, phasedComponent->maxPositionInBubbleChain});
+ continue;
+ }
+
+
+
+ // This PhasedComponent overlaps a forbiddenRange.
+ // We need to remove the offending bubbles.
+ shared_ptr<PhasedComponent> newPhasedComponent = make_shared<PhasedComponent>();
+ for(const auto& p: *phasedComponent) {
+ const uint64_t positionInBubbleChain = p.first;
+
+ // See if this bubble overlaps any forbidden ranges.
+ bool overlaps = false;
+ for(const auto& forbiddenRange: forbiddenRanges) {
+ if( positionInBubbleChain >= forbiddenRange.first and
+ positionInBubbleChain <= forbiddenRange.second) {
+ overlaps = true;
+ break;
+ }
+ }
+
+ // Only keep it if there is no overlap.
+ if(not overlaps) {
+ newPhasedComponent->push_back(p);
+ }
+
+ }
+
+ // Replace this phased component with the new one.
+ phasedComponent = newPhasedComponent;
+ phasedComponent->computePositionRange();
+ forbiddenRanges.push_back({phasedComponent->minPositionInBubbleChain, phasedComponent->maxPositionInBubbleChain});
+
+ if(debug) {
+ cout << "After removing overlap, this phased component has " << phasedComponent->size() <<
+ " diploid bubbles and position range " << phasedComponent->minPositionInBubbleChain << " " <<
+ phasedComponent->maxPositionInBubbleChain << endl;
+ }
+ }
+ }
+
+
+
+ // This could have created empty PhasedComponents.
+ // Remove them if they are present.
+ {
+ vector< shared_ptr<PhasedComponent> > nonEmptyPhasedComponents;
+ for(const shared_ptr<PhasedComponent>& phasedComponent: phasedComponents) {
+ if(not phasedComponent->empty()) {
+ nonEmptyPhasedComponents.push_back(phasedComponent);
+ } else {
+ if(debug) {
+ cout << "Removing empty phased component." << endl;
+ }
+ }
+ }
+ if(nonEmptyPhasedComponents.size() != phasedComponents.size()) {
+ phasedComponents.swap(nonEmptyPhasedComponents);
+ }
+ }
+
+
+
+ // Compute the position ranges.
+ for(const auto& phasedComponent: phasedComponents) {
+ phasedComponent->computePositionRange();
+ }
+
+ // Sort the phased components in order of increasing position.
+ class SortHelper {
+ public:
+ bool operator()(
+ const shared_ptr<PhasedComponent>& p0,
+ const shared_ptr<PhasedComponent>& p1
+ ) const
+ {
+ return p0->minPositionInBubbleChain < p1->minPositionInBubbleChain;
+ }
+ };
+ sort(phasedComponents.begin(), phasedComponents.end(), SortHelper());
+
+ if(debug) {
+ cout << phasedComponents.size() << " phased components:" << endl;
+ for(const auto& phasedComponent: phasedComponents) {
+ cout << phasedComponent->size() << " diploid bubbles at positions " <<
+ phasedComponent->minPositionInBubbleChain << "..." <<
+ phasedComponent->maxPositionInBubbleChain << " in bubble chain." << endl;
+
+ }
+ // phasingGraph.writeGraphviz("PhasingGraph.dot");
+ }
+}
diff --git a/src/mode3-PhasingTable.hpp b/src/mode3-PhasingTable.hpp
new file mode 100644
index 0000000..8f7d61a
--- /dev/null
+++ b/src/mode3-PhasingTable.hpp
@@ -0,0 +1,250 @@
+#pragma once
+
+// Shasta.
+#include "invalid.hpp"
+#include "ReadId.hpp"
+
+// Boost libraries.
+#include <boost/multi_index_container.hpp>
+#include <boost/multi_index/ordered_index.hpp>
+#include <boost/multi_index/member.hpp>
+#include <boost/multi_index/composite_key.hpp>
+
+// Standard libraries.
+#include "array.hpp"
+#include <map>
+#include <cmath>
+#include "utility.hpp"
+#include "vector.hpp"
+
+namespace shasta {
+ namespace mode3 {
+ class PhasingComponent;
+ class PhasingTable;
+ class PhasingTableEntry;
+
+ class BubbleChain;
+ }
+ class MarkerGraph;
+}
+
+
+
+// A PhasingTableEntry describes the appearances of one oriented read
+// on one or both sides of a diploid Bubble of a BubbleChain.
+// The frequency array contains the number of times the oriented read
+// appears on non-terminal marker graph edges of the two Chains of the diploid Bubble.
+class shasta::mode3::PhasingTableEntry {
+public:
+
+ PhasingTableEntry(
+ OrientedReadId orientedReadId,
+ uint64_t positionInBubbleChain) :
+ orientedReadId(orientedReadId),
+ positionInBubbleChain(positionInBubbleChain)
+ {}
+
+ // The OrientedReadId this PhasingTableEntry refers to,
+ // and its index in the PhasingTable::orientedReads vector.
+ OrientedReadId orientedReadId;
+ uint64_t orientedReadIndex = invalid<uint64_t>;
+
+ // The position in the bubble chain of the diploid bubble
+ // this PhasingTableEntry refers to,
+ // and its index in the PhasingTable::bubbles vector.
+ uint64_t positionInBubbleChain;
+ uint64_t bubbleIndex = invalid<uint64_t>;
+
+ // The number of times this oriented read
+ // appears on non-terminal marker graph edges of the two Chains of the diploid Bubble.
+ // The two entries in the array correspond to the two chains of the diploid Bubble.
+ array<uint64_t, 2> frequency = {0, 0};
+
+ // The phase of this oriented read relative to this bubble
+ // is computed from the frequency array.
+
+ // The relative phase varies continuously between -1 and 1 and is:
+ // * +1 if this oriented read always appears in Chain 0 (that is, frequency[1] is 0).
+ // * -1 if this oriented read always appears in Chain 1 (that is, frequency[0] is 0).
+ // * 0 if this oriented read appears with equal frequency on Chain 0 and Chain 1
+ // (that is, frequency[0] = frequency[1]).
+ double relativePhase = invalid<double>;
+
+ // The discrete relative phase can be:
+ // +1 if relativePhase > +1. - phaseErrorThreshold.
+ // -1 if relativePhase < -1. + phaseErrorThreshold.
+ // 0 otherwise.
+ int64_t discreteRelativePhase = invalid<int64_t>;
+
+ // Compute and store the relativePhase and discreteRelativePhase.
+ void storeRelativePhase(double phaseErrorThreshold)
+ {
+ relativePhase = 2. * double(frequency[0]) / double(frequency[0] + frequency[1]) - 1.;
+ if(relativePhase > 1. - phaseErrorThreshold) {
+ discreteRelativePhase = +1;
+ } else if(relativePhase < -1. + phaseErrorThreshold) {
+ discreteRelativePhase = -1;
+ } else {
+ discreteRelativePhase = 0;
+ }
+ }
+
+ void writeCsv(ostream&) const;
+};
+
+
+
+// A PhasingTable is a set of PhasingTableEntry objects,
+// randomly accessible by orientedReadId and by positionInBubbleChain.
+class shasta::mode3::PhasingTable: public boost::multi_index_container<PhasingTableEntry,
+ boost::multi_index::indexed_by <
+
+ // Index by (orientedReadId, positionInBubbleChain) (unique).
+ boost::multi_index::ordered_unique<
+ boost::multi_index::composite_key<
+ PhasingTableEntry,
+ boost::multi_index::member<PhasingTableEntry, OrientedReadId ,&PhasingTableEntry::orientedReadId>,
+ boost::multi_index::member<PhasingTableEntry, uint64_t, &PhasingTableEntry::positionInBubbleChain>
+ > >,
+
+ // Index by orientedReadId (non-unique).
+ boost::multi_index::ordered_non_unique<boost::multi_index::member<
+ PhasingTableEntry,
+ OrientedReadId,
+ &PhasingTableEntry::orientedReadId> >,
+
+ // Index by positionInBubbleChain (non-unique).
+ boost::multi_index::ordered_non_unique<boost::multi_index::member<
+ PhasingTableEntry,
+ uint64_t,
+ &PhasingTableEntry::positionInBubbleChain> >
+ > > {
+public:
+
+ PhasingTable(
+ const BubbleChain&,
+ const MarkerGraph&,
+ double phaseErrorThreshold);
+
+ uint64_t entryCount() const
+ {
+ return size();
+ }
+ uint64_t unambiguousEntryCount() const;
+ uint64_t ambiguousEntryCount() const;
+
+ uint64_t bubbleCount() const
+ {
+ return bubbles.size();
+ }
+
+ uint64_t orientedReadCount() const
+ {
+ return orientedReads.size();
+ }
+
+ // Experimental. Do not use.
+ void simpleIterativePhasing1();
+ void simpleIterativePhasing2();
+
+ // Optimize the phases of the oriented reads and of the bubbles.
+ void greedyPhasing();
+
+ void writeCsv(const string& fileNamePrefix) const;
+ enum class ColoringMethod {
+ byRelativePhase,
+ byDiscreteRelativePhase,
+ byConsistency
+ };
+ void writePng(const string& fileName, ColoringMethod) const;
+
+ double bubbleErrorRate(uint64_t positionInBubbleChain) const;
+
+ vector< shared_ptr<PhasedComponent> > phasedComponents;
+ void constructPhasedComponents(bool debug);
+
+private:
+ const auto& indexByBoth() const {return get<0>();}
+ const auto& indexByOrientedReadId() const {return get<1>();}
+ const auto& indexByPositionInBubbleChain() const {return get<2>();}
+
+ void fill(
+ const BubbleChain&,
+ const MarkerGraph&,
+ double phaseErrorThreshold);
+
+
+
+ // Information about the orientedReads that appear in the PhasingTable.
+ class OrientedRead {
+ public:
+ OrientedReadId id;
+ uint64_t minPositionInBubbleChain;
+ uint64_t maxPositionInBubbleChain;
+ int64_t phase = 0; // -1, 0 or +1
+ uint64_t phasingComponent = invalid<uint64_t>;
+ };
+ void gatherOrientedReads();
+ vector<OrientedRead> orientedReads;
+
+ // Map OrientedReadId to an index in the orientedReads vector.
+ std::map<OrientedReadId, uint64_t> orientedReadsMap;
+
+
+
+ // Information about the diploid bubbles in this PhasingTable.
+ class Bubble {
+ public:
+ uint64_t positionInBubbleChain;
+ int64_t phase = 0; // -1, 0 or +1
+ uint64_t phasingComponent = invalid<uint64_t>;
+ };
+ vector<Bubble> bubbles;
+ void gatherBubbles();
+
+ // Map a positionInBubbleChain to an index in the bubbles vector.
+public:
+ std::map<uint64_t, uint64_t> bubblesMap;
+private:
+
+
+
+ // Fill the orientedReadIndex and bubbleIndex in all PhasingTableEntries.
+ // This can only be done after gatherOrientedReads and gatherBubbles
+ // have been called.
+ void fillIndexes();
+
+ // Compute the consistency state of a PhasingTableEntry relative
+ // to the current phases of its oriented read and bubble.
+ // It can be +1 (consistent), -1 (inconsistent), or 0 (unassigned or ambiguous).
+ // See the implementation for details.
+ int64_t consistencyState(const PhasingTableEntry&) const;
+
+ // Count the number of (consistent,inconsistent) PhasingTableEntries
+ // for an oriented read based on the phases currently assigned
+ // to bubbles and oriented reads.
+ pair<uint64_t, uint64_t> countConsistentEntriesForOrientedRead(OrientedReadId) const;
+
+ // Count the number of (consistent,inconsistent) PhasingTableEntries
+ // for the bubble at a given bubble chain position based on the phases currently assigned
+ // to bubbles and oriented reads.
+ pair<uint64_t, uint64_t> countConsistentEntriesForBubble(uint64_t positionInBubbleChain) const;
+
+ // Count the number of (unambiguous, ambiguous) PhasingTableEntries
+ // for the bubble at a given bubble chain position based on the phases currently assigned
+ // to bubbles and oriented reads.
+ pair<uint64_t, uint64_t> countEntriesForBubble(uint64_t positionInBubbleChain) const;
+
+public:
+ // Count the number of (consistent,inconsistent) PhasingTableEntries
+ // based on the phases currently assigned
+ // to bubbles and oriented reads.
+ pair<uint64_t, uint64_t> countConsistentEntries() const;
+
+private:
+ void writeOrientedReadsCsv(const string& fileNamePrefix) const;
+ void writeBubblesCsv(const string& fileNamePrefix, bool writePhasingInformation) const;
+ void writeDetailsCsv(const string& fileNamePrefix) const;
+};
+
+
diff --git a/src/mode3-PrimaryGraph.cpp b/src/mode3-PrimaryGraph.cpp
new file mode 100644
index 0000000..2988636
--- /dev/null
+++ b/src/mode3-PrimaryGraph.cpp
@@ -0,0 +1,548 @@
+// Shasta.
+#include "mode3-PrimaryGraph.hpp"
+#include "Assembler.hpp"
+#include "deduplicate.hpp"
+#include "longestPath.hpp"
+#include "MarkerGraph.hpp"
+#include "MurmurHash2.hpp"
+#include "orderPairs.hpp"
+#include "performanceLog.hpp"
+#include "timestamp.hpp"
+using namespace shasta;
+using namespace mode3;
+
+// Boost libraries.
+#include <boost/graph/iteration_macros.hpp>
+#include <boost/multi_index_container.hpp>
+#include <boost/multi_index/ordered_index.hpp>
+#include <boost/multi_index/member.hpp>
+#include <boost/pending/disjoint_sets.hpp>
+
+// Standard library.
+#include "fstream.hpp"
+#include <queue>
+
+
+
+PrimaryGraph::vertex_descriptor PrimaryGraph::addVertex(MarkerGraphEdgeId edgeId)
+{
+ SHASTA_ASSERT(not vertexMap.contains(edgeId));
+ const vertex_descriptor v = add_vertex({edgeId}, *this);
+ vertexMap.insert({edgeId, v});
+ return v;
+}
+
+
+
+void PrimaryGraph::addEdgeFromVertexDescriptors(
+ vertex_descriptor v0,
+ vertex_descriptor v1,
+ const MarkerGraphEdgePairInfo& info,
+ uint64_t coverage)
+{
+ add_edge(v0, v1, {info, coverage}, *this);
+}
+
+
+
+void PrimaryGraph::addEdge(
+ MarkerGraphEdgeId edgeId0,
+ MarkerGraphEdgeId edgeId1,
+ const MarkerGraphEdgePairInfo& info,
+ uint64_t coverage)
+{
+ auto it0 = vertexMap.find(edgeId0);
+ auto it1 = vertexMap.find(edgeId1);
+ SHASTA_ASSERT(it0 != vertexMap.end());
+ SHASTA_ASSERT(it1 != vertexMap.end());
+ const vertex_descriptor v0 = it0->second;
+ const vertex_descriptor v1 = it1->second;
+
+ addEdgeFromVertexDescriptors(v0, v1, info, coverage);
+}
+
+
+
+// Write a PrimaryGraph in graphviz format.
+void PrimaryGraph::writeGraphviz(
+ const string& name,
+ const PrimaryGraphDisplayOptions& options,
+ const MarkerGraph& markerGraph) const
+{
+ ofstream out(name + ".dot");
+
+ const PrimaryGraph& graph = *this;
+ out << "digraph " << name << " {\n";
+
+ BGL_FORALL_VERTICES(v, graph, PrimaryGraph) {
+ const PrimaryGraphVertex& vertex = graph[v];
+ out << vertex.edgeId;
+
+ if(options.labels or options.tooltips or options.colorVertices) {
+ out << "[";
+ }
+
+ if(options.labels) {
+ out << "label=\"";
+ out << vertex.edgeId << "\\n" << markerGraph.edgeCoverage(vertex.edgeId);
+ out << "\" ";
+ }
+
+ if(options.tooltips) {
+ out << "tooltip=\"";
+ out << vertex.edgeId;
+ out << "\" ";
+ }
+
+ if(options.labels or options.tooltips or options.colorVertices) {
+ out << "]";
+ }
+ out << ";\n";
+ }
+
+
+
+ BGL_FORALL_EDGES(e, graph, PrimaryGraph) {
+ const PrimaryGraphEdge& edge = graph[e];
+ if(not options.showNonTransitiveReductionEdges and edge.isNonTransitiveReductionEdge) {
+ continue;
+ }
+ const vertex_descriptor v0 = source(e, graph);
+ const vertex_descriptor v1 = target(e, graph);
+
+ out <<
+ graph[v0].edgeId << "->" <<
+ graph[v1].edgeId;
+
+ if(edge.isNonTransitiveReductionEdge or options.labels or options.tooltips or options.colorEdges) {
+ out << " [";
+ }
+
+ if(edge.isNonTransitiveReductionEdge) {
+ out << "style=dashed ";
+ }
+
+ if(options.tooltips) {
+ out <<
+ "tooltip=\"" <<
+ graph[v0].edgeId << "->" <<
+ graph[v1].edgeId << " ";
+ if(edge.coverage != invalid<uint64_t>) {
+ out << edge.coverage << "/";
+ }
+ out <<
+ edge.info.common << " " <<
+ std::fixed << std::setprecision(2) << edge.info.correctedJaccard() << " " <<
+ edge.info.offsetInBases << "\" ";
+ }
+
+ if(options.labels) {
+ out <<
+ "label=\"";
+ if(edge.coverage != invalid<uint64_t>) {
+ out << edge.coverage << "/";
+ }
+ out <<
+ edge.info.common << "\\n" <<
+ std::fixed << std::setprecision(2) << edge.info.correctedJaccard() << "\\n" <<
+ edge.info.offsetInBases << "\" ";
+
+ }
+
+ // Color.
+ if(options.colorEdges) {
+ const double correctedJaccard = edge.info.correctedJaccard();
+ if(correctedJaccard <= options.redJ) {
+ out << " color=red ";
+ } else if(correctedJaccard >= options.greenJ) {
+ out << " color=green ";
+ } else {
+ const double hue = (correctedJaccard - options.redJ) / (3. * (options.greenJ - options.redJ));
+ out << " color=\"" << hue << ",1,1\" ";
+ }
+ }
+
+ if(edge.isNonTransitiveReductionEdge or options.labels or options.tooltips or options.colorEdges) {
+ out << "]";
+ }
+ out << ";\n";
+ }
+
+ out << "}\n";
+}
+
+
+
+void PrimaryGraph::writeEdgeCoverageHistogram(const string& fileName) const
+{
+ const PrimaryGraph& primaryGraph = *this;
+
+ // Create a histogram indexed by histogram[coverage][commonCount].
+ vector< vector<uint64_t> > histogram;
+
+ // Loop over all edges.
+ BGL_FORALL_EDGES(e, primaryGraph, PrimaryGraph) {
+ const PrimaryGraphEdge& edge = primaryGraph[e];
+ const uint64_t coverage = edge.coverage;
+ const uint64_t commonCount = edge.info.common;
+ SHASTA_ASSERT(coverage <= commonCount);
+
+ // Increment the histogram, making space as necessary.
+ if(coverage >= histogram.size()) {
+ histogram.resize(coverage + 1);
+ }
+ vector<uint64_t>& h = histogram[coverage];
+ if(commonCount >= h.size()) {
+ h.resize(commonCount + 1, 0);
+ }
+ ++h[commonCount];
+ }
+
+ // Write out the histogram.
+ ofstream csv(fileName);
+ csv << "Coverage,Common count,Loss,Frequency\n";
+ for(uint64_t coverage=0; coverage<histogram.size(); coverage++) {
+ const vector<uint64_t>& h = histogram[coverage];
+ for(uint64_t commonCount=0; commonCount<h.size(); commonCount++) {
+ const uint64_t frequency = h[commonCount];
+
+ if(frequency > 0) {
+ const uint64_t loss = commonCount - coverage;
+ csv << coverage << ",";
+ csv << commonCount << ",";
+ csv << loss << ",";
+ csv << frequency << "\n";
+ }
+ }
+ }
+}
+
+
+
+// Create the connected components of this PrimaryGraph,
+// without changing the PrimaryGraph itself.
+vector< shared_ptr<PrimaryGraph> > PrimaryGraph::createConnectedComponents(
+ uint64_t minComponentSize) const
+{
+ const PrimaryGraph& graph = *this;
+
+ // Compute connected components.
+ // We can't use boost::connected_components because it only works
+ // for undirected graphs.
+ const uint64_t n = num_vertices(graph);
+ vector<uint64_t> rank(n);
+ vector<uint64_t> parent(n);
+ boost::disjoint_sets<uint64_t*, uint64_t*> disjointSets(&rank[0], &parent[0]);
+ for(uint64_t vertexId=0; vertexId<n; vertexId++) {
+ disjointSets.make_set(vertexId);
+ }
+ BGL_FORALL_EDGES(e, graph, PrimaryGraph) {
+ const PrimaryGraph::vertex_descriptor v0 = source(e, graph);
+ const PrimaryGraph::vertex_descriptor v1 = target(e, graph);
+ disjointSets.union_set(v0, v1);
+ }
+
+
+ // Gather the vertices in each connected component.
+ vector< shared_ptr<PrimaryGraph> > allComponentPointers(num_vertices(graph));
+ BGL_FORALL_VERTICES(v, graph, PrimaryGraph) {
+ const PrimaryGraphVertex& vertex = graph[v];
+ const uint64_t componentId = disjointSets.find_set(v);
+ shared_ptr<PrimaryGraph>& componentPointer = allComponentPointers[componentId];
+ if(not componentPointer) {
+ componentPointer = make_shared<PrimaryGraph>();
+ }
+ PrimaryGraph& component = *componentPointer;
+ component.addVertex(vertex.edgeId);
+ }
+
+
+ // Gather the edges in each connected component.
+ BGL_FORALL_EDGES(e, graph, PrimaryGraph) {
+ const PrimaryGraph::vertex_descriptor v0 = source(e, graph);
+ const PrimaryGraph::vertex_descriptor v1 = target(e, graph);
+ const uint64_t edgeId0 = graph[v0].edgeId;
+ const uint64_t edgeId1 = graph[v1].edgeId;
+ const uint64_t componentId = disjointSets.find_set(v0);
+ SHASTA_ASSERT(componentId == disjointSets.find_set(v1));
+ shared_ptr<PrimaryGraph>& componentPointer = allComponentPointers[componentId];
+ SHASTA_ASSERT(componentPointer);
+ PrimaryGraph& component = *componentPointer;
+ component.addEdge(
+ edgeId0,
+ edgeId1,
+ graph[e].info,
+ graph[e].coverage);
+ }
+
+
+
+ // Keep only the components with at least minComponentSize vertices
+ // and sort them by size.
+ vector< pair<shared_ptr<PrimaryGraph>, uint64_t> > componentPointersWithSizes;
+ for(const shared_ptr<PrimaryGraph>& p: allComponentPointers) {
+ if(p) {
+ const uint64_t componentSize = num_vertices(*p);
+ if(componentSize >= minComponentSize) {
+ componentPointersWithSizes.push_back({p, componentSize});
+ }
+ }
+ }
+ sort(componentPointersWithSizes.begin(), componentPointersWithSizes.end(),
+ OrderPairsBySecondOnlyGreater<shared_ptr<PrimaryGraph>, uint64_t>());
+
+
+ // Return the components that were kept (at least minComponentSize vertices),
+ // sorted by decreasing number of vertices.
+ vector< shared_ptr<PrimaryGraph> > componentPointers;
+ for(const auto& p: componentPointersWithSizes) {
+ componentPointers.push_back(p.first);
+ }
+ return componentPointers;
+}
+
+
+
+// Remove cross-edges.
+// This removes an edge v0->v1 if the following are all true:
+// - It is not marked as removed by transitive reduction.
+// - Its coverage is at most lowCoverageThreshold.
+// - Its estimated offset is at least minOffset.
+// - v0 has at least one out-edge with coverage at least highCoverageThreshold
+// (ignoring edges marked as removed by transitive reduction).
+// - v1 has at least one in-edge with coverage at least highCoverageThreshold.
+// (ignoring edges marked as removed by transitive reduction).
+void PrimaryGraph::removeCrossEdges(
+ uint64_t lowCoverageThreshold,
+ uint64_t highCoverageThreshold,
+ uint64_t minOffset)
+{
+ PrimaryGraph& graph = *this;
+
+ // Find the edges we are going to remove.
+ vector<edge_descriptor> edgesToBeRemoved;
+ BGL_FORALL_EDGES(e, graph, PrimaryGraph) {
+ const PrimaryGraphEdge& edge = graph[e];
+
+ // If it is marked as removed by transitive reduction, skip it.
+ if(edge.isNonTransitiveReductionEdge) {
+ continue;
+ }
+
+ // Check coverage.
+ if(edge.coverage > lowCoverageThreshold) {
+ continue;
+ }
+
+ // Check estimated offset.
+ if(edge.info.offsetInBases < int64_t(minOffset)) {
+ continue;
+ }
+
+ // Check out-edges of v0.
+ const vertex_descriptor v0 = source(e, graph);
+ bool v0HasStrongOutEdge = false;
+ BGL_FORALL_OUTEDGES(v0, e0, graph, PrimaryGraph) {
+ // If it is marked as removed by transitive reduction, ignore it.
+ if(graph[e0].isNonTransitiveReductionEdge) {
+ continue;
+ }
+ if(graph[e0].coverage >= highCoverageThreshold) {
+ v0HasStrongOutEdge = true;
+ break;
+ }
+ }
+ if(not v0HasStrongOutEdge) {
+ continue;
+ }
+
+ // Check in-edges of v1.
+ const vertex_descriptor v1 = target(e, graph);
+ bool v1HasStrongOutEdge = false;
+ BGL_FORALL_INEDGES(v1, e1, graph, PrimaryGraph) {
+ // If it is marked as removed by transitive reduction, ignore it.
+ if(graph[e1].isNonTransitiveReductionEdge) {
+ continue;
+ }
+ if(graph[e1].coverage >= highCoverageThreshold) {
+ v1HasStrongOutEdge = true;
+ break;
+ }
+ }
+ if(not v1HasStrongOutEdge) {
+ continue;
+ }
+
+ // If all above checks passed, this edge will be removed.
+ edgesToBeRemoved.push_back(e);
+ }
+
+ // Remove the edges we found.
+ for(const edge_descriptor e: edgesToBeRemoved) {
+ boost::remove_edge(e, graph);
+ }
+}
+
+
+
+// Remove edges for which loss = (commonCount - coverage) / commonCount > maxLoss
+void PrimaryGraph::removeWeakEdges(double maxLoss)
+{
+ PrimaryGraph& graph = *this;
+
+ // Find the edges we are going to remove.
+ vector<edge_descriptor> edgesToBeRemoved;
+ BGL_FORALL_EDGES(e, graph, PrimaryGraph) {
+ const PrimaryGraphEdge& edge = graph[e];
+ const double loss = double(edge.info.common - edge.coverage) / double(edge.info.common);
+ if(loss > maxLoss) {
+ edgesToBeRemoved.push_back(e);
+ }
+ }
+
+
+
+ // Remove the edges we found.
+ for(const edge_descriptor e: edgesToBeRemoved) {
+ boost::remove_edge(e, graph);
+ }
+
+}
+
+
+#if 0
+// Given sets of two primary in-edges and two primary out-edges,
+// find primary mid-edges in-between that can be used for detangling.
+void GlobalPathGraph::searchForDetangling(
+ const array<MarkerGraphEdgeId, 2>& in,
+ const array<MarkerGraphEdgeId, 2>& out,
+ uint64_t highCommonCountThreshold,
+ uint64_t lowCommonCountThreshold,
+ const Assembler& assembler,
+ array<array<vector<MarkerGraphEdgeId>, 2>, 2>& mid)
+{
+ // Loop over the primary journeys of oriented reads in the "in" primary edges.
+ // Only use the journey portion following the "in" primary edges.
+ array<vector<MarkerGraphEdgeId>, 2> inFollowers;
+ array<vector<uint64_t>, 2> inFollowersCommonCount;
+ for(uint64_t i=0; i<2; i++) {
+ assembler.markerGraph.followPrimaryJourneysForward(in[i], inFollowers[i], inFollowersCommonCount[i]);
+ }
+
+
+
+ // Find inFollowers that have high common count with in[0]
+ // and low common count with in[1], or vice versa.
+ array<vector<MarkerGraphEdgeId>, 2> inCandidates;
+ {
+ uint64_t i0 = 0;
+ uint64_t i1 = 0;
+ uint64_t end0 = inFollowers[0].size();
+ uint64_t end1 = inFollowers[1].size();
+ while(i0<end0 and i1<end1) {
+ const MarkerGraphEdgeId edgeId0 = inFollowers[0][i0];
+ const MarkerGraphEdgeId edgeId1 = inFollowers[1][i1];
+
+ if(edgeId0 < edgeId1) {
+ // edgeId0 is in inFollowers[0] but not in inFollowers[1].
+ if(inFollowersCommonCount[0][i0] >= highCommonCountThreshold) {
+ inCandidates[0].push_back(edgeId0);
+ }
+ ++i0;
+ }
+
+ else if(edgeId1 < edgeId0) {
+ // edgeId1 is in inFollowers[1] but not in inFollowers[0].
+ if(inFollowersCommonCount[1][i1] >= highCommonCountThreshold) {
+ inCandidates[1].push_back(edgeId1);
+ }
+ ++i1;
+ }
+
+ else {
+ // edgeId0 is in inFollowers[0] and in inFollowers[1].
+ const uint64_t common0 = inFollowersCommonCount[0][i0];
+ const uint64_t common1 = inFollowersCommonCount[1][i1];
+ if(common0 >= highCommonCountThreshold and common1 <= lowCommonCountThreshold) {
+ inCandidates[0].push_back(edgeId0);
+ }
+ else if(common1 >= highCommonCountThreshold and common0 <= lowCommonCountThreshold) {
+ inCandidates[1].push_back(edgeId1);
+ }
+ ++i0;
+ ++i1;
+ }
+ }
+ }
+
+
+
+ // Loop over the primary journeys of oriented reads in the "out" primary edges.
+ // Only use the journey portion preceding the "out" primary edges.
+ array<vector<MarkerGraphEdgeId>, 2> outPreceders;
+ array<vector<uint64_t>, 2> outPrecedersCommonCount;
+ for(uint64_t i=0; i<2; i++) {
+ assembler.markerGraph.followPrimaryJourneysBackward(out[i], outPreceders[i], outPrecedersCommonCount[i]);
+ }
+
+
+
+ // Find outPreceders that have high common count with out[0]
+ // and low common count with out[1], or vice versa.
+ array<vector<MarkerGraphEdgeId>, 2> outCandidates;
+ {
+ uint64_t i0 = 0;
+ uint64_t i1 = 0;
+ uint64_t end0 = outPreceders[0].size();
+ uint64_t end1 = outPreceders[1].size();
+ while(i0<end0 and i1<end1) {
+ const MarkerGraphEdgeId edgeId0 = outPreceders[0][i0];
+ const MarkerGraphEdgeId edgeId1 = outPreceders[1][i1];
+
+ if(edgeId0 < edgeId1) {
+ // edgeId0 is in outPreceders[0] but not in outPreceders[1].
+ if(outPrecedersCommonCount[0][i0] >= highCommonCountThreshold) {
+ outCandidates[0].push_back(edgeId0);
+ }
+ ++i0;
+ }
+
+ else if(edgeId1 < edgeId0) {
+ // edgeId1 is in outPreceders[1] but not in outPreceders[0].
+ if(outPrecedersCommonCount[1][i1] >= highCommonCountThreshold) {
+ outCandidates[1].push_back(edgeId1);
+ }
+ ++i1;
+ }
+
+ else {
+ // edgeId0 is in outPreceders[0] and in outPreceders[1].
+ const uint64_t common0 = outPrecedersCommonCount[0][i0];
+ const uint64_t common1 = outPrecedersCommonCount[1][i1];
+ if(common0 >= highCommonCountThreshold and common1 <= lowCommonCountThreshold) {
+ outCandidates[0].push_back(edgeId0);
+ }
+ else if(common1 >= highCommonCountThreshold and common0 <= lowCommonCountThreshold) {
+ outCandidates[1].push_back(edgeId1);
+ }
+ ++i0;
+ ++i1;
+ }
+ }
+ }
+
+
+
+ // Find MarkerGraphEdgeIds that are both inCandidates and outCandidates.
+ for(uint64_t i0=0; i0<2; i0++) {
+ for(uint64_t i1=0; i1<2; i1++) {
+ mid[i0][i1].clear();
+ std::set_intersection(
+ inCandidates[i0].begin(), inCandidates[i0].end(),
+ outCandidates[i1].begin(), outCandidates[i1].end(),
+ back_inserter(mid[i0][i1]));
+ }
+ }
+}
+#endif
+
diff --git a/src/mode3-PrimaryGraph.hpp b/src/mode3-PrimaryGraph.hpp
new file mode 100644
index 0000000..2a10877
--- /dev/null
+++ b/src/mode3-PrimaryGraph.hpp
@@ -0,0 +1,148 @@
+#pragma once
+
+/*******************************************************************************
+
+In a PrimaryGraph, each vertex represents a primary edge of the marker graph.
+Edges are generated by following the reads.
+
+*******************************************************************************/
+
+// Shasta.
+#include "Base.hpp"
+#include "MarkerGraphEdgePairInfo.hpp"
+#include "MultithreadedObject.hpp"
+#include "ReadId.hpp"
+#include "shastaTypes.hpp"
+
+// Boost libraries.
+#include <boost/graph/adjacency_list.hpp>
+
+// Standard library.
+#include "iosfwd.hpp"
+#include "memory.hpp"
+#include "string.hpp"
+#include "utility.hpp"
+#include "vector.hpp"
+
+namespace shasta {
+ class Assembler;
+ class MarkerGraph;
+ namespace mode3 {
+
+ // A connected component of the primary graph,
+ // in which each vertex represents a primary edge of the marker graph.
+ // Edges are created by following the reads on their journeys
+ // over primary marker graph edges.
+ class PrimaryGraphVertex;
+ class PrimaryGraphEdge;
+ class PrimaryGraph;
+ using PrimaryGraphBaseClass = boost::adjacency_list<
+ boost::listS,
+ boost::vecS,
+ boost::bidirectionalS,
+ PrimaryGraphVertex,
+ PrimaryGraphEdge>;
+
+ class PrimaryGraphDisplayOptions;
+
+ }
+}
+
+
+
+// Class to control Graphviz output of PrimaryGraph.
+class shasta::mode3::PrimaryGraphDisplayOptions {
+public:
+ bool labels = true;
+ bool tooltips = true;
+ bool colorVertices = true;
+ bool colorEdges = true;
+ bool showNonTransitiveReductionEdges = true;
+
+ // Thresholds for coloring by corrected Jaccard similarity J'.
+ // If J' <= redJ, the edge is drawn red.
+ // If J' >= greenJ, the edge is drawn green.
+ // For values in between, the color is interpolated.
+ double redJ;
+ double greenJ;
+
+ PrimaryGraphDisplayOptions(double redJ = 0., double greenJ = 1.) :
+ redJ(redJ), greenJ(greenJ) {}
+
+ void makeCompact()
+ {
+ labels = false;
+ tooltips = false;
+ colorVertices = false;
+ colorEdges = false;
+ }
+};
+
+
+
+class shasta::mode3::PrimaryGraphVertex {
+public:
+
+ // The corresponding marker graph edgeId.
+ MarkerGraphEdgeId edgeId;
+};
+
+
+
+class shasta::mode3::PrimaryGraphEdge {
+public:
+ MarkerGraphEdgePairInfo info;
+ uint64_t coverage;
+ bool isNonTransitiveReductionEdge = false;
+};
+
+
+
+class shasta::mode3::PrimaryGraph : public PrimaryGraphBaseClass {
+public:
+
+ std::map<MarkerGraphEdgeId, vertex_descriptor> vertexMap;
+ vertex_descriptor addVertex(MarkerGraphEdgeId);
+
+ void addEdge(
+ MarkerGraphEdgeId,
+ MarkerGraphEdgeId,
+ const MarkerGraphEdgePairInfo&,
+ uint64_t coverage);
+ void addEdgeFromVertexDescriptors(
+ vertex_descriptor,
+ vertex_descriptor,
+ const MarkerGraphEdgePairInfo&,
+ uint64_t coverage);
+
+ void writeGraphviz(
+ const string& name,
+ const PrimaryGraphDisplayOptions&,
+ const MarkerGraph&) const;
+
+ void writeEdgeCoverageHistogram(const string& fileName) const;
+
+ // Create the connected components of this PrimaryGraph,
+ // without changing the PrimaryGraph itself.
+ vector< shared_ptr<PrimaryGraph> > createConnectedComponents(uint64_t minComponentSize) const;
+
+ void localTransitiveReduction(
+ uint64_t distance,
+ uint64_t maxCoverage);
+
+ // Remove cross-edges.
+ // This removes an edge v0->v1 if the following are all true:
+ // - Its coverage is at most lowCoverageThreshold.
+ // - Its estimated offset is at least minOffset.
+ // - v0 has at least one out-edge with coverage at least highCoverageThreshold.
+ // - v1 has at least one in-edge with coverage at least highCoverageThreshold.
+ void removeCrossEdges(
+ uint64_t lowCoverageThreshold,
+ uint64_t highCoverageThreshold,
+ uint64_t minOffset);
+
+ // Remove edges for which loss = (commonCount - coverage) / commonCount > maxLoss
+ void removeWeakEdges(double maxLoss);
+
+};
+
diff --git a/src/mode3-SegmentPairInformation.hpp b/src/mode3-SegmentPairInformation.hpp
deleted file mode 100644
index 80db6e3..0000000
--- a/src/mode3-SegmentPairInformation.hpp
+++ /dev/null
@@ -1,80 +0,0 @@
-#ifndef SHASTA_MODE3_SEGMENT_PAIR_INFORMATION_HPP
-#define SHASTA_MODE3_SEGMENT_PAIR_INFORMATION_HPP
-
-// Shasta.
-#include "invalid.hpp"
-#include "SHASTA_ASSERT.hpp"
-
-// Standard library.
-#include "algorithm.hpp"
-#include "array.hpp"
-#include "cstdint.hpp"
-
-namespace shasta {
- namespace mode3 {
- class SegmentPairInformation;
- }
-}
-
-
-
-// Information for a pair of segments, as computed by
-// mode3::AssemblyGraph::analyzeSegmentPair.
-class shasta::mode3::SegmentPairInformation {
-public:
-
- // The total number of oriented reads present in each segment.
- array<uint64_t, 2> totalCount = {0, 0};
-
- // The number of oriented reads present in both segments.
- // If this is zero, the rest of the information is not valid.
- uint64_t commonCount = 0;
-
- // The offset of segment 1 relative to segment 0, in markers.
- int64_t offset = invalid<int64_t>;
-
- // The number of oriented reads present in each segment
- // but missing from the other segment,
- // and which should have been present based on the above estimated offset.
- array<uint64_t, 2> unexplainedCount = {0, 0};
-
- // The number of oriented reads that appear in only one
- // of the two segments, but based on the estimated offset
- // are too short to appear in the other segment.
- array<uint64_t, 2> shortCount = {0, 0};
-
- // Check that the above counts are consistent.
- void check() const
- {
- for(uint64_t i=0; i<2; i++) {
- SHASTA_ASSERT(commonCount + unexplainedCount[i] + shortCount[i] ==
- totalCount[i]);
- }
- }
-
- // This computes the fraction of unexplained oriented reads,
- // without counting the short ones.
- double unexplainedFraction(uint64_t i) const
- {
- // return double(unexplainedCount[i]) / double(totalCount[i]);
- return double(unexplainedCount[i]) / double(commonCount + unexplainedCount[i]);
- }
- double maximumUnexplainedFraction() const
- {
- return max(unexplainedFraction(0), unexplainedFraction(1));
- }
-
- // Jaccard similarity, without counting the short reads.
- double jaccard() const
- {
- return double(commonCount) / double(commonCount + unexplainedCount[0] + unexplainedCount[1]);
- }
-
- // Raw Jaccard similarity (no special treatment of short reads)
- double rawJaccard() const
- {
- return double(commonCount) / double(totalCount[0] + totalCount[1] - commonCount);
- }
-};
-
-#endif
diff --git a/src/mode3.cpp b/src/mode3.cpp
deleted file mode 100644
index a800cdb..0000000
--- a/src/mode3.cpp
+++ /dev/null
@@ -1,3001 +0,0 @@
-
-// Shasta
-#include "mode3.hpp"
-#include "assembleMarkerGraphPath.hpp"
-#include "deduplicate.hpp"
-#include "findMarkerId.hpp"
-#include "html.hpp"
-#include "MarkerGraph.hpp"
-#include "mode3-AssemblyPath.hpp"
-#include "mode3-JaccardGraph.hpp"
-#include "orderPairs.hpp"
-#include "Reads.hpp"
-#include "ReadFlags.hpp"
-#include "mode3-SegmentPairInformation.hpp"
-#include "SubsetGraph.hpp"
-using namespace shasta;
-using namespace mode3;
-
-// Boost libraries.
-// Include disjoint_sets.hpp first to avoid Boost problems.
-#include <boost/pending/disjoint_sets.hpp>
-#include <boost/icl/discrete_interval.hpp>
-#include <boost/icl/right_open_interval.hpp>
-#include <boost/graph/iteration_macros.hpp>
-#include <boost/graph/strong_components.hpp>
-
-// Standard library.
-#include <bitset>
-#include "fstream.hpp"
-#include <map>
-#include <queue>
-#include <set>
-#include <unordered_set>
-
-#include "MultithreadedObject.tpp"
-template class MultithreadedObject<mode3::AssemblyGraph>;
-
-
-// Each linear chain of marker graph edges generates a segment.
-void AssemblyGraph::createSegmentPaths()
-{
- const bool debug = false;
-
- createNew(markerGraphPaths, "Mode3-MarkerGraphPaths");
- const MarkerGraph::EdgeId edgeCount = markerGraph.edges.size();
- vector<bool> wasFound(edgeCount, false);
-
- using MarkerGraphPath = vector<MarkerGraph::EdgeId>;
- MarkerGraphPath nextEdges;
- MarkerGraphPath previousEdges;
- MarkerGraphPath path;
- MarkerGraphPath reverseComplementedPath;
-
- // Main loop over all edges of the marker graph.
- // At each iteration we find a new linear path of edges.
- for(MarkerGraph::EdgeId startEdgeId=0; startEdgeId<edgeCount; startEdgeId++) {
-
- // If we already found this edge, skip it.
- // It is part of a path we already found.
- if(wasFound[startEdgeId]) {
- continue;
- }
-
- if(debug) {
- cout << "Starting a new path at edge " << startEdgeId << endl;
- }
-
- // Follow the path forward.
- nextEdges.clear();
- MarkerGraph::EdgeId edgeId = startEdgeId;
- bool isCircular = false;
- while(true) {
- const MarkerGraph::Edge edge = markerGraph.edges[edgeId];
- const MarkerGraph::VertexId v1 = edge.target;
- const auto outEdges = markerGraph.edgesBySource[v1];
- if(outEdges.size() != 1) {
- break;
- }
- const auto inEdges = markerGraph.edgesByTarget[v1];
- if(inEdges.size() != 1) {
- break;
- }
- edgeId = outEdges[0];
- if(edgeId == startEdgeId) {
- isCircular = true;
- break;
- }
- nextEdges.push_back(edgeId);
- SHASTA_ASSERT(not wasFound[edgeId]);
- if(debug) {
- cout << "Moving forward: added " << edgeId << endl;
- }
- }
-
- // Follow the path backward.
- previousEdges.clear();
- if(!isCircular) {
- edgeId = startEdgeId;
- while(true) {
- const MarkerGraph::Edge edge = markerGraph.edges[edgeId];
- const MarkerGraph::VertexId v0 = edge.source;
- const auto outEdges = markerGraph.edgesBySource[v0];
- if(outEdges.size() != 1) {
- break;
- }
- const auto inEdges = markerGraph.edgesByTarget[v0];
- if(inEdges.size() != 1) {
- break;
- }
- edgeId = inEdges[0];
- previousEdges.push_back(edgeId);
- SHASTA_ASSERT(not wasFound[edgeId]);
- if(debug) {
- cout << "Moving backward: added " << edgeId << endl;
- }
- }
- }
-
- // Gather the path.
- path.clear();
- copy(previousEdges.rbegin(), previousEdges.rend(), back_inserter(path));
- path.push_back(startEdgeId);
- copy(nextEdges.begin(), nextEdges.end(), back_inserter(path));
-
- // Mark all the edges in the path as found.
- for(const MarkerGraph::EdgeId edgeId: path) {
- if(wasFound[edgeId]) {
- cout << "Assertion failed at " << edgeId << endl;
- SHASTA_ASSERT(0);
- }
- wasFound[edgeId] = true;
- }
-
- // Store this path as a new segment.
- markerGraphPaths.appendVector();
- for(const MarkerGraphEdgeId edgeId: path) {
- markerGraphPaths.append(edgeId);
- }
- }
-
-
-
- // Check that all edges of the marker graph were found.
- SHASTA_ASSERT(find(wasFound.begin(), wasFound.end(), false) == wasFound.end());
-
-
- // Debug output: write the paths.
- if(debug) {
- ofstream csv("Paths.csv");
- for(uint64_t segmentId=0; segmentId<markerGraphPaths.size(); segmentId++) {
- const auto path = markerGraphPaths[segmentId];
- for(const MarkerGraphEdgeId edgeId: path) {
- csv << segmentId << ",";
- csv << edgeId << "\n";
- }
- }
- }
-
-}
-
-
-
-// Compute coverage for all segments.
-// It is computed as average marker graph edge coverage
-// over the marker graph edges in the path of each segment.
-void AssemblyGraph::computeSegmentCoverage()
-{
- // Initialize segmentCoverage.
- createNew(segmentCoverage, "Mode3-SegmentCoverage");
- const uint64_t segmentCount = markerGraphPaths.size();
- segmentCoverage.resize(segmentCount);
-
- // Loop over all segments.
- for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) {
-
- // Access the marker graph path for this segment.
- const span<MarkerGraphEdgeId> path = markerGraphPaths[segmentId];
-
-
- // Loop over this path.
- uint64_t coverageSum = 0.;
- for(uint64_t position=0; position<path.size(); position++) {
- MarkerGraphEdgeId& edgeId = path[position];
-
- // Add the marker intervals on this marker graph edge.
- const span<const MarkerInterval> markerIntervals = markerGraph.edgeMarkerIntervals[edgeId];
- coverageSum += markerIntervals.size();
- }
-
- segmentCoverage[segmentId] = float(coverageSum) / float(path.size());
-
- }
-
-
- // Write a histogram of segment coverage.
- vector<uint64_t> histogram;
- for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) {
- const uint64_t coverage = uint64_t(std::round(segmentCoverage[segmentId]));
- if(coverage >= histogram.size()) {
- histogram.resize(coverage + 1, 0);
- }
- ++histogram[coverage];
- }
- ofstream csv("SegmentCoverageHistogram.csv");
- csv << "Coverage,Frequency\n";
- for(uint64_t coverage=0; coverage<histogram.size(); coverage++) {
- csv << coverage << "," << histogram[coverage] << "\n";
- }
-}
-
-
-
-void AssemblyGraph::computeMarkerGraphEdgeTable(size_t threadCount)
-{
-
- // Initialize the marker graph edge table.
- createNew(markerGraphEdgeTable, "Mode3-MarkerGraphEdgeTable");
- markerGraphEdgeTable.resize(markerGraph.edges.size());
- fill(markerGraphEdgeTable.begin(), markerGraphEdgeTable.end(), make_pair(
- std::numeric_limits<uint64_t>::max(),
- std::numeric_limits<uint32_t>::max()
- ));
-
- // Fill in the marker graph edge table.
- const uint64_t batchSize = 100;
- setupLoadBalancing(markerGraphPaths.size(), batchSize);
- runThreads(&AssemblyGraph::computeMarkerGraphEdgeTableThreadFunction, threadCount);
-}
-
-
-
-void AssemblyGraph::computeMarkerGraphEdgeTableThreadFunction(size_t threadId)
-{
-
- // Loop over all batches assigned to this thread.
- uint64_t begin, end;
- while(getNextBatch(begin, end)) {
-
- // Loop over all vertices assigned to this batch.
- for(uint64_t segmentId=begin; segmentId!=end; ++segmentId) {
- const span<MarkerGraphEdgeId> path = markerGraphPaths[segmentId];
-
- // Loop over the path of this segment.
- for(uint64_t position=0; position<path.size(); position++) {
- const MarkerGraphEdgeId edgeId = path[position];
-
- // Store the marker graph edge table entry for this edge.
- SHASTA_ASSERT(edgeId < markerGraphEdgeTable.size());
- markerGraphEdgeTable[edgeId] = make_pair(segmentId, position);
- }
- }
-
- }
-}
-
-
-
-void AssemblyGraph::computeMarkerGraphJourneys(size_t threadCount)
-{
- const bool debug = true;
-
- createNew(markerGraphJourneys, "tmp-mode3-MarkerGraphJourneys");
-
- uint64_t batchSize = 1000;
- markerGraphJourneys.beginPass1(markers.size());
- setupLoadBalancing(markerGraphEdgeTable.size(), batchSize);
- runThreads(&AssemblyGraph::computeMarkerGraphJourneysPass1, threadCount);
- markerGraphJourneys.beginPass2();
- setupLoadBalancing(markerGraphEdgeTable.size(), batchSize);
- runThreads(&AssemblyGraph::computeMarkerGraphJourneysPass2, threadCount);
- markerGraphJourneys.endPass2();
-
- batchSize = 100;
- setupLoadBalancing(markerGraphJourneys.size(), batchSize);
- runThreads(&AssemblyGraph::sortMarkerGraphJourneys, threadCount);
-
- if(debug) {
- ofstream csv("MarkerGraphJourneys.csv");
- csv << "OrientedReadId,SegmentId,Position,ordinal0,Ordinal1\n";
- for(uint64_t i=0; i<markers.size(); i++) {
- const OrientedReadId orientedReadId = OrientedReadId::fromValue(ReadId(i));
- const auto markerGraphJourney = markerGraphJourneys[i];
- for(uint64_t position=0; position<markerGraphJourney.size(); position++) {
- const MarkerGraphJourneyEntry& entry = markerGraphJourney[position];
- csv << orientedReadId << ",";
- csv << entry.segmentId << ",";
- csv << entry.position << ",";
- csv << entry.ordinals[0] << ",";
- csv << entry.ordinals[1] << "\n";
- }
- }
-
- }
-}
-
-
-
-void AssemblyGraph::computeMarkerGraphJourneysPass1(size_t threadId)
-{
- computeMarkerGraphJourneysPass12(1);
-}
-
-
-
-void AssemblyGraph::computeMarkerGraphJourneysPass2(size_t threadId)
-{
- computeMarkerGraphJourneysPass12(2);
-}
-
-
-
-void AssemblyGraph::computeMarkerGraphJourneysPass12(uint64_t pass)
-{
- // Loop over all batches assigned to this thread.
- uint64_t begin, end;
- while(getNextBatch(begin, end)) {
-
- // Loop over marker graph edges assigned to this batch.
- for(MarkerGraph::EdgeId edgeId=begin; edgeId!=end; ++edgeId) {
- const auto& p = markerGraphEdgeTable[edgeId];
- const uint64_t segmentId = p.first;
- const uint32_t position = p.second;
- SHASTA_ASSERT(segmentId != std::numeric_limits<uint64_t>::max());
- SHASTA_ASSERT(position != std::numeric_limits<uint32_t>::max());
-
- // Loop over the marker intervals of this marker graph edge..
- const auto markerIntervals = markerGraph.edgeMarkerIntervals[edgeId];
- for(const MarkerInterval& markerInterval: markerIntervals) {
- const OrientedReadId orientedReadId = markerInterval.orientedReadId;
-
- if(pass == 1) {
- markerGraphJourneys.incrementCountMultithreaded(orientedReadId.getValue());
- } else {
- MarkerGraphJourneyEntry markerGraphJourneyEntry;
- markerGraphJourneyEntry.segmentId = segmentId;
- markerGraphJourneyEntry.position = position;
- markerGraphJourneyEntry.ordinals = markerInterval.ordinals;
- markerGraphJourneys.storeMultithreaded(orientedReadId.getValue(), markerGraphJourneyEntry);
- }
- }
- }
- }
-}
-
-
-
-void AssemblyGraph::sortMarkerGraphJourneys(size_t threadId)
-{
- uint64_t begin, end;
- while(getNextBatch(begin, end)) {
-
- // Loop over marker graph edges assigned to this batch.
- for(uint64_t i=begin; i!=end; ++i) {
- auto markerGraphJourney = markerGraphJourneys[i];
- sort(markerGraphJourney.begin(), markerGraphJourney.end());
- }
- }
-}
-
-
-// The assembly graph journey of an oriented read
-// is the sequence of segmentIds it encounters.
-void AssemblyGraph::computeAssemblyGraphJourneys()
-{
- const bool debug = true;
-
- // Initialize the assembly graph journeys.
- createNew(assemblyGraphJourneys, "Mode3-AssemblyGraphJourneys");
-
- // Work vector defined outside the loop to reduce memory allocation overhead.
- vector<AssemblyGraphJourneyEntry> assemblyGraphJourney;
-
- // Loop over all oriented reads.
- for(uint64_t i=0; i<markerGraphJourneys.size(); i++) {
-
- // Access the marker graph journey for this oriented read.
- const span<MarkerGraphJourneyEntry> markerGraphJourney = markerGraphJourneys[i];
-
- // Compute the assembly graph journey.
- computeAssemblyGraphJourney(markerGraphJourney, assemblyGraphJourney);
-
- // Store it.
- assemblyGraphJourneys.appendVector(assemblyGraphJourney);
- }
-
-
-
- // Write them out.
- if(debug) {
- ofstream csv("AssemblyGraphJourneys.csv");
- for(uint64_t i=0; i<assemblyGraphJourneys.size(); i++) {
- const ReadId readId = ReadId(i >> 1);
- const Strand strand = i & 1;
- const OrientedReadId orientedReadId(readId, strand);
- const span<AssemblyGraphJourneyEntry> assemblyGraphJourney = assemblyGraphJourneys[i];
-
- csv << orientedReadId << ",";
- for(const AssemblyGraphJourneyEntry entry: assemblyGraphJourney) {
- csv << entry.segmentId << ",";
- }
- csv << endl;
- }
- }
-
-
-
- // Write them out again, with more details.
- if(debug) {
- ofstream csv("AssemblyGraphJourneysDetails.csv");
- csv << "OrientedReadId,Position,SegmentId,"
- "First position,First ordinal0,First ordinal1,"
- "Last position,Last ordinal0,Last ordinal1\n";
- for(uint64_t i=0; i<assemblyGraphJourneys.size(); i++) {
- const ReadId readId = ReadId(i >> 1);
- const Strand strand = i & 1;
- const OrientedReadId orientedReadId(readId, strand);
- const span<AssemblyGraphJourneyEntry> assemblyGraphJourney = assemblyGraphJourneys[i];
-
- for(uint64_t position=0; position<assemblyGraphJourney.size(); position++) {
- const AssemblyGraphJourneyEntry& entry = assemblyGraphJourney[position];
- const MarkerGraphJourneyEntry& first = entry.markerGraphJourneyEntries[0];
- const MarkerGraphJourneyEntry& last = entry.markerGraphJourneyEntries[1];
- csv << orientedReadId << ",";
- csv << position << ",";
- csv << entry.segmentId << ",";
- csv << first.position << ",";
- csv << first.ordinals[0] << ",";
- csv << first.ordinals[1] << ",";
- csv << last.position << ",";
- csv << last.ordinals[0] << ",";
- csv << last.ordinals[1] << "\n";
- }
- }
- }
-
-
-}
-
-
-
-// Given the marker graph journey of an oriented read,
-// find the corresponding assembly graph journey.
-void AssemblyGraph::computeAssemblyGraphJourney(
- const span<MarkerGraphJourneyEntry> markerGraphJourney,
- vector<AssemblyGraphJourneyEntry>& assemblyGraphJourney)
-{
- // Start with an empty journey.
- assemblyGraphJourney.clear();
-
- // Loop over the marker graph journey, looking for places
- // where the segmentId changes.
- for(uint32_t i=0; i<markerGraphJourney.size(); /* Increment later */) {
- const MarkerGraphJourneyEntry& markerGraphJourneyEntry = markerGraphJourney[i];
- const uint64_t segmentId = markerGraphJourneyEntry.segmentId;
-
- // Move to the end of the streak with the same segmentId.
- const uint32_t streakBegin = i;
- uint32_t streakEnd = streakBegin + 1;
- for(;
- streakEnd<markerGraphJourney.size() and
- (markerGraphJourney[streakEnd].segmentId == segmentId);
- streakEnd++) {
- }
-
- // Store this segmentId in the assembly graph journey.
- AssemblyGraphJourneyEntry assemblyGraphJourneyEntry;
- assemblyGraphJourneyEntry.segmentId = segmentId;
- assemblyGraphJourneyEntry.markerGraphJourneyEntries[0] = markerGraphJourney[streakBegin];
- assemblyGraphJourneyEntry.markerGraphJourneyEntries[1] = markerGraphJourney[streakEnd - 1];
- assemblyGraphJourney.push_back(assemblyGraphJourneyEntry);
-
- // Prepare to handle the next segment.
- i = streakEnd;
- }
-}
-
-
-
-void AssemblyGraph::computeAssemblyGraphJourneyInfos()
-{
- const bool debug = true;
-
- const uint64_t segmentCount = markerGraphPaths.size();
- const uint64_t readCount = assemblyGraphJourneys.size()/2;
-
- createNew(assemblyGraphJourneyInfos, "Mode3-AssemblyGraphJourneyInfos");
-
- // Pass 1.
- assemblyGraphJourneyInfos.beginPass1(segmentCount);
- for(ReadId readId=0; readId<readCount; readId++) {
- for(Strand strand=0; strand<2; strand++) {
- const OrientedReadId orientedReadId(readId, strand);
- const auto assemblyGraphJourney = assemblyGraphJourneys[orientedReadId.getValue()];
-
- for(uint64_t position=0; position<assemblyGraphJourney.size(); position++) {
- const AssemblyGraphJourneyEntry& entry = assemblyGraphJourney[position];
- assemblyGraphJourneyInfos.incrementCount(entry.segmentId);
- }
- }
- }
-
- // Pass 2.
- assemblyGraphJourneyInfos.beginPass2();
- for(ReadId readId=0; readId<readCount; readId++) {
- for(Strand strand=0; strand<2; strand++) {
- const OrientedReadId orientedReadId(readId, strand);
- const auto assemblyGraphJourney = assemblyGraphJourneys[orientedReadId.getValue()];
-
- for(uint64_t position=0; position<assemblyGraphJourney.size(); position++) {
- const AssemblyGraphJourneyEntry& entry = assemblyGraphJourney[position];
- assemblyGraphJourneyInfos.store(entry.segmentId, make_pair(orientedReadId, position));
- }
- }
- }
- assemblyGraphJourneyInfos.endPass2();
-
- // Sort.
- for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) {
- const auto v = assemblyGraphJourneyInfos[segmentId];
- sort(v.begin(), v.end());
- }
-
-
- if(debug) {
- ofstream csv("SegmentJourneyInfo.csv");
- csv << "SegmentId,OrientedReadId,Position in assembly graph journey\n";
- for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) {
- const auto v = assemblyGraphJourneyInfos[segmentId];
- for(const auto& p: v) {
- csv << segmentId << ",";
- csv << p.first << ",";
- csv << p.second << "\n";
- }
- }
- }
-}
-
-
-
-// Find out if a segment contains a given OrientedReadId.
-// This returns true if assemblyGraphJourneyInfos[segmentId]
-// contains an entry with the given OrientedReadId.
-bool AssemblyGraph::segmentContainsOrientedRead(
- uint64_t segmentId,
- OrientedReadId orientedReadId) const
-{
- for(const auto& p: assemblyGraphJourneyInfos[segmentId]) {
- if(p.first == orientedReadId) {
- return true;
- }
- }
- return false;
-}
-
-
-
-void AssemblyGraph::findTransitions(std::map<SegmentPair, Transitions>& transitionMap)
-{
- transitionMap.clear();
-
- for(ReadId readId=0; readId<assemblyGraphJourneys.size()/2; readId++) {
- for(Strand strand=0; strand<2; strand++) {
- const OrientedReadId orientedReadId(readId, strand);
- const auto journey = assemblyGraphJourneys[orientedReadId.getValue()];
-
- for(uint64_t i=1; i<journey.size(); i++) {
- const auto& previous = journey[i-1];
- const auto& current = journey[i];
- SHASTA_ASSERT(previous.segmentId != current.segmentId);
-
- const SegmentPair segmentPair = make_pair(previous.segmentId, current.segmentId);
- transitionMap[segmentPair].push_back(
- make_pair(orientedReadId, Transition({
- previous.markerGraphJourneyEntries[1],
- current.markerGraphJourneyEntries[0]})));
-
- }
- }
- }
-}
-
-
-
-void AssemblyGraph::createLinks(
- const std::map<SegmentPair, Transitions>& transitionMap,
- uint64_t minCoverage)
-{
- createNew(links, "Mode3-Links");
- createNew(transitions, "Mode3-Transitions");
- for(const auto& p: transitionMap) {
- const auto& transitionVector = p.second;
- const uint64_t coverage = transitionVector.size();
- if(coverage >= minCoverage) {
- const uint64_t segmentId0 = p.first.first;
- const uint64_t segmentId1 = p.first.second;
- links.push_back(Link(segmentId0, segmentId1));
- transitions.appendVector(transitionVector);
- }
- }
-
- // Store link separation.
- for(uint64_t linkId=0; linkId<links.size(); linkId++) {
- Link& link = links[linkId];
-
- // Check if these two segments are adjacent in the marker graph.
- const uint64_t segmentId0 = link.segmentId0;
- const uint64_t segmentId1 = link.segmentId1;
- const auto path0 = markerGraphPaths[segmentId0];
- const auto path1 = markerGraphPaths[segmentId1];
- const MarkerGraph::Edge lastEdge0 = markerGraph.edges[path0.back()];
- const MarkerGraph::Edge firstEdge1 = markerGraph.edges[path1.front()];
- if(lastEdge0.target == firstEdge1.source) {
- // The segments are adjacent. Set the link separation to 0.
- link.segmentsAreAdjacent = true;
- link.separation = 0;
- } else {
- // The segments are not adjacent.
- // Use the transitions to estimate the separation.
- const auto linkTransitions = transitions[linkId];
- const double separation = linkSeparation(linkTransitions, path0.size());
-
- link.segmentsAreAdjacent = false;
- link.separation = int32_t(std::round(separation));
- }
- }
-
-
-
- ofstream csv("Links.csv");
- csv << "LinkId,SegmentId0,SegmentId1,Coverage,Adjacent,Separation\n";
- for(uint64_t linkId=0; linkId<links.size(); linkId++) {
- Link& link = links[linkId];
-
- csv << linkId << ",";
- csv << link.segmentId0 << ",";
- csv << link.segmentId1 << ",";
- csv << transitions[linkId].size() << ",";
- csv << (link.segmentsAreAdjacent ? "Yes" : "No") << ",";
- csv << link.separation << "\n";
- }
-
-}
-
-
-
-// Initial construction of the AssemblyGraph.
-AssemblyGraph::AssemblyGraph(
- const string& largeDataFileNamePrefix,
- size_t largeDataPageSize,
- size_t threadCount,
- uint64_t readRepresentation,
- uint64_t k, // Marker length
- const Reads& reads,
- const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
- const MarkerGraph& markerGraph,
- const ConsensusCaller& consensusCaller) :
- MultithreadedObject<AssemblyGraph>(*this),
- largeDataFileNamePrefix(largeDataFileNamePrefix),
- largeDataPageSize(largeDataPageSize),
- readRepresentation(readRepresentation),
- k(k),
- reads(reads),
- markers(markers),
- markerGraph(markerGraph),
- consensusCaller(consensusCaller)
-{
- // K must be even.
- SHASTA_ASSERT((k % 2) == 0);
-
- // This assumes RLE is not used.
- SHASTA_ASSERT(reads.representation == 0);
-
- // Minimum number of transitions (oriented reads) to create a link.
- // If this equals 1, then the sequence of segments visited by every
- // oriented read is a path in the graph.
- // But that is not desirable because of the extra edges it causes.
- const uint64_t minCoverage = 3; // EXPOSE WHEN CODE STABILIZES
-
- // Create a segment for each linear chain of marker graph edges.
- createSegmentPaths();
- computeSegmentCoverage();
-
- // Assembled sequence for each segment.
- assembleSegments();
-
- // Keep track of the segment and position each marker graph edge corresponds to.
- computeMarkerGraphEdgeTable(threadCount);
-
- // Compute marker graph and assembly graph journeys of all oriented reads.
- // We permanently store only the assembly graph journeys.
- computeMarkerGraphJourneys(threadCount);
- computeAssemblyGraphJourneys();
- markerGraphJourneys.remove();
- computeAssemblyGraphJourneyInfos();
-
- // Find transitions from segment to segment in the marker graph
- // journeys of all oriented reads, and store them keyed by the pair of segments.
- std::map<SegmentPair, Transitions> transitionMap;
- findTransitions(transitionMap);
-
- // Create a links between pairs of segments with a sufficient number of transitions.
- createLinks(transitionMap, minCoverage);
- createConnectivity();
- flagBackSegments();
-
- cout << "The mode 3 assembly graph has " << markerGraphPaths.size() << " segments and " <<
- links.size() << " links." << endl;
-}
-
-
-
-string AssemblyGraph::largeDataName(const string& name) const
-{
- if(largeDataFileNamePrefix.empty()) {
- return ""; // Anonymous;
- } else {
- return largeDataFileNamePrefix + name;
- }
-}
-
-
-
-// Constructor from binary data.
-AssemblyGraph::AssemblyGraph(
- const string& largeDataFileNamePrefix,
- uint64_t readRepresentation,
- uint64_t k, // Marker length
- const Reads& reads,
- const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
- const MarkerGraph& markerGraph,
- const ConsensusCaller& consensusCaller) :
- MultithreadedObject<AssemblyGraph>(*this),
- largeDataFileNamePrefix(largeDataFileNamePrefix),
- readRepresentation(readRepresentation),
- k(k),
- reads(reads),
- markers(markers),
- markerGraph(markerGraph),
- consensusCaller(consensusCaller)
-{
- accessExistingReadOnly(markerGraphPaths, "Mode3-MarkerGraphPaths");
- accessExistingReadOnly(segmentCoverage, "Mode3-SegmentCoverage");
- accessExistingReadOnly(segmentSequences, "Mode3-SegmentSequences");
- accessExistingReadOnly(segmentVertexOffsets, "Mode3-SegmentVertexOffsets");
- accessExistingReadOnly(markerGraphEdgeTable, "Mode3-MarkerGraphEdgeTable");
- accessExistingReadOnly(assemblyGraphJourneys, "Mode3-AssemblyGraphJourneys");
- accessExistingReadOnly(assemblyGraphJourneyInfos, "Mode3-AssemblyGraphJourneyInfos");
- accessExistingReadOnly(links, "Mode3-Links");
- accessExistingReadOnly(transitions, "Mode3-Transitions");
- accessExistingReadOnly(linksBySource, "Mode3-LinksBySource");
- accessExistingReadOnly(linksByTarget, "Mode3-LinksByTarget");
- accessExistingReadOnly(isBackSegment, "Mode3-IsBackSegment");
- accessExistingReadOnly(clusterIds, "Mode3-ClusterIds");
-}
-
-
-
-void AssemblyGraph::createConnectivity()
-{
- createNew(linksBySource, "Mode3-LinksBySource");
- createNew(linksByTarget, "Mode3-LinksByTarget");
-
- linksBySource.beginPass1(links.size());
- linksByTarget.beginPass1(links.size());
- for(uint64_t linkId=0; linkId<links.size(); linkId++) {
- const Link& link = links[linkId];
- linksBySource.incrementCount(link.segmentId0);
- linksByTarget.incrementCount(link.segmentId1);
- }
- linksBySource.beginPass2();
- linksByTarget.beginPass2();
- for(uint64_t linkId=0; linkId<links.size(); linkId++) {
- const Link& link = links[linkId];
- linksBySource.store(link.segmentId0, linkId);
- linksByTarget.store(link.segmentId1, linkId);
- }
- linksBySource.endPass2();
- linksByTarget.endPass2();
-}
-
-
-
-uint64_t AssemblyGraph::findLink(uint64_t segmentId0, uint64_t segmentId1) const
-{
- for(const uint64_t linkId: linksBySource[segmentId0]) {
- if(links[linkId].segmentId1 == segmentId1) {
- return linkId;
- }
- }
- SHASTA_ASSERT(0);
-}
-
-
-
-// Flag back-segments.
-// This does not do a full blown search for locally strongly connected components.
-// A segment is marked as a back-segment if:
-// - It has only a single incoming link.
-// - It has a single outgoing link.
-// - The incoming and outgoing links both connect to/from the same segment.
-void AssemblyGraph::flagBackSegments()
-{
- const uint64_t segmentCount = markerGraphPaths.size();
- createNew(isBackSegment, "Mode3-IsBackSegment");
- isBackSegment.resize(segmentCount);
-
- uint64_t backSegmentCount = 0;
- for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) {
-
- // Initially flag it as not a back-segment.
- isBackSegment[segmentId] = false;
-
- // For a back-segment, there must be a single incoming link.
- const auto incomingLinks = linksByTarget[segmentId];
- if(incomingLinks.size() != 1) {
- continue;
- }
-
- // For a back-segment, there must be a single outgoing link.
- const auto outgoingLinks = linksBySource[segmentId];
- if(outgoingLinks.size() != 1) {
- continue;
- }
-
- // For a back-segment, the incoming and outgoing links
- // both connect to/from the same segment.
- const uint64_t incomingLinkId = incomingLinks[0];
- const uint64_t outgoingLinkId = outgoingLinks[0];
- const Link& incomingLink = links[incomingLinkId];
- const Link& outgoingLink = links[outgoingLinkId];
- if(incomingLink.segmentId0 != outgoingLink.segmentId1) {
- continue;
- }
-
- // Flag it as a back-segment.
- isBackSegment[segmentId] = true;
- ++backSegmentCount;
- }
-
- cout << "Found " << backSegmentCount << " back-segments." << endl;
-}
-
-
-
-// Get the children or parents of a given segment.
-// Only use links with at least a specified coverage.
-void AssemblyGraph::getChildrenOrParents(
- uint64_t segmentId,
- uint64_t direction, // 0=forward (children), 1=backward (parents).
- uint64_t minimumLinkCoverage,
- vector<uint64_t>& childrenOrParents) const
-{
- switch(direction) {
- case 0:
- getChildren(segmentId, minimumLinkCoverage, childrenOrParents);
- break;
- case 1:
- getParents(segmentId, minimumLinkCoverage, childrenOrParents);
- break;
- default:
- SHASTA_ASSERT(0);
- }
-}
-
-
-
-void AssemblyGraph::getChildren(
- uint64_t segmentId,
- uint64_t minimumLinkCoverage,
- vector<uint64_t>& children) const
-{
- children.clear();
- for(const auto linkId: linksBySource[segmentId]) {
- if(transitions.size(linkId) >= minimumLinkCoverage) {
- const Link& link = links[linkId];
- children.push_back(link.segmentId1);
- }
- }
-}
-
-
-
-void AssemblyGraph::getParents(
- uint64_t segmentId,
- uint64_t minimumLinkCoverage,
- vector<uint64_t>& parents) const
-{
- parents.clear();
- for(const auto linkId: linksByTarget[segmentId]) {
- if(transitions.size(linkId) >= minimumLinkCoverage) {
- const Link& link = links[linkId];
- parents.push_back(link.segmentId0);
- }
- }
-}
-
-
-
-void AssemblyGraph::writeGfa(const string& baseName) const
-{
- ofstream gfa(baseName + ".gfa");
- ofstream csv(baseName + ".csv");
-
- // Write the headers.
- gfa << "H\tVN:Z:1.0\n";
- csv << "Segment,Path Length,Sequence Length,Average coverage,Read count\n";
-
- // Write the segments.
- for(uint64_t segmentId=0; segmentId<markerGraphPaths.size(); segmentId++) {
-
- const auto sequence = segmentSequences[segmentId];
- gfa <<"S\t" << segmentId << "\t";
- copy(sequence.begin()+k/2, sequence.end()-k/2, ostream_iterator<Base>(gfa));
- gfa << "\n";
-
- const auto path = markerGraphPaths[segmentId];
- csv << segmentId << ",";
- csv << path.size() << ",";
- csv << sequence.size() << ",";
- csv << segmentCoverage[segmentId] << ",";
- csv << assemblyGraphJourneyInfos[segmentId].size() << "\n";
- }
-
- // Write the links.
- for(const Link& link: links) {
- if(true /*link.segmentsAreAdjacent*/) {
- gfa << "L\t" <<
- link.segmentId0 << "\t+\t" <<
- link.segmentId1 << "\t+\t0M\n";
- } else {
- // This writes non-adjacent links as Jumps (GFA 1.2).
- // The original Bandage does not display them.
- // BandageNG does, but they are not taken into account during graph
- // creation, so it i not useful to write them like this.
- // For this reason, the if condition above was set to true,
- // so this branch is never reached.
- // Leaveing the code in place for possible future use.
- gfa << "J\t" <<
- link.segmentId0 << "\t+\t" <<
- link.segmentId1 << "\t+\t" << k * link.separation << "\n";
- }
- }
-
-}
-
-
-
-// Find the distinct oriented reads that appear on the path
-// of a segment. Also return the average edge coverage for the path.
-double AssemblyGraph::findOrientedReadsOnSegment(
- uint64_t segmentId,
- vector<OrientedReadId>& orientedReadIdsArgument) const
-{
- // Loop over the marker graph path corresponding to this segment.
- const span<const MarkerGraphEdgeId> path = markerGraphPaths[segmentId];
- double coverage = 0.;
- std::set<OrientedReadId> orientedReadIds;
- for(const MarkerGraphEdgeId& edgeId: path) {
-
- // Loop over the marker intervals for this marker graph edge.
- const span<const MarkerInterval> markerIntervals = markerGraph.edgeMarkerIntervals[edgeId];
- coverage += double(markerIntervals.size());
- for(const MarkerInterval& markerInterval: markerIntervals) {
- orientedReadIds.insert(markerInterval.orientedReadId);
- }
- }
-
- // Copy the oriented reads to the vector passed as an argument.
- orientedReadIdsArgument.clear();
- orientedReadIdsArgument.insert(orientedReadIdsArgument.end(),
- orientedReadIds.begin(), orientedReadIds.end());
-
- return coverage / double(path.size());
-}
-
-
-
-// Get information about the oriented reads that appear on the
-// marker graph path of a segment.
-void AssemblyGraph::getOrientedReadsOnSegment(
- uint64_t segmentId,
- SegmentOrientedReadInformation& information) const
-{
- // A data structure that, for each oriented read we find,
- // contains a sum of offsets and the number of marker graph vertices
- // that contributed to the sum.
- std::map<OrientedReadId, pair<uint64_t, int64_t> > table;
-
- // Loop over the marker graph path corresponding to this segment.
- const span<const MarkerGraphEdgeId> path = markerGraphPaths[segmentId];
- std::set<OrientedReadId> orientedReadIds;
- for(uint64_t position=0; position<path.size(); position++) {
- const MarkerGraphEdgeId& edgeId = path[position];
-
- // Loop over the marker intervals for this marker graph edge.
- const span<const MarkerInterval> markerIntervals = markerGraph.edgeMarkerIntervals[edgeId];
- for(const MarkerInterval& markerInterval: markerIntervals) {
- const OrientedReadId orientedReadId = markerInterval.orientedReadId;
-
- // Update our table for this oriented read.
- auto it = table.find(orientedReadId);
- if(it == table.end()) {
- tie(it, ignore) = table.insert(make_pair(orientedReadId, make_pair(0ULL, 0LL)));
- }
- auto& p = it->second;
- p.first += 2;
- p.second += int32_t(position) - int32_t(markerInterval.ordinals[0]);
- p.second += int32_t(position + 1) -int32_t(markerInterval.ordinals[1]);
- }
- }
-
-
-
- // Store what we found.
- information.infos.clear();
- for(const auto& p: table) {
- SegmentOrientedReadInformation::Info info;
- info.orientedReadId = p.first;
- const uint64_t n = p.second.first;
- const int64_t sum = p.second.second;
- info.averageOffset = int32_t(std::round(double(sum) / double(n)));
- information.infos.push_back(info);
- }
- }
-
-
-
-// Estimate the offset between two segments.
-// Takes as input SegmentOrientedReadInformation objects
-// for the two segments.
-// Common oriented reads between the two segments are used
-// to estimate the average offset, in markers,
-// between the beginning of the segments.
-// The number of common oriented reads
-// is computed and stored in the last argument.
-// If that is zero, the computed offset is not valid.
-void AssemblyGraph::estimateOffset(
- const SegmentOrientedReadInformation& info0,
- const SegmentOrientedReadInformation& info1,
- int64_t& offset,
- uint64_t& commonOrientedReadCount
- ) const
-{
- offset = 0;
- commonOrientedReadCount = 0;
-
- // Joint loop over common oriented reads in the two segments.
- const auto begin0 = info0.infos.begin();
- const auto begin1 = info1.infos.begin();
- const auto end0 = info0.infos.end();
- const auto end1 = info1.infos.end();
- auto it0 = begin0;
- auto it1 = begin1;
- while((it0 != end0) and (it1 != end1)) {
-
- if(it0->orientedReadId < it1->orientedReadId) {
- ++it0;
- } else if(it1->orientedReadId < it0->orientedReadId) {
- ++it1;
- } else {
- SHASTA_ASSERT(it0->orientedReadId == it1->orientedReadId);
-
- commonOrientedReadCount++;
- offset += (int64_t(it0->averageOffset) - int64_t(it1->averageOffset));
-
- ++it0;
- ++it1;
- }
- }
-
- if(commonOrientedReadCount) {
- offset = int64_t(std::round(double(offset) / double(commonOrientedReadCount)));
- } else {
- offset = std::numeric_limits<uint64_t>::max();
- }
-
-}
-
-
-
-// Analyze a pair of segments for common oriented reads,
-// offsets, missing reads, etc.
-void AssemblyGraph::analyzeSegmentPair(
- uint64_t segmentId0,
- uint64_t segmentId1,
- const SegmentOrientedReadInformation& info0,
- const SegmentOrientedReadInformation& info1,
- const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
- SegmentPairInformation& info01
- ) const
-{
- using boost::icl::discrete_interval;
- using boost::icl::intersects;
-
- // Store the number of oriented reads in each segment.
- info01.totalCount[0] = info0.infos.size();
- info01.totalCount[1] = info1.infos.size();
-
- // Use common oriented reads to estimate the offset between the two segments.
- // If there are no common oriented reads, stop here.
- estimateOffset(info0, info1, info01.offset, info01.commonCount);
- if(info01.commonCount == 0) {
- return;
- }
-
-
- // Count the oriented reads missing from each segment,
- // and which should have been present based on
- // the known relative offsets.
- info01.unexplainedCount = {0, 0};
- info01.shortCount = {0, 0};
-
- // Set up a joint loop over oriented reads in the two segments.
- const auto begin0 = info0.infos.begin();
- const auto begin1 = info1.infos.begin();
- const auto end0 = info0.infos.end();
- const auto end1 = info1.infos.end();
- auto it0 = begin0;
- auto it1 = begin1;
-
- const uint64_t length0 = markerGraphPaths.size(segmentId0);
- const uint64_t length1 = markerGraphPaths.size(segmentId1);
- while(true) {
-
- // At end of both segments.
- if((it0 == end0) and (it1 == end1)) {
- break;
- }
-
-
-
- // This read only appears on segment 0.
- if((it1 == end1) or ((it0 != end0) and (it0->orientedReadId < it1->orientedReadId))) {
- const int64_t orientedReadLength = markers.size(it0->orientedReadId.getValue());
-
- // Compute the hypothetical range of the oriented read relative
- // to the beginning of segment 1.
- const discrete_interval<int64_t> orientedReadRange1(
- it0->averageOffset - info01.offset,
- it0->averageOffset - info01.offset + orientedReadLength);
- const discrete_interval<int64_t> segment1Range(0, length1);
-
- // Figure out if it the oriented read would overlap segment 1.
- const bool wouldOverlap = intersects(orientedReadRange1, segment1Range);
-
- if(wouldOverlap) {
- ++info01.unexplainedCount[0];
- } else {
- ++info01.shortCount[0];
- }
-
- SHASTA_ASSERT(it0 != end0);
- ++it0;
- }
-
-
-
- // Only on segment 1
- else if((it0 == end0) or ((it1 != end1) and (it1->orientedReadId < it0->orientedReadId))) {
- const int64_t orientedReadLength = markers.size(it1->orientedReadId.getValue());
-
- // Compute the hypothetical range of the oriented read relative
- // to the beginning of segment 0.
- const discrete_interval<int64_t> orientedReadRange0(
- it1->averageOffset + info01.offset,
- it1->averageOffset + info01.offset + orientedReadLength);
- const discrete_interval<int64_t> segment0Range(0, length0);
-
- // Figure out if it the oriented read would overlap segment 0.
- const bool wouldOverlap = intersects(orientedReadRange0, segment0Range);
-
- if(wouldOverlap) {
- ++info01.unexplainedCount[1];
- } else {
- ++info01.shortCount[1];
- }
-
- SHASTA_ASSERT(it1 != end1);
- ++it1;
- }
-
- // On both segments.
- else {
- SHASTA_ASSERT(it0 != end0);
- SHASTA_ASSERT(it1 != end1);
- ++it0;
- ++it1;
- }
- }
-
- info01.check();
-
-}
-
-
-
-// Gather oriented read information for each segment.
-void AssemblyGraph::storeSegmentOrientedReadInformation(size_t threadCount)
-{
- const uint64_t segmentCount = markerGraphPaths.size();
- segmentOrientedReadInformation.resize(segmentCount);
- const uint64_t batchSize = 10;
- setupLoadBalancing(segmentCount, batchSize);
- runThreads(&AssemblyGraph::storeSegmentOrientedReadInformationThreadFunction, threadCount);
-}
-
-
-
-
-// Gather oriented read information for each segment.
-void AssemblyGraph::storeSegmentOrientedReadInformationThreadFunction(size_t threadId)
-{
-
- // Loop over batches assigned to this thread.
- uint64_t begin, end;
- while(getNextBatch(begin, end)) {
-
- // Loop over segments assigned to this batch.
- for(uint64_t segmentId=begin; segmentId!=end; ++segmentId) {
-
- // Get oriented read information for this segment.
- getOrientedReadsOnSegment(segmentId, segmentOrientedReadInformation[segmentId]);
- }
- }
-}
-
-
-
-#if 0
-void AssemblyGraph::clusterSegments(size_t threadCount, uint64_t minClusterSize)
-{
- // Gather oriented read information for all segments.
- storeSegmentOrientedReadInformation(threadCount);
-
- // Find the segment pairs.
- const uint64_t segmentCount = markerGraphPaths.size();
- const uint64_t batchSize = 10;
- setupLoadBalancing(segmentCount, batchSize);
- clusterSegmentsData.threadPairs.resize(threadCount);
- runThreads(&AssemblyGraph::clusterSegmentsThreadFunction1, threadCount);
-
- // For now, write a dot file with the pairs.
- ofstream dot("SegmentGraph.dot");
- dot << "graph segmentGraph {\n";
- for(const auto& threadPairs: clusterSegmentsData.threadPairs) {
- for(const auto& p: threadPairs) {
- dot << p.first << "--" << p.second << ";\n";
- }
- }
- dot << "}\n";
-
-
-
- // The segment pairs we found define a subgraph of the assembly graph.
- // Compute connected components of this subgraph.
- // The connected components of sufficient size become clusters.
- vector<uint64_t> rank(segmentCount);
- vector<uint64_t> parent(segmentCount);
- boost::disjoint_sets<uint64_t*, uint64_t*> disjointSets(&rank[0], &parent[0]);
- for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) {
- disjointSets.make_set(segmentId);
- }
- for(const auto& threadPairs: clusterSegmentsData.threadPairs) {
- for(const auto& p: threadPairs) {
- disjointSets.union_set(p.first, p.second);
- }
- }
-
- // Gather the segments in each connected component.
- vector< vector<uint64_t> > components(segmentCount);
- for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) {
- const uint64_t componentId = disjointSets.find_set(segmentId);
- components[componentId].push_back(segmentId);
- }
-
- // Each connected components of size at least minClusterSize
- // becomes a cluster.
- vector< pair<uint64_t, uint64_t> > clusterTable;
- for(uint64_t componentId=0; componentId<segmentCount; componentId++) {
- const vector<uint64_t>& component = components[componentId];
- const uint64_t componentSize = component.size();
- if(component.size() >= minClusterSize) {
- clusterTable.push_back(make_pair(componentId, componentSize));
- }
- }
-
- // Sort the clusters by decreasing size.
- sort(clusterTable.begin(), clusterTable.end(),
- OrderPairsBySecondOnlyGreater<uint64_t, uint64_t>());
-
- cout << "Found " << clusterTable.size() << " segment clusters with the following sizes:" << endl;
- uint64_t clusteredSegmentCount = 0;
- for(uint64_t clusterId=0; clusterId<clusterTable.size(); clusterId++) {
- const auto& p = clusterTable[clusterId];
- const uint64_t componentSize = p.second;
- cout << " " << componentSize;
- clusteredSegmentCount += componentSize;
- }
- cout << endl;
- cout << "Out of " << segmentCount << " segments, " <<
- clusteredSegmentCount << " were assigned to a cluster." << endl;
-
-
-
- // Store the cluster id of each segment.
- createNew(clusterIds, "Mode3-ClusterIds");
- clusterIds.resize(segmentCount);
- fill(clusterIds.begin(), clusterIds.end(), std::numeric_limits<uint64_t>::max());
- for(uint64_t clusterId=0; clusterId<clusterTable.size(); clusterId++) {
- const auto& p = clusterTable[clusterId];
- const uint64_t componentId = p.first;
- const vector<uint64_t>& cluster = components[componentId];
- for(const uint64_t segmentId: cluster) {
- clusterIds[segmentId] = clusterId;
- }
- }
-
-
-
- // Clean up.
- clusterSegmentsData.threadPairs.clear();
- clusterSegmentsData.threadPairs.shrink_to_fit();
- segmentOrientedReadInformation.clear();
- segmentOrientedReadInformation.shrink_to_fit();
-}
-
-
-
-void AssemblyGraph::clusterSegmentsThreadFunction1(size_t threadId)
-{
-
- auto& threadPairs = clusterSegmentsData.threadPairs[threadId];
- threadPairs.clear();
- vector<uint64_t> descendants;
-
- // Loop over batches assigned to this thread.
- uint64_t begin, end;
- while(getNextBatch(begin, end)) {
-
- // Loop over segments assigned to this batch.
- for(uint64_t segmentId0=begin; segmentId0!=end; ++segmentId0) {
-
- // Add pairs for which the lowest numbered segment is segmentId0.
- addClusterPairs(threadId, segmentId0);
- }
- }
-}
-
-
-
-void AssemblyGraph::addClusterPairs(size_t threadId, uint64_t startSegmentId)
-{
- // EXPOSE THESE CONSTANTS WHEN CODE STABILIZES.
- const uint64_t minCommonReadCount = 10;
- const double maxUnexplainedFraction = 0.25;
- const double minJaccard = 0.7;
- const uint64_t pairCountPerSegment = 1;
- const uint64_t maxDistance = 200;
-
- // std::lock_guard<std::mutex> lock(mutex); // *********** TAKE OUT
-
- // Do a BFS and check each pair as we encounter it.
- // The BFS terminates when we found enough pairs.
-
- // Do the BFS in both directions.
- for(uint64_t direction=0; direction<1; direction++) { // ********* ONE DIRECTION ONLY
- // cout << startSegmentId << " direction " << direction << endl;
-
- // Initialize the BFS.
- std::queue<uint64_t> q;
- q.push(startSegmentId);
- std::map<uint64_t, uint64_t> distanceMap;
- distanceMap.insert(make_pair(startSegmentId, 0));
- uint64_t foundCount = 0;
-
- // BFS loop.
- while(not q.empty()) {
- const uint64_t segmentId0 = q.front();
- // cout << "Dequeued " << segmentId0 << endl;
- q.pop();
-
- const uint64_t distance0 = distanceMap[segmentId0];
- const uint64_t distance1 = distance0 + 1;
-
- // Loop over children or parents of segmentId0.
- const auto neighbors = (direction == 0) ? linksBySource[segmentId0] : linksByTarget[segmentId0];
- for(const uint64_t linkId01: neighbors) {
- const Link& link01 = links[linkId01];
- const uint64_t segmentId1 = (direction==0) ? link01.segmentId1 : link01.segmentId0;
-
- // If we already encountered segmentId1, skip it.
- if(distanceMap.find(segmentId1) != distanceMap.end()) {
- continue;
- }
-
- // Enqueue it.
- if(distance1 < maxDistance) {
- q.push(segmentId1);
- }
- distanceMap.insert(make_pair(segmentId1, distance1));
-
- // cout << "Found " << segmentId1 << endl;
-
- // Check the pair (startSegmentId, segmentId1).
- SegmentPairInformation info;
- analyzeSegmentPair(startSegmentId, segmentId1,
- segmentOrientedReadInformation[startSegmentId],
- segmentOrientedReadInformation[segmentId1],
- markers, info);
- if(info.commonCount < minCommonReadCount) {
- continue;
- }
- if(info.maximumUnexplainedFraction() > maxUnexplainedFraction) {
- continue;
- }
- if(info.jaccard() < minJaccard) {
- continue;
- }
-
- // Store it.
- // cout << "Stored " << segmentId1 << endl;
- clusterSegmentsData.threadPairs[threadId].push_back(make_pair(startSegmentId, segmentId1));
- ++foundCount;
- if(foundCount >= pairCountPerSegment) {
- break;
- }
- }
-
- if(foundCount >= pairCountPerSegment) {
- break;
- }
- }
- }
-}
-#endif
-
-
-
-// Find descendants of a given segment, up to a given distance in the graph.
-void AssemblyGraph::findDescendants(
- uint64_t startSegmentId,
- uint64_t maxDistance,
- vector<uint64_t>& descendants
- ) const
-{
- // Initialize the BFS.
- descendants.clear();
- std::queue<uint64_t> q;
- q.push(startSegmentId);
- std::map<uint64_t, uint64_t> distanceMap;
- distanceMap.insert(make_pair(startSegmentId, 0));
-
- // BFS loop.
- while(not q.empty()) {
- const uint64_t segmentId0 = q.front();
- q.pop();
-
- const uint64_t distance0 = distanceMap[segmentId0];
- const uint64_t distance1 = distance0 + 1;
-
- // Loop over children of segmentId0.
- for(const uint64_t linkId01: linksBySource[segmentId0]) {
- const Link& link01 = links[linkId01];
- const uint64_t segmentId1 = link01.segmentId1;
-
- // If we already encountered segmentId1, skip it.
- if(distanceMap.find(segmentId1) != distanceMap.end()) {
- continue;
- }
-
- descendants.push_back(segmentId1);
- distanceMap.insert(make_pair(segmentId1, distance1));
- if(distance1 < maxDistance) {
- q.push(segmentId1);
- }
- }
- }
-}
-
-
-
-void AssemblyGraph::analyzeSubgraph(
- const vector<uint64_t>& segmentIds,
- vector<AnalyzeSubgraphClasses::Cluster>& clusters,
- bool debug) const
-{
- if(segmentIds.size() <= 64) {
- analyzeSubgraphTemplate<64>(segmentIds, clusters, debug);
- } else if(segmentIds.size() <= 128) {
- analyzeSubgraphTemplate<128>(segmentIds, clusters, debug);
- } else if(segmentIds.size() <= 192) {
- analyzeSubgraphTemplate<192>(segmentIds, clusters, debug);
- } else if(segmentIds.size() <= 256) {
- analyzeSubgraphTemplate<256>(segmentIds, clusters, debug);
- } else if(segmentIds.size() <= 320) {
- analyzeSubgraphTemplate<320>(segmentIds, clusters, debug);
- } else if(segmentIds.size() <= 384) {
- analyzeSubgraphTemplate<384>(segmentIds, clusters, debug);
- } else if(segmentIds.size() <= 448) {
- analyzeSubgraphTemplate<448>(segmentIds, clusters, debug);
- } else if(segmentIds.size() <= 512) {
- analyzeSubgraphTemplate<512>(segmentIds, clusters, debug);
- } else {
- SHASTA_ASSERT(0);
- }
-}
-
-
-
-template<uint64_t N> void AssemblyGraph::analyzeSubgraphTemplate(
- const vector<uint64_t>& unsortedSegmentIds,
- vector<AnalyzeSubgraphClasses::Cluster>& clusters,
- bool debug) const
-{
- // EXPOSE WHEN CODE STABILIZES.
- const double fractionThreshold = 0.05;
- const uint64_t minClusterCoverage = 6;
- const uint64_t minSegmentCoverage = 6;
-
- using BitVector = std::bitset<N>;
- using JourneySnippet = AnalyzeSubgraphClasses::JourneySnippet;
- using Cluster = AnalyzeSubgraphClasses::Cluster;
- using SnippetGraphVertex = AnalyzeSubgraphClasses::SnippetGraphVertex;
- using SnippetGraph = AnalyzeSubgraphClasses::SnippetGraph;
- using vertex_descriptor = SnippetGraph::vertex_descriptor;
-
- // Create a sorted version of the segmentIds. We will need it later.
- vector<uint64_t> segmentIds = unsortedSegmentIds;
- sort(segmentIds.begin(), segmentIds.end());
-
- // Gather triplets (orientedReadId, position in assembly graph journey, segmentId).
- using Triplet = tuple<OrientedReadId, uint64_t, uint64_t>;
- vector<Triplet> triplets;
- for(const uint64_t segmentId: segmentIds) {
- const auto v = assemblyGraphJourneyInfos[segmentId];
- for(const auto& p: v) {
- const OrientedReadId orientedReadId = p.first;
- const uint64_t position = p.second;
- triplets.push_back(Triplet(orientedReadId, position, segmentId));
- }
- }
- sort(triplets.begin(), triplets.end());
-
- // Write the triplets.
- if(debug) {
- ofstream csv("Triplets.csv");
- for(const Triplet& triplet: triplets) {
- csv << get<0>(triplet) << ",";
- csv << get<1>(triplet) << ",";
- csv << get<2>(triplet) << "\n";
- }
- }
-
-
-
- // Find streaks for the same OrientedReadId where the position
- // increases by 1 each time.
- // Each streak generates a JourneySnippet.
- vector<JourneySnippet> snippets;
- for(uint64_t i=0; i<triplets.size(); /* Increment later */) {
- const OrientedReadId orientedReadId = get<0>(triplets[i]);
-
- // Find this streak.
- uint64_t streakBegin = i;
- uint64_t streakEnd = streakBegin + 1;
- for(; streakEnd<triplets.size(); streakEnd++) {
- if(get<0>(triplets[streakEnd]) != orientedReadId) {
- break;
- }
- if(get<1>(triplets[streakEnd]) != get<1>(triplets[streakEnd-1]) + 1) {
- break;
- }
- }
-
- // Add a snippet.
- JourneySnippet snippet;
- snippet.orientedReadId = orientedReadId;
- snippet.firstPosition = get<1>(triplets[streakBegin]);
- for(uint64_t j=streakBegin; j!=streakEnd; ++j) {
- snippet.segmentIds.push_back(get<2>(triplets[j]));
- }
- snippets.push_back(snippet);
-
- // Prepare to process the next streak.
- i = streakEnd;
- }
-
-
-
- // Write the snippets.
- if(debug) {
- ofstream csv("JourneySnippets.csv");
- csv << "SnippetIndex,OrientedReadId,First position,LastPosition,SegmentIds\n";
- for(uint64_t snippetIndex=0; snippetIndex<snippets.size(); snippetIndex++) {
- const JourneySnippet& snippet = snippets[snippetIndex];
- csv << snippetIndex << ",";
- csv << snippet.orientedReadId << ",";
- csv << snippet.firstPosition << ",";
- csv << snippet.lastPosition() << ",";
- for(const uint64_t segmentId: snippet.segmentIds) {
- csv << segmentId << ",";
- }
- csv << "\n";
- }
- }
-
-
-
- // For each snippet, create a BitVector that describes the segments
- // the snippet visits.
- const uint64_t snippetCount = snippets.size();
- vector<BitVector> bitVectors(snippetCount);
- vector<uint64_t> bitVectorsPopCount(snippetCount);
- for(uint64_t snippetIndex=0; snippetIndex<snippetCount; snippetIndex++) {
- const JourneySnippet& snippet = snippets[snippetIndex];
- BitVector& bitVector = bitVectors[snippetIndex];
-
- for(const uint64_t segmentId: snippet.segmentIds) {
- auto it = lower_bound(segmentIds.begin(), segmentIds.end(), segmentId);
- SHASTA_ASSERT(it != segmentIds.end());
- SHASTA_ASSERT(*it == segmentId);
- const uint64_t bitIndex = it - segmentIds.begin();
- bitVector.set(bitIndex);
- }
- bitVectorsPopCount[snippetIndex] = bitVector.count();
- }
-
-
-
- // Create the SnippetGraph.
- // A vertex represents a set of snippets and stores
- // the corresponding snippet indexes.
- // An edge x->y is created if there is at least one snippet in y
- // that is an approximate subset of a snippet in x.
- // We express this condition as |y-x| < fractionThreshold * |y|
- // We start with one snippet per vertex.
- SnippetGraph graph;
- vector<vertex_descriptor> vertexTable;
- std::map<vertex_descriptor, uint64_t> vertexMap;
- for(uint64_t snippetIndex=0; snippetIndex<snippetCount; snippetIndex++) {
- const auto v = add_vertex(SnippetGraphVertex(snippetIndex), graph);
- vertexTable.push_back(v);
- vertexMap.insert(make_pair(v, snippetIndex));
- }
- for(uint64_t iy=0; iy<snippetCount; iy++) {
- const BitVector& y = bitVectors[iy];
- const uint64_t threshold = uint64_t(std::round(fractionThreshold * double(bitVectorsPopCount[iy])));
- const vertex_descriptor vy = vertexTable[iy];
- for(uint64_t ix=0; ix<snippetCount; ix++) {
- if(ix == iy) {
- continue;
- }
- const BitVector& x = bitVectors[ix];
-
- // Compute z = y-x.
- BitVector z = y;
- z &= (~x);
-
- if(z.count() <= threshold) {
- const vertex_descriptor vx = vertexTable[ix];
- add_edge(vx, vy, graph);
- }
- }
- }
-
-
-
- // Compute strongly connected components of the SnippetGraph.
- std::map<vertex_descriptor, uint64_t> componentMap;
- const uint64_t componentCount = boost::strong_components(
- graph,
- boost::make_assoc_property_map(componentMap),
- boost::vertex_index_map(boost::make_assoc_property_map(vertexMap)));
- // cout << "Found " << componentCount << " strongly connected components." << endl;
-
- // Gather the vertices of each strongly connected component.
- vector< vector<vertex_descriptor> > components(componentCount);
- BGL_FORALL_VERTICES_T(v, graph, SnippetGraph) {
- const uint64_t componentId = componentMap[v];
- SHASTA_ASSERT(componentId < componentCount);
- components[componentId].push_back(v);
- }
- if(false) {
- cout << "Strongly connected components:\n";
- for(uint64_t componentId=0; componentId<componentCount; componentId++) {
- cout << componentId << ": ";
- for(const vertex_descriptor v: components[componentId]) {
- cout << vertexMap[v] << " ";
- }
- cout << "\n";
- }
- }
-
-
-
- // Condense the strongly connected components.
- // After this, the SnippetGraph is guaranteed to be acyclic.
- for(const vector<vertex_descriptor>& component: components) {
- if(component.size() == 1) {
- continue;
- }
-
- // Create a new vertex to represent this component.
- const auto vNew = add_vertex(graph);
- vector<uint64_t>& snippetsNew = graph[vNew].snippetIndexes;
- for(const vertex_descriptor v: component) {
- const vector<uint64_t>& snippets = graph[v].snippetIndexes;
- SHASTA_ASSERT(snippets.size() == 1);
- snippetsNew.push_back(snippets.front());
- }
-
- // Create the new edges.
- for(const vertex_descriptor v0: component) {
-
- // Out-edges.
- BGL_FORALL_OUTEDGES_T(v0, e01, graph, SnippetGraph) {
- const vertex_descriptor v1 = target(e01, graph);
- if(v1 != vNew) {
- add_edge(vNew, v1,graph);
- }
- }
-
- // In-edges.
- BGL_FORALL_INEDGES_T(v0, e10, graph, SnippetGraph) {
- const vertex_descriptor v1 = source(e10, graph);
- if(v1 != vNew) {
- add_edge(v1, vNew, graph);
- }
- }
- }
-
- // Remove the old vertices and their edges.
- for(const vertex_descriptor v: component) {
- clear_vertex(v, graph);
- remove_vertex(v, graph);
- }
- }
-
-
- // Compute which maximal vertices each vertex is a descendant of.
- std::map<vertex_descriptor, vector<vertex_descriptor> > ancestorMap;
- BGL_FORALL_VERTICES_T(v0, graph, SnippetGraph) {
- if(in_degree(v0, graph) != 0) {
- continue; // Not a maximal vertex.
- }
-
- // Find the descendants of this maximal vertex.
- vector<vertex_descriptor> descendants;
- graph.findDescendants(v0, descendants);
-
- // Update the ancestor map.
- for(const vertex_descriptor v1: descendants) {
- ancestorMap[v1].push_back(v0);
- }
- }
-
-
-
- // Each maximal vertex generates a cluster consisting of the vertices
- // that descend from it and from no other maximal vertex.
- // Gather the vertices in each cluster.
- std::map<vertex_descriptor, vector<vertex_descriptor> > clusterMap;
- uint64_t unclusterVertexCount = 0;
- BGL_FORALL_VERTICES_T(v1, graph, SnippetGraph) {
- const vector<vertex_descriptor>& ancestors = ancestorMap[v1];
- if(ancestors.size() == 1) {
- const vertex_descriptor v0 = ancestors.front();
- clusterMap[v0].push_back(v1);
- } else {
- ++unclusterVertexCount;
- }
- }
- cout << "Found " << unclusterVertexCount << " unclustered vertices." << endl;
-
-
-
- // Gather the snippets in each cluster.
- clusters.clear();
- for(const auto& p: clusterMap) {
- const vector<vertex_descriptor>& clusterVertices = p.second;
- clusters.resize(clusters.size() + 1);
- Cluster& cluster = clusters.back();
-
- vector<uint64_t> clusterSnippetIndexes; // Only used for debug output.
- for(const vertex_descriptor v: clusterVertices) {
- const vector<uint64_t>& snippetIndexes = graph[v].snippetIndexes;
- for(const uint64_t snippetIndex: snippetIndexes) {
- cluster.snippets.push_back(snippets[snippetIndex]);
- clusterSnippetIndexes.push_back(snippetIndex);
- }
- }
- cluster.constructSegments();
- cluster.cleanupSegments(minSegmentCoverage);
- cout << "Found a cluster candidate with " <<
- clusterVertices.size() << " vertices and " <<
- cluster.snippets.size() << " snippets:" << endl;
- for(const uint64_t snippetIndex: clusterSnippetIndexes) {
- cout << snippetIndex << " ";
- }
- cout << endl;
-
- // If coverage on this cluster is too low, discard it.
- if(cluster.coverage() < minClusterCoverage) {
- clusters.resize(clusters.size() - 1);
- cout << "This cluster candidate was discarded because of low coverage." << endl;
- continue;
- }
-
- // This cluster will be stored and is assigned this clusterId;
- const uint64_t clusterId = clusters.size() - 1;
-
- if(debug) {
-
- cout << "This cluster was stored as cluster " << clusterId << endl;
- cout << "Segment(coverage) for this cluster:\n";
- for(const auto& p: cluster.segments) {
- cout << p.first << "(" << p.second << ") ";
- }
- cout << endl;
- }
-
- // Mark the vertices of this cluster.
- for(const vertex_descriptor v: clusterVertices) {
- graph[v].clusterId = clusterId;
- }
- }
- graph.clusterCount = clusters.size();
-
-
-
- // Write out the SnippetGraph.
- if(debug) {
- graph.writeGraphviz("SnippetGraph.dot");
- }
-}
-
-
-
-void AssemblyGraph::AnalyzeSubgraphClasses::SnippetGraph::findDescendants(
- const vertex_descriptor vStart,
- vector<vertex_descriptor>& descendants) const
-{
- const SnippetGraph& graph = *this;
-
- // Initialize the BFS.
- std::queue<vertex_descriptor> q;
- q.push(vStart);
- std::set<vertex_descriptor> descendantsSet;
- descendantsSet.insert(vStart);
-
- // BFS loop.
- while(not q.empty()) {
- const vertex_descriptor v0 = q.front();
- q.pop();
-
- BGL_FORALL_OUTEDGES(v0, e01, graph, SnippetGraph) {
- const vertex_descriptor v1 = target(e01, graph);
- if(descendantsSet.find(v1) == descendantsSet.end()) {
- q.push(v1);
- descendantsSet.insert(v1);
- }
- }
- }
-
- descendants.clear();
- copy(descendantsSet.begin(), descendantsSet.end(), back_inserter(descendants));
-}
-
-
-
-void AssemblyGraph::AnalyzeSubgraphClasses::SnippetGraph::writeGraphviz(
- const string& fileName) const
-{
- const SnippetGraph& graph = *this;
-
- ofstream dot(fileName);
- dot << "digraph SnippetGraph{\n"
- "node [shape=rectangle];\n";
- BGL_FORALL_VERTICES(v, graph, SnippetGraph) {
- dot << "\"" << v << "\" [label=\"";
- const vector<uint64_t>& snippetIndexes = graph[v].snippetIndexes;
- for(const uint64_t snippetIndex: snippetIndexes) {
- dot << snippetIndex;
- dot << "\\n";
- }
- dot << "\"";
- const uint64_t clusterId = graph[v].clusterId;
- if(clusterId != std::numeric_limits<uint64_t>::max()) {
- dot << " style=filled fillcolor=\"" <<
- float(clusterId)/float(clusterCount) <<
- ",0.3,1\"";
- }
- dot << "];\n";
- }
- BGL_FORALL_EDGES(e, graph, SnippetGraph) {
- const vertex_descriptor vx = source(e, graph);
- const vertex_descriptor vy = target(e, graph);
- dot << "\"" << vx << "\"->\"" << vy << "\";\n";
- }
- dot << "}\n";
-
-}
-
-
-
-void AssemblyGraph::AnalyzeSubgraphClasses::Cluster::constructSegments()
-{
- // A map with Key=segmentId, value = coverage.
- std::map<uint64_t, uint64_t> segmentMap;
-
- for(const JourneySnippet& snippet: snippets) {
- for(const uint64_t segmentId: snippet.segmentIds) {
- auto it = segmentMap.find(segmentId);
- if(it == segmentMap.end()) {
- segmentMap.insert(make_pair(segmentId, 1));
- } else {
- ++(it->second);
- }
- }
- }
-
- segments.clear();
- copy(segmentMap.begin(), segmentMap.end(), back_inserter(segments));
-}
-
-
-
-void AssemblyGraph::AnalyzeSubgraphClasses::Cluster::cleanupSegments(uint64_t minSegmentCoverage)
-{
- vector< pair<uint64_t, uint64_t > > newSegments;
- for(const auto& p: segments) {
- if(p.second >= minSegmentCoverage) {
- newSegments.push_back(p);
- }
- }
- segments.swap(newSegments);
-}
-
-
-
-vector<uint64_t> AssemblyGraph::AnalyzeSubgraphClasses::Cluster::getSegments() const
-{
- vector<uint64_t> v;
- for(const auto& p: segments) {
- v.push_back(p.first);
- }
- return v;
-}
-
-
-
-// Create an assembly path starting at a given segment.
-void AssemblyGraph::createAssemblyPath(
- uint64_t startSegmentId,
- uint64_t direction, // 0 = forward, 1 = backward
- AssemblyPath& path
- ) const
-{
- // EXPOSE WHEN CODE STABILIZES.
- const uint64_t minCommonForLink = 3;
- const uint64_t minCommonForReference = 3;
- const double minJaccard = 0.75;
- const int32_t minLinkSeparation = -20;
-
- const bool debug = false;
- if(true) {
- cout << timestamp << "createAssemblyPath begins at segment " << startSegmentId <<
- ", direction " << direction << endl;
- }
-
-
-
- // At each iteration, we start from segmentIdA (the current "primary segment")
- // and move in the specified direction until we find segmentIdB with
- // sufficiently high Jaccard similarity and number of
- // common oriented reads with segmentIdA.
- // At each step, we choose the links that has the most common oriented
- // reads with the current primary segment.
- uint64_t referenceSegmentId = startSegmentId;
- SegmentOrientedReadInformation infoReference;
- getOrientedReadsOnSegment(referenceSegmentId, infoReference);
- uint64_t segmentId0 = startSegmentId;
- path.clear();
- path.segments.push_back(AssemblyPathSegment(startSegmentId, true));
- vector<uint64_t> lastIterationSegments;
- std::set< pair<uint64_t, uint64_t> > previousPairs; // (reference segment, current segment).
- while(true) {
-
- if(debug) {
- cout << "Reference segment " << referenceSegmentId <<
- ", segmentId0 " << segmentId0 << endl;
- }
-
- // Loop over outgoing or incoming links of the current segment.
- // Find the link with the most common reads with the reference segment.
- const auto linkIds = (direction == 0) ? linksBySource[segmentId0] : linksByTarget[segmentId0];
- if(linkIds.empty()) {
- if(debug) {
- cout << "No links in this direction." << endl;
- }
- break;
- }
- uint64_t linkIdBest = invalid<uint64_t>;
- uint64_t commonOrientedReadCountBest = 0;
- for(const uint64_t linkId: linkIds) {
-
- // If link separation is too negative, skip it.
- // The goal here is to avoid cycles in paths.
- const Link& link = links[linkId];
- if(link.separation < minLinkSeparation) {
- continue;
- }
-
- // Count the number of common oriented reads between the reference segment and this link.
- uint64_t commonOrientedReadCount;
- analyzeSegmentLinkPair(referenceSegmentId, linkId, commonOrientedReadCount);
-
- // If better than the one we have it, record it.
- if(commonOrientedReadCount > commonOrientedReadCountBest) {
- linkIdBest = linkId;
- commonOrientedReadCountBest = commonOrientedReadCount;
- }
- }
- if(commonOrientedReadCountBest < minCommonForLink) {
- if(debug) {
- cout << "No good links found." << endl;
- }
- break;
- }
- const uint64_t linkId = linkIdBest;
- if(debug) {
- cout << "Best link " << linkId <<
- ", " << commonOrientedReadCountBest <<
- " common oriented reads with the reference segment." << endl;
- }
-
- // Get the segment at the other side of this link.
- const Link& link = links[linkId];
- const uint64_t segmentId1 = (direction==0) ? link.segmentId1 : link.segmentId0;
- if(debug) {
- cout << "segmentId1 " << segmentId1 << endl;
- }
- lastIterationSegments.push_back(segmentId1);
-
- // Check that we haven't been here before.
- if(previousPairs.contains(make_pair(referenceSegmentId, segmentId1))) {
- break;
- }
- previousPairs.insert(make_pair(referenceSegmentId, segmentId1));
-
- // Check segmentId1 against the reference segment.
- SegmentOrientedReadInformation info1;
- getOrientedReadsOnSegment(segmentId1, info1);
- SegmentPairInformation info;
- analyzeSegmentPair(
- referenceSegmentId, segmentId1,
- infoReference, info1,
- markers, info);
- if(debug) {
- cout << "Jaccard " << info.jaccard() << endl;
- }
-
- // If the Jaccard similarity is high, this becomes the new reference segment.
- if(info.commonCount >= minCommonForReference and info.jaccard() >= minJaccard) {
- referenceSegmentId = segmentId1;
- getOrientedReadsOnSegment(referenceSegmentId, infoReference);
- const uint64_t lastPrimarySegmentId = path.segments.back().id;
- if(debug) {
- cout << "New reference segment is " << segmentId1 << endl;
- cout << "Previous reference segment is " << lastPrimarySegmentId << endl;
- }
- for(const uint64_t segmentId: lastIterationSegments) {
- path.segments.push_back(AssemblyPathSegment(segmentId, false));
- if(debug) {
- cout << "Added segment " << segmentId << " to path." << endl;
- }
- if(segmentId != segmentId1) {
- if(direction == 0) {
- path.segments.back().previousPrimarySegmentId = lastPrimarySegmentId;
- path.segments.back().nextPrimarySegmentId = segmentId1;
- } else {
- path.segments.back().previousPrimarySegmentId = segmentId1;
- path.segments.back().nextPrimarySegmentId = lastPrimarySegmentId;
- }
- }
- }
- path.segments.back().isPrimary = true;
- lastIterationSegments.clear();
- }
-
- segmentId0 = segmentId1;
- }
-
-
-
- if(true) {
- cout << timestamp << "createAssemblyPath3 ends." << endl;
- }
-}
-
-
-
-// Count the number of common oriented reads between a segment and a link,
-// without counting oriented reads that appear more than once on the
-// segment or on the link.
-void AssemblyGraph::analyzeSegmentLinkPair(
- uint64_t segmentId,
- uint64_t linkId,
- uint64_t& commonOrientedReadCount
-) const
-{
- // The oriented reads in this segment,
- // with some extra information that we don't care about here.
- const auto segmentOrientedReads = assemblyGraphJourneyInfos[segmentId];
-
- // The oriented reads in this link,
- // with some extra information that we don't care about here.
- const auto linkOrientedReads = transitions[linkId];
-
- // Joint loop over oriented reads.
- commonOrientedReadCount = 0;
- const auto segmentBegin = segmentOrientedReads.begin();
- const auto segmentEnd = segmentOrientedReads.end();
- const auto linkBegin = linkOrientedReads.begin();
- const auto linkEnd = linkOrientedReads.end();
- auto itSegment = segmentBegin;
- auto itLink = linkBegin;
- while(itSegment != segmentEnd and itLink != linkEnd) {
-
- if(itSegment->first < itLink->first) {
- ++itSegment;
- continue;
- }
- if(itLink->first < itSegment->first) {
- ++itLink;
- continue;
- }
- SHASTA_ASSERT(itSegment->first == itLink->first);
-
- // If it appears more than once in the segment, skip it.
- auto itSegmentNext = itSegment + 1;
- if(itSegmentNext != segmentEnd and itSegmentNext->first == itSegment->first) {
- ++itSegment;
- ++itLink;
- continue;
- }
- if(itSegment != segmentBegin) {
- auto itSegmentPrevious = itSegment - 1;
- if(itSegmentPrevious->first == itSegment->first) {
- ++itSegment;
- ++itLink;
- continue;
- }
- }
-
- // If it appears more than once in the link, skip it.
- auto itLinkNext = itLink + 1;
- if(itLinkNext != linkEnd and itLinkNext->first == itLink->first) {
- ++itSegment;
- ++itLink;
- continue;
- }
- if(itLink != linkBegin) {
- auto itLinkPrevious = itLink - 1;
- if(itLinkPrevious->first == itLink->first) {
- ++itSegment;
- ++itLink;
- continue;
- }
- }
-
- // Ok, this is a common oriented read that appears only once
- // in both the segment and the link.
- ++commonOrientedReadCount;
- ++itSegment;
- ++itLink;
- }
-
-}
-
-
-
-// Given a segment, use a BFS to move in the specified direction until
-// we find a segment with sufficiently high Jaccard similarity
-// and number of common reads.
-// This returns invalid<uint64_t> if no such segment is found
-// within the specified distance.
-uint64_t AssemblyGraph::findSimilarSegmentBfs(
- uint64_t segmentIdA,
- uint64_t direction, // 0 = forward, 1 = backward
- uint64_t maxDistance,
- uint64_t minCommon,
- double minJaccard) const
-{
- const bool debug = true;
- if(debug) {
- cout << "findSimilarSegmentBfs starts " << segmentIdA << " " << direction << endl;
- }
-
- // Sanity check.
- SHASTA_ASSERT(maxDistance > 0);
-
- // Get the oriented reads on segmentIdA.
- SegmentOrientedReadInformation infoA;
- getOrientedReadsOnSegment(segmentIdA, infoA);
-
- // Initialize a BFS.
- std::queue<uint64_t> q;
- q.push(segmentIdA);
-
- // Keep track of segments we already encountered and their distance.
- // Key = segmentId;
- // Value = distance.
- std::map<uint64_t, uint64_t> distanceMap;
- distanceMap.insert(make_pair(segmentIdA, 0));
-
-
-
- // BFS loop.
- while(not q.empty()) {
-
- // Dequeue a segment.
- const uint64_t segmentId0 = q.front();
- q.pop();
- const uint64_t distance0 = distanceMap[segmentId0];
- SHASTA_ASSERT(distance0 < maxDistance);
- const uint64_t distance1 = distance0 + 1;
- if(debug) {
- cout << "dequeued " << segmentId0 << " " << distance0 << endl;
- }
-
- // Loop over outgoing or incoming links.
- const auto linkIds = (direction == 0) ? linksBySource[segmentId0] : linksByTarget[segmentId0];
- for(const uint64_t linkId: linkIds) {
- const Link& link = links[linkId];
-
- // Get the segment at the other side of this link.
- const uint64_t segmentId1 = (direction==0) ? link.segmentId1 : link.segmentId0;
-
- // If we already found it, skip it.
- if(distanceMap.contains(segmentId1)) {
- continue;
- }
-
- if(debug) {
- cout << "found " << segmentId1 << " " << distance1 << endl;
- }
-
- // Get the oriented reads on segmentId1.
- SegmentOrientedReadInformation info1;
- getOrientedReadsOnSegment(segmentId1, info1);
-
- // See how similar this is to segmentIdA.
- SegmentPairInformation infoA1;
- analyzeSegmentPair(
- segmentIdA, segmentId1,
- infoA, info1,
- markers, infoA1);
-
- // If this satisfies our criteria, we are done.
- if(infoA1.commonCount >= minCommon and
- infoA1.jaccard() >= minJaccard) {
- if(debug) {
- cout << "findSimilarSegmentBFS returns " << segmentId1 << " " << direction << endl;
- }
- return segmentId1;
- }
-
- // This segment did not satisfy our criteria, so we
- // have to continue the BFS.
- if(distance1 < maxDistance) {
- q.push(segmentId1);
- distanceMap.insert(make_pair(segmentId1, distance1));
- if(debug) {
- cout << "enqueued " << segmentId1 << " " << distance1 << endl;
- }
- }
- }
-
- }
-
-
-
- // If getting here, we did not find a segment that satisfies
- // the requested criteria.
- if(debug) {
- cout << "findSimilarSegmentBfs returns invalid" << endl;
- }
- return invalid<uint64_t>;
-}
-
-
-
-// Given a segment, move in the specified direction,
-// in order of increasing distance in markers, until
-// we find a segment with sufficiently high Jaccard similarity
-// and number of common reads.
-// This returns invalid<uint64_t> if no such segment is found
-// within the specified distance.
-uint64_t AssemblyGraph::findSimilarSegment(
- uint64_t segmentIdA,
- uint64_t direction, // 0 = forward, 1 = backward
- uint64_t maxDistance, // In markers
- uint64_t minLinkCoverage,
- int32_t minLinkSeparation,
- uint64_t minCommon,
- double maxUnexplainedFraction,
- double minJaccard,
- vector<uint64_t>& segments) const
-{
- const bool debug = false;
- if(debug) {
- cout << "findSimilarSegment begins, segmentIdA " << segmentIdA << endl;
- }
- // Sanity check.
- SHASTA_ASSERT(maxDistance > 0);
-
- segments.clear();
-
- // Get the oriented reads on segmentIdA.
- SegmentOrientedReadInformation infoA;
- getOrientedReadsOnSegment(segmentIdA, infoA);
-
- // (Offset, segmentId) for queued segments.
- std::multimap<uint64_t, uint64_t> q;
- q.insert(make_pair(0, segmentIdA));
-
- // The segments that we already encountered.
- std::set<uint64_t> visitedSegmentSet;
- visitedSegmentSet.insert(segmentIdA);
-
-
-
- // Search loop.
- while(not q.empty()) {
-
- // Dequeue the segment with the smallest offset.
- const auto it0 = q.begin();
- const uint64_t segmentId0 = it0->second;
- q.erase(it0);
-
- // Analyze against segmentIdA.
- SegmentOrientedReadInformation info0;
- getOrientedReadsOnSegment(segmentId0, info0);
- SegmentPairInformation infoA0;
- analyzeSegmentPair(
- segmentIdA, segmentId0,
- infoA, info0,
- markers, infoA0);
-
- // Add it to our list of segments, if possible.
- const double unexplainedFraction = infoA0.unexplainedFraction(0);
- if(unexplainedFraction < maxUnexplainedFraction) {
- segments.push_back(segmentId0);
- }
-
- // If unexplained fraction and Jaccard similarity are low, we are done.
- if(segmentId0 != segmentIdA) {
- if((unexplainedFraction < maxUnexplainedFraction) and (infoA0.jaccard() >= minJaccard)) {
- SHASTA_ASSERT(segments.back() == segmentId0);
- return segmentId0;
- }
- }
-
- if(debug) {
- cout << "Dequeued " << segmentId0 << endl;
- }
-
- // Loop over outgoing or incoming links.
- const auto linkIds = (direction == 0) ? linksBySource[segmentId0] : linksByTarget[segmentId0];
- for(const uint64_t linkId: linkIds) {
- // If link coverage is too low, skip.
- if(transitions.size(linkId) < minLinkCoverage) {
- continue;
- }
-
- // If link separation is too negative, skip it.
- // The goal here is to avoid cycles in paths.
- const Link& link = links[linkId];
- if(link.separation < minLinkSeparation) {
- continue;
- }
-
- // Get the segment at the other side of this link.
- const uint64_t segmentId1 = (direction==0) ? link.segmentId1 : link.segmentId0;
- if(debug) {
- cout << "Found " << segmentId1 << endl;
- }
-
- // If we already found it, skip it.
- if(visitedSegmentSet.contains(segmentId1)) {
- if(debug) {
- cout << "Already found, skipping." << endl;
- }
- continue;
- }
- visitedSegmentSet.insert(segmentId1);
-
- // Get the oriented reads on segmentId1.
- SegmentOrientedReadInformation info1;
- getOrientedReadsOnSegment(segmentId1, info1);
-
- // Analyze similarity to segmentIdA.
- SegmentPairInformation infoA1;
- analyzeSegmentPair(
- segmentIdA, segmentId1,
- infoA, info1,
- markers, infoA1);
-
- // If not enough common segments, skip it.
- if(infoA1.commonCount < minCommon) {
- if(debug) {
- cout << "Not enough common reads." << endl;
- }
- continue;
- }
-
- // Offset estimates are not reliable.
- // Don't use them to rule out segments.
-#if 0
- // If not in the expected direction, skip it.
- uint64_t offset;
- if(direction == 0) {
- if(infoA1.offset < 0) {
- if(debug) {
- cout << "Not in the forward direction." << endl;
- }
- continue;
- } else {
- offset = uint64_t(infoA1.offset);
- }
- } else {
- if(infoA1.offset > 0) {
- if(debug) {
- cout << "Not in the backward direction." << endl;
- }
- continue;
- } else {
- offset = uint64_t(-infoA1.offset);
- }
- }
-#endif
-
- // If we went too far, skip it.
- if(labs(infoA1.offset) > maxDistance) {
- if(debug) {
- cout << "Too far." << endl;
- }
- continue;
- }
-
- // Queue it.
- q.insert(make_pair(labs(infoA1.offset), segmentId1));
- if(debug) {
- cout << "Queued " << segmentId1 << endl;
- }
-
- }
-
- }
-
-
-
- // If getting here, we did not find a segment that satisfies
- // the requested criteria.
- return invalid<uint64_t>;
-}
-
-
-
-// BFS with given begin/end.
-// Does a BFS which starts at segmentIdA.
-// and ends when segmentIdB is encountered.
-// The BFS if forward if direction is 0
-// and backward if direction is 1.
-// Computes a vector of all the segments encountered,
-// excluding segmentIdA and segmentIdB,
-// in the order in which they are encountered in the BFS.
-void AssemblyGraph::targetedBfs(
- uint64_t segmentIdA,
- uint64_t segmentIdB,
- uint64_t direction,
- vector<uint64_t>& segments
- ) const
-{
-
- // Initialize the BFS.
- std::queue<uint64_t> q;
- q.push(segmentIdA);
-
- // Keep track of segments we already encountered.
- std::set<uint64_t> segmentSet;
- segmentSet.insert(segmentIdA);
-
-
-
- // BFS loop.
- segments.clear();
- while(not q.empty()) {
-
- // Dequeue a segment.
- const uint64_t segmentId0 = q.front();
- q.pop();
-
- // Loop over outgoing or incoming links.
- const auto linkIds = (direction == 0) ? linksBySource[segmentId0] : linksByTarget[segmentId0];
- for(const uint64_t linkId: linkIds) {
- const Link& link = links[linkId];
-
- // Get the segment at the other side of this link.
- const uint64_t segmentId1 = (direction==0) ? link.segmentId1 : link.segmentId0;
-
- // If we found segmentIdB, we are done.
- if(segmentId1 == segmentIdB) {
- break;
- }
-
- // If we already found it, skip it.
- if(segmentSet.contains(segmentId1)) {
- continue;
- }
-
- // Queue and store this segment.
- q.push(segmentId1);
- segments.push_back(segmentId1);
- segmentSet.insert(segmentId1);
- }
- }
-
-}
-
-
-// Assemble the assembly paths stored in the JaccardGraph.
-void AssemblyGraph::assembleJaccardGraphPaths()
-{
- const JaccardGraph& jaccardGraph = *jaccardGraphPointer;
- ofstream fasta("JaccardGraphPaths.fasta");
-
- uint64_t totalSequenceAssembled = 0;
- for(uint64_t clusterId=0; clusterId<jaccardGraph.assemblyPaths.size(); clusterId++) {
- const vector<uint64_t>& primarySegments = jaccardGraph.assemblyPaths[clusterId];
- AssemblyPath assemblyPath;
- assembleJaccardGraphPath(primarySegments, assemblyPath);
-
- const auto& sequence = assemblyPath.rawSequence;
- totalSequenceAssembled += sequence.size();
- fasta << ">" << clusterId << " " << sequence.size() << "\n";
- copy(sequence.begin(), sequence.end(), ostream_iterator<Base>(fasta));
- fasta << "\n";
- }
- cout << "Assembled a total " << totalSequenceAssembled << " bases." << endl;
-}
-
-
-
-void AssemblyGraph::assembleJaccardGraphPath(
- const vector<uint64_t>& primarySegments,
- AssemblyPath& assemblyPath)
-{
- SHASTA_ASSERT(primarySegments.size() >= 2);
-
- const JaccardGraph& jaccardGraph = *jaccardGraphPointer;
-
- // Initialize the path and add the first primary segment.
- assemblyPath.segments.clear();
- assemblyPath.links.clear();
- assemblyPath.segments.push_back(AssemblyPathSegment(primarySegments.front(), true));
-
- // Add the remaining primary and secondary segments.
- for(uint64_t i=1; i<primarySegments.size(); i++) {
- const uint64_t primarySegment0 = primarySegments[i-1];
- const uint64_t primarySegment1 = primarySegments[i];
-
- // Get the JaccardGraph vertices corresponding to these primary segments.
- const JaccardGraph::vertex_descriptor v0 = jaccardGraph.vertexTable[primarySegment0];
- const JaccardGraph::vertex_descriptor v1 = jaccardGraph.vertexTable[primarySegment1];
-
- // Get the JaccardGraph edge between these two vertices.
- JaccardGraph::edge_descriptor e;
- bool edgeWasFound = false;
- tie(e, edgeWasFound) = edge(v0, v1, jaccardGraph);
- SHASTA_ASSERT(edgeWasFound);
- const JaccardGraphEdge& edge = jaccardGraph[e];
-
- // Access the secondary segments on this edge.
- const vector<uint64_t>& secondarySegmentIds = edge.segmentIds;
-
- // Add the secondary segments between these two primary segments.
- for(const uint64_t segmentId: secondarySegmentIds) {
- AssemblyPathSegment assemblyPathSegment(segmentId, false);
- assemblyPathSegment.previousPrimarySegmentId = primarySegment0;
- assemblyPathSegment.nextPrimarySegmentId = primarySegment1;
- assemblyPath.segments.push_back(assemblyPathSegment);
-
- }
-
- // Add the next primary segment.
- assemblyPath.segments.push_back(AssemblyPathSegment(primarySegment1, true));
- }
-
- // Assemble sequence for this path.
- assemblyPath.assemble(*this);
-
-}
-
-
-
-// De Bruijn graph of the assembly graph journeys of all oriented reads.
-// Each assembly graph journey is interpreted as a sequence of segment ids.
-// Each vertex represents a sequence of K segment ids.
-template<uint64_t K> void AssemblyGraph::createDeBruijnGraphTemplated() const
-{
- // EXPOSE WHEN CODE STABILIZES.
- const uint64_t minCoverage = 8;
-
- // Type used to store the sequence of a vertex (K segment ids).
- using VertexSequence = array<uint64_t, K>;
-
- // Loop over all oriented reads to gather vertex sequences.
- vector<VertexSequence> vertexSequences;
- for(uint64_t i=0; i<assemblyGraphJourneys.size(); i++) {
-
- // Get the assembly graph journey for this oriented read.
- const span<const AssemblyGraphJourneyEntry>& journey = assemblyGraphJourneys[i];
- const uint64_t journeyLength = journey.size();
-
- // If too short, skip.
- if(journeyLength < K) {
- continue;
- }
-
- // Extract sequences of length K from the journey.
- // Loop over starting positions.
- for(uint64_t j=0; j<=journeyLength-K; j++) {
-
- // Fill in the seqyence of length K starting here.
- VertexSequence vertexSequence;
- for(uint64_t k=0; k<K; k++) {
- vertexSequence[k] = journey[j+k].segmentId;
- }
- // Store it.
- vertexSequences.push_back(vertexSequence);
- }
- }
-
- // Count how many times each sequence was found.
- vector<uint64_t> coverage;
- deduplicateAndCount(vertexSequences, coverage);
- SHASTA_ASSERT(vertexSequences.size() == coverage.size());
-
- // Each sequence with sufficient coverage generates a vertex.
- using Vertex = pair<VertexSequence, uint64_t>; // Stores sequence and coverage.
- using Graph = boost::adjacency_list<boost::listS, boost::listS, boost::bidirectionalS, Vertex>;
- using vertex_descriptor = Graph::vertex_descriptor;
- Graph graph;
-
- for(uint64_t i=0; i<vertexSequences.size(); i++) {
- const uint64_t c = coverage[i];
- if(c >= minCoverage) {
- add_vertex(Vertex(vertexSequences[i], c), graph);
- }
- }
-
- // To generate edges, index the vertices by their (K-1)-prefix.
- using Prefix = array<uint64_t, K-1>;
- std::map<Prefix, vector<vertex_descriptor> > vertexMap;
- BGL_FORALL_VERTICES_T(v, graph, Graph) {
- const VertexSequence& sequence = graph[v].first;
- Prefix prefix;
- copy(sequence.begin(), sequence.begin()+K-1, prefix.begin());
- vertexMap[prefix].push_back(v);
- }
-
- // Now we can generate the edges.
- using Suffix = Prefix;
- BGL_FORALL_VERTICES_T(v0, graph, Graph) {
- const VertexSequence& sequence = graph[v0].first;
- Suffix suffix;
- copy(sequence.begin()+1, sequence.end(), suffix.begin());
- auto it = vertexMap.find(suffix);
- if(it == vertexMap.end()) {
- continue;
- }
- for(const vertex_descriptor v1: it->second) {
- add_edge(v0, v1, graph);
- }
- }
- cout << "The DeBruijn graph has " << num_vertices(graph) << " vertices and " <<
- num_edges(graph) << " edges." << endl;
-
- // Write it out.
- ofstream dot("DeBruijnGraph.dot");
- dot << "digraph DeBruijnGraph {\n";
- BGL_FORALL_VERTICES_T(v, graph, Graph) {
- const uint64_t coverage = graph[v].second;
- dot << "\"" << v << "\" ["
- "tooltip=\"" << coverage << "\" "
- "width=" << 0.001*double(coverage) <<
- "];\n";
- }
- BGL_FORALL_EDGES_T(e, graph, Graph) {
- const vertex_descriptor v0 = source(e, graph);
- const vertex_descriptor v1 = target(e, graph);
- dot << "\"" << v0 << "\"->\"" << v1 << "\";\n";
- }
- dot << "}\n";
-}
-
-
-
-void AssemblyGraph::createDeBruijnGraph() const
-{
- createDeBruijnGraphTemplated<3>();
-}
-
-
-
-void AssemblyGraph::assembleSegments()
-{
- createNew(segmentSequences, "Mode3-SegmentSequences");
- createNew(segmentVertexOffsets, "Mode3-SegmentVertexOffsets");
- for(uint64_t segmentId=0; segmentId<markerGraphPaths.size(); segmentId++) {
- assembleSegment(segmentId);
- }
-}
-void AssemblyGraph::assembleSegment(uint64_t segmentId)
-{
- // Assemble it.
- AssembledSegment assembledSegment;
- assembleMarkerGraphPath(
- readRepresentation,
- k,
- markers,
- markerGraph,
- markerGraphPaths[segmentId],
- false,
- assembledSegment);
-
- // Store assembled sequence and vertex offsets.
- segmentSequences.appendVector(assembledSegment.rawSequence);
- segmentVertexOffsets.appendVector(assembledSegment.vertexOffsets);
-
-}
-
-
-
-// Assemble a link, given a set of allowed OrientedReadId(s).
-// The returned sequence overrides:
-// - The trim0 last bases of the preceding segment.
-// - The trim1 first bases of the following segment.
-void AssemblyGraph::assembleLink(
- uint64_t linkId,
- const vector<OrientedReadId>& allowedOrientedReadIds,
- vector<Base>& sequence, // The entire MSA sequence
- uint64_t& leftTrim, // The number of MSA sequence to be trimmed on the left for assembly
- uint64_t& rightTrim, // The number of MSA sequence to be trimmed on the left for assembly
- uint64_t& trim0, // The number of bases at the end of segment0 to be trimmed for assembly
- uint64_t& trim1, // The number of bases at the beginning of segment1 to be trimmed for assembly
- ostream& html
-) const
-{
- const bool debug = false;
- const Link& link = links[linkId];
-
- SHASTA_ASSERT(std::is_sorted(allowedOrientedReadIds.begin(), allowedOrientedReadIds.end()));
-
- // If the preceding and last segment are adjacent,
- // assembling the link is trivial.
- // We return an empty sequence which overrides
- // the last k/2 bases of the preceding segment
- // and the first k/2 bases of the following segment.
- // This was the resulting segment sequences are exactly adjacent.
- if(link.segmentsAreAdjacent) {
- sequence.clear();
- leftTrim = 0;
- rightTrim = 0;
- trim0 = k / 2;
- trim1 = k / 2;
- return;
- }
-
- // If getting here, the two segments of this link are not adjacent.
- // Get some infomation we are going to need below.
- const uint64_t segmentId0 = link.segmentId0;
- const uint64_t segmentId1 = link.segmentId1;
- const auto sequence0 = segmentSequences[segmentId0];
- const auto sequence1 = segmentSequences[segmentId1];
- SHASTA_ASSERT(not sequence0.empty());
- SHASTA_ASSERT(not sequence1.empty());
- const auto vertexOffsets0 = segmentVertexOffsets[segmentId0];
- const auto vertexOffsets1 = segmentVertexOffsets[segmentId1];
-
-
- // First, find:
- // - The position in segmentId0 of the leftmost transition.
- // - The position in segmentId1 of the rightmost transition.
- uint64_t minEdgePosition0 = markerGraphPaths[segmentId0].size();
- uint64_t maxEdgePosition1 = 0;
- for(const auto &p : transitions[linkId]) {
- const OrientedReadId orientedReadId = p.first;
-
- // If not one of the allowed OrientedReadId(s), skip it.
- if(not binary_search(allowedOrientedReadIds.begin(), allowedOrientedReadIds.end(), orientedReadId)) {
- continue;
- }
-
- // Access the transition from segmentId0 to segmentId1 for this oriented read.
- const Transition &transition = p.second;
-
- minEdgePosition0 = min(minEdgePosition0,
- uint64_t(transition[0].position));
- maxEdgePosition1 = max(maxEdgePosition1,
- uint64_t(transition[1].position));
- }
-
- // When getting here:
- // - minEdgePosition0 is the leftmost position of the transitions in path0.
- // - maxEdgePosition1 is the rightmost position of the transitions in path1.
- // These positions are edge positions in markerGraphPath0 and markerGraphPath1.
- // We will do a multiple sequence alignment of the oriented reads,
- // using the sequence of segmentId0 to extend to the left all reads to minEdgePosition0,
- // and using the sequence of segmentId1 to extend to the right all reads to maxEdgePosition1,
-
- // Get the corresponding vertex positions in segmentId0 and segmentId1.
- const uint64_t minVertexPosition0 = minEdgePosition0 + 1;
- const uint64_t maxVertexPosition1 = maxEdgePosition1;
-
- // To compute an MSA anchored at both sides,we will extend the
- // sequence of each read to the left/right using the sequence of
- // adjacent segments.
-
-
- // Now extract the portion of each oriented read sequence that
- // will be used to assemble this link.
- vector<OrientedReadId> orientedReadIdsForAssembly;
- vector<vector<Base> > orientedReadsSequencesForAssembly;
- for(const auto &p : transitions[linkId]) {
- const OrientedReadId orientedReadId = p.first;
-
- // If not one of the allowed OrientedReadId(s), skip it.
- if(not binary_search(allowedOrientedReadIds.begin(), allowedOrientedReadIds.end(), orientedReadId)) {
- continue;
- }
-
- // Access the transition from segmentId0 to segmentId1 for this oriented read.
- const Transition &transition = p.second;
-
- // Get the ordinals of the last appearance of this oriented
- // read on segmentId0 and the first on segmentId1,
- // and the corresponding markers.
- const uint32_t ordinal0 = transition[0].ordinals[1];
- const uint32_t ordinal1 = transition[1].ordinals[0];
- const CompressedMarker &marker0 = markers[orientedReadId.getValue()][ordinal0];
- const CompressedMarker &marker1 = markers[orientedReadId.getValue()][ordinal1];
-
- // Get the positions of these markers on the oriented read.
- // If using RLE, these are RLE positions.
- const uint32_t position0 = marker0.position;
- const uint32_t position1 = marker1.position;
-
- // Extract the sequence between these markers (including the markers).
- vector<Base> orientedReadSequence;
- for(uint64_t position = position0;
- position < position1 + k; position++) {
- const Base b = reads.getOrientedReadBase(orientedReadId, uint32_t(position));
- orientedReadSequence.push_back(b);
- }
-
- // We need to extend the sequence of this read to the left,
- // using segmentId0 sequence, up to minVertexPosition0,
- // so the portions of all reads we will be using for the MSA
- // all begin in the same place.
- vector<Base> leftSequence;
- vector<uint32_t> leftRepeatCounts;
- const uint64_t vertexPosition0 = transition[0].position + 1; // Add 1 to get vertex position.
- const uint64_t begin0 =
- vertexOffsets0[minVertexPosition0];
- const uint64_t end0 = vertexOffsets0[vertexPosition0];
- for(uint64_t position = begin0; position != end0; position++) {
- leftSequence.push_back(sequence0[position]);
- }
-
- vector<Base> rightSequence;
- const uint64_t vertexPosition1 = transition[1].position;
- const uint64_t begin1 = vertexOffsets1[vertexPosition1] + k;
- const uint64_t end1 = vertexOffsets1[maxVertexPosition1] + k;
- for(uint64_t position = begin1; position != end1; position++) {
- rightSequence.push_back(sequence1[position]);
- }
-
- // Construct the extended sequence for this oriented read,
- // to be used in the MSA.
- vector<Base> orientedReadExtendedSequence;
- const auto addToExtendedSequence = back_inserter(orientedReadExtendedSequence);
- copy(leftSequence, addToExtendedSequence);
- copy(orientedReadSequence, addToExtendedSequence);
- copy(rightSequence, addToExtendedSequence);
-
- orientedReadIdsForAssembly.push_back(orientedReadId);
- orientedReadsSequencesForAssembly.push_back(orientedReadExtendedSequence);
-
- if(debug) {
- copy(orientedReadExtendedSequence, ostream_iterator<Base>(cout));
- cout << " " << orientedReadId << endl;
- }
- }
-
-
-
- // Compute the consensus sequence for the link.
- if(html) {
- html << "<h2>Link " << linkId << "</h2>\n";
- }
- vector<Base> msaRleSequence;
- computeLinkConsensusUsingSpoa(
- orientedReadIdsForAssembly,
- orientedReadsSequencesForAssembly,
- consensusCaller,
- debug,
- html,
- msaRleSequence);
-
- if(debug) {
- cout << "Consensus RLE sequence length before trimming " << msaRleSequence.size() << endl;
- cout << "Portion of segment on left involved in the MSA begins at position " <<
- vertexOffsets0[minVertexPosition0] << endl;
- cout << "Portion of segment on right involved in the MSA ends at position " <<
- vertexOffsets1[maxVertexPosition1] + k << endl;
- }
-
- // Count the number of identical (RLE) bases at the beginning of the
- // link consensus sequence and of the segmentId0 sequence portion
- // involved in assembling this link.
- uint64_t identicalOnLeft = 0;
- const uint64_t begin0 = vertexOffsets0[minVertexPosition0];
- const uint64_t end0 = sequence0.size();
- for(uint64_t i=begin0; (i!=end0 and (i-begin0)<msaRleSequence.size()); i++) {
- if(msaRleSequence[i-begin0] == sequence0[i]) {
- // cout << "*** " << begin0 << " " << end0 << " " << i << endl;
- ++identicalOnLeft;
- } else {
- break;
- }
- }
- if(debug) {
- cout << "Identical on left: " << identicalOnLeft << endl;
- }
-
- // Count the number of identical (RLE) bases at the end of the
- // link consensus sequence and the beginning of segmentId1 .
- uint64_t identicalOnRight = 0;
- const uint64_t end1 = vertexOffsets1[maxVertexPosition1] + k;
- for(uint64_t i=end1-1; ; i--) {
- const uint64_t j = msaRleSequence.size() - (end1 - i);
- if(msaRleSequence[j] == sequence1[i]) {
- // cout << "*** " << i << " " << assembledSegment1.runLengthSequence[i] << " " <<
- // j << " " << consensusRleSequence[j] << endl;
- ++identicalOnRight;
- } else {
- break;
- }
- if(i == 0) {
- break;
- }
- if(j == 0) {
- break;
- }
- }
- identicalOnRight = min(identicalOnRight, msaRleSequence.size()-identicalOnLeft);
- if(debug) {
- cout << "Identical on right: " << identicalOnRight << endl;
- }
-
- // Trim these identical bases from the link consensus sequence.
- leftTrim = identicalOnLeft;
- rightTrim = identicalOnRight;
-
- // Compute and store the number of bases to be trimmed at the end of segmentId0
- // and at the beginning of segmentId1.
- trim0 =
- sequence0.size() -
- vertexOffsets0[minVertexPosition0] -
- identicalOnLeft;
- trim1 =
- vertexOffsets1[maxVertexPosition1] + k
- - identicalOnRight;
-}
-
-
-
-void AssemblyGraph::computeLinkConsensusUsingSpoa(
- const vector<OrientedReadId> orientedReadIds,
- const vector< vector<Base> > rleSequences,
- const ConsensusCaller&,
- bool debug,
- ostream& html,
- vector<Base>& consensusRleSequence
- )
-{
- SHASTA_ASSERT(0);
-}
diff --git a/src/mode3.hpp b/src/mode3.hpp
deleted file mode 100644
index 39285d6..0000000
--- a/src/mode3.hpp
+++ /dev/null
@@ -1,710 +0,0 @@
-#ifndef SHASTA_MODE3_HPP
-#define SHASTA_MODE3_HPP
-
-/*******************************************************************************
-
-Class mode3::AssemblyGraph is the class used for Mode 3 assembly.
-Using GFA terminology, the graph consists of Segments and Links.
-
-A Segment corresponds to a linear sequence of edges, without branches,
-in the marker graph.
-
-If an oriented read enters segment 1 immediately after exiting segment 0,
-we say that there is a transition 0->1. If there is a sufficient
-number of transitions 0->1, we create a link 0->1.
-
-*******************************************************************************/
-
-// Shasta.
-#include "invalid.hpp"
-#include "MemoryMappedVectorOfVectors.hpp"
-#include "MultithreadedObject.hpp"
-#include "ReadId.hpp"
-#include "shastaTypes.hpp"
-
-// Boost libraries.
-#include <boost/graph/adjacency_list.hpp>
-
-// Standard library.
-#include "array.hpp"
-#include "memory.hpp"
-#include "tuple.hpp"
-#include "unordered_map"
-#include "vector.hpp"
-
-
-
-namespace shasta {
- namespace mode3 {
- class AssemblyGraph;
- class AssemblyGraphJourneyEntry;
- class MarkerGraphJourneyEntry;
- class AssemblyGraphJourneyInterval;
- class AssemblyPath;
- class JaccardGraph;
- class JaccardGraphEdgeInfo;
- class SegmentPairInformation;
- class Transition;
-
- }
-
- // Some forward declarations of classes in the shasta namespace.
- class AssembledSegment;
- class Base;
- class ConsensusCaller;
- class Reads;
- class CompressedMarker;
- class MarkerGraph;
-
- extern template class MultithreadedObject<mode3::AssemblyGraph>;
-}
-
-
-
-// The marker graph journey of an oriented read is the sequence
-// of marker graph edges it encounters.
-// (An oriented read encounters a marker graph edge
-// if the oriented read appears in the marker intervals for the edge).
-// The marker graph journey of an oriented read is not necessarily
-// a path in the marker graph because the oriented read
-// can "skip" marker graph edges due to errors.
-// In other places in Shasta, journeys are called "pseudopaths".
-// We describe the marker graph journey of each oriented read as a sequence
-// of MarkerGraphJourneyEntry objects.
-// The MarkerGraphJourneyEntry identifies a marker graph edge
-// by the segmentId in the assembly graph and the position in that segment
-// (that is, the first marker graph in the segment is at
-// position 0, and so on).
-class shasta::mode3::MarkerGraphJourneyEntry {
-public:
- uint64_t segmentId;
- uint32_t position;
- array<uint32_t, 2> ordinals;
-
- bool operator<(const MarkerGraphJourneyEntry& that) const
- {
- return ordinals[0] < that.ordinals[0];
- }
- bool operator==(const MarkerGraphJourneyEntry& that) const
- {
- return
- tie(segmentId, position, ordinals) ==
- tie(that.segmentId, that.position, that.ordinals);
- }
-};
-
-
-
-// The assembly graph journey of an oriented read is the sequence
-// of assembly graph segments (vertices) it encounters.
-// The journey on an oriented read is not necessarily
-// a path in the assembly graph because the oriented read
-// can "skip" segments due to errors.
-// We store the assembly graph journey of each oriented read as a sequence
-// of AssemblyGraphJourneyEntry objects.
-// The AssemblyGraphJourneyEntry stores the segment id and
-// the first and last MarkerGraphJourneyEntry objects
-// on the segment for the given oriented read.
-// Indexed by OrientedReadId::getValue().
-// Note a segmentId can appear more than once in the assembly
-// graph journey of an oriented read. This can happen
-// if the oriented read "goes around" in a tangle caused by repeats.
-class shasta::mode3::AssemblyGraphJourneyEntry {
-public:
- uint64_t segmentId;
-
- // The first and last MarkerGraphJourneyEntry that contributed to this
- // AssemblyGraphJourneyEntry.
- array<MarkerGraphJourneyEntry, 2> markerGraphJourneyEntries;
-};
-
-
-
-// A portion of the assembly graph journey of an oriented read.
-class shasta::mode3::AssemblyGraphJourneyInterval {
-public:
-
- OrientedReadId orientedReadId;
-
- // The first and last position in the assembly graph journey
- // for this oriented read.
- uint64_t first;
- uint64_t last;
-
-
- bool operator<(const AssemblyGraphJourneyInterval& that) const
- {
- return tie(orientedReadId, first) < tie(that.orientedReadId, that.first);
- }
-
-};
-
-
-
-// A transition is a sequence of two consecutive positions
-// in the assembly graph journey of an oriented reads.
-// In other words, it describes the transition of an oriented read
-// from a segment to the next segment it encounters.
-// Transitions are used to create edges in the AssemblyGraph (gfa links).
-// Indexed by the linkId. For each link, they are sorted.
-class shasta::mode3::Transition : public array<MarkerGraphJourneyEntry, 2> {
-public:
- Transition(const array<MarkerGraphJourneyEntry, 2>& x) : array<MarkerGraphJourneyEntry, 2>(x) {}
- Transition() {}
-};
-
-
-
-// The AssemblyGraph is used to store the Mode 3 assembly graph,
-// when it no longer needs to be changed,
-// in memory mapped data structures.
-class shasta::mode3::AssemblyGraph :
- public MultithreadedObject<AssemblyGraph> {
-public:
-
- // Initial construction.
- AssemblyGraph(
- const string& largeDataFileNamePrefix,
- size_t largeDataPageSize,
- size_t threadCount,
- uint64_t readRepresentation,
- uint64_t k, // Marker length
- const Reads& reads,
- const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
- const MarkerGraph&,
- const ConsensusCaller& consensusCaller);
-
- // Constructor from binary data.
- AssemblyGraph(
- const string& largeDataFileNamePrefix,
- uint64_t readRepresentation,
- uint64_t k, // Marker length
- const Reads& reads,
- const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
- const MarkerGraph&,
- const ConsensusCaller& consensusCaller);
-
- // Data and functions to handle memory mapped data.
- const string& largeDataFileNamePrefix;
- size_t largeDataPageSize;
- string largeDataName(const string&) const;
- template<class T> void createNew(T& t, const string& name)
- {
- t.createNew(largeDataName(name), largeDataPageSize);
- }
- template<class T> void accessExistingReadOnly(T& t, const string& name)
- {
- t.accessExistingReadOnly(largeDataName(name));
- }
-
- // References or copies for Assembler objects.
- uint64_t readRepresentation;
- uint64_t k;
- const Reads& reads;
- const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers;
- const MarkerGraph& markerGraph;
- const ConsensusCaller& consensusCaller;
-
- uint64_t readCount() const
- {
- return markers.size() / 2;
- }
-
- // Each linear chain of marker graph edges generates a segment.
- // The marker graph path corresponding to each segment is stored
- // indexed by segment id.
- MemoryMapped::VectorOfVectors<MarkerGraphEdgeId, uint64_t> markerGraphPaths;
- void createSegmentPaths();
-
- // Average marker graph edge coverage for each segment.
- MemoryMapped::Vector<float> segmentCoverage;
- void computeSegmentCoverage();
-
- // Assembled sequence for each segment.
- // For each segment we store the entire sequance, including
- // the complete sequences of the first and last vertex.
- // When writing to gfa, we skip the first and last k/2 bases.
- MemoryMapped::VectorOfVectors<Base, uint64_t> segmentSequences;
- MemoryMapped::VectorOfVectors<uint32_t, uint64_t> segmentVertexOffsets; // Filed in by assembleSegment.
- void assembleSegments();
- void assembleSegment(uint64_t segmentId);
-
- // Keep track of the segment and position each marker graph edge corresponds to.
- // For each marker graph edge, store in the marker graph edge table
- // the corresponding segment id and position in the path, if any.
- // Indexed by the edge id in the marker graph.
- // This is needed when computing assembly graph journeys.
- MemoryMapped::Vector< pair<uint64_t, uint32_t> > markerGraphEdgeTable;
- void computeMarkerGraphEdgeTable(size_t threadCount);
- void computeMarkerGraphEdgeTableThreadFunction(size_t threadId);
-
-
-
- // The marker graph journeys of all oriented reads.
- // Indexed by OrientedReadId::getValue().
- // This is only stored temporarily and used to compute assembly graph journeys.
- MemoryMapped::VectorOfVectors<MarkerGraphJourneyEntry, uint64_t> markerGraphJourneys;
- void computeMarkerGraphJourneys(size_t threadCount);
- void computeMarkerGraphJourneysPass1(size_t threadId);
- void computeMarkerGraphJourneysPass2(size_t threadId);
- void computeMarkerGraphJourneysPass12(uint64_t pass);
- void sortMarkerGraphJourneys(size_t threadId);
-
-
-
- // The assembly graph journeys of all oriented reads.
- // Indexed by OrientedReadId::getValue().
- MemoryMapped::VectorOfVectors<AssemblyGraphJourneyEntry, uint64_t> assemblyGraphJourneys;
- void computeAssemblyGraphJourneys();
- void computeAssemblyGraphJourney(
- const span<MarkerGraphJourneyEntry> markerGraphJourney,
- vector<AssemblyGraphJourneyEntry>& assemblyGraphJourney);
-
- // Store appearances of segments in assembly graph journeys.
- // For each segment, store pairs (orientedReadId, position in assembly graph journey).
- // Indexed by the segmentId.
- // For each segment, they are sorted.
- MemoryMapped::VectorOfVectors<pair<OrientedReadId, uint64_t>, uint64_t>
- assemblyGraphJourneyInfos;
- void computeAssemblyGraphJourneyInfos();
-
- //Coverage is th enumber of oriented reads that appear in this segment.
- // This is not the same as average coverage on marker graph vertices or edges.
- uint64_t coverage(uint64_t segmentId) const
- {
- return assemblyGraphJourneyInfos.size(segmentId);
- }
-
- // Find out if a segment contains a given OrientedReadId.
- // This returns true if assemblyGraphJourneyInfos[segmentId]
- // contains an entry with the given OrientedReadId.
- bool segmentContainsOrientedRead(
- uint64_t segmentId,
- OrientedReadId) const;
-
-
-
- using SegmentPair = pair<uint64_t, uint64_t>;
- using Transitions = vector< pair<OrientedReadId, Transition> >;
- std::map<SegmentPair, Transitions> transitionMap;
- void findTransitions(std::map<SegmentPair, Transitions>& transitionMap);
-
-
-
- // The links.
- class Link {
- public:
- uint64_t segmentId0;
- uint64_t segmentId1;
-
- // Flag to indicate whether the two segments are adjacent.
- // This is set if the last marker graph vertex of segmentId0
- // is the same as the first marker graph vertex of segmentId1.
- // In that case the separation will be set to 0.
- // However, the separation is just an estimate, so it
- // could be 0 even when the segments are ot adjacent.
- bool segmentsAreAdjacent;
-
- // Estimated separation in markers.
- int32_t separation;
-
-
- Link(
- uint64_t segmentId0 = 0,
- uint64_t segmentId1 = 0) :
- segmentId0(segmentId0),
- segmentId1(segmentId1) {}
- };
- MemoryMapped::Vector<Link> links;
- void createLinks(
- const std::map<SegmentPair, Transitions>& transitionMap,
- uint64_t minCoverage);
-
- // The transitions for each link.
- // Indexed by linkId.
- MemoryMapped::VectorOfVectors< pair<OrientedReadId, Transition>, uint64_t> transitions;
- uint64_t linkCoverage(uint64_t linkId) const
- {
- return transitions.size(linkId);
- }
-
- // Assemble a link, given a set of allowed OrientedReadId(s).
- // The returned sequence overrides:
- // - The trim0 last bases of the preceding segment.
- // - The trim1 first bases of the following segment.
- void assembleLink(
- uint64_t linkId,
- const vector<OrientedReadId>& allowedOrientedReadIds,
- vector<Base>& sequence, // The entire MSA sequence
- uint64_t& leftTrim, // The number of MSA sequence to be trimmed on the left for assembly
- uint64_t& rightTrim, // The number of MSA sequence to be trimmed on the left for assembly
- uint64_t& trim0, // The number of bases at the end of segment0 to be trimmed for assembly
- uint64_t& trim1, // The number of bases at the beginning of segment1 to be trimmed for assembly
- ostream& html
- ) const;
-
- // Use spoa to compute consensus sequence for a link.
- static void computeLinkConsensusUsingSpoa(
- const vector<OrientedReadId> orientedReadIds,
- const vector< vector<Base> > rleSequences,
- const ConsensusCaller&,
- bool debug,
- ostream& html,
- vector<Base>& consensusRleSequence
- );
-
- // The links for each source or target segments.
- // Indexed by segment id.
- MemoryMapped::VectorOfVectors<uint64_t, uint64_t> linksBySource;
- MemoryMapped::VectorOfVectors<uint64_t, uint64_t> linksByTarget;
- void createConnectivity();
- uint64_t findLink(uint64_t segmentId0, uint64_t segmentId1) const;
-
-
- // Flag back-segments.
- // This does not do a full blown search for locally strongly connected components.
- // A segment is marked as a back-segment if:
- // - It has only a single incoming link.
- // - It has a single outgoing link.
- // - The incoming and outgoing links both connect to/from the same segment.
- void flagBackSegments();
- MemoryMapped::Vector<bool> isBackSegment;
-
-
-
- // Get the children or parents of a given segment.
- // Only use links with at least a specified coverage.
- void getChildren(
- uint64_t segmentId,
- uint64_t minimumLinkCoverage,
- vector<uint64_t>&
- ) const;
- void getParents(
- uint64_t segmentId,
- uint64_t minimumLinkCoverage,
- vector<uint64_t>&
- ) const;
- void getChildrenOrParents(
- uint64_t segmentId,
- uint64_t direction, // 0=forward (children), 1=backward (parents).
- uint64_t minimumLinkCoverage,
- vector<uint64_t>&
- ) const;
-
-
- // Find descendants of a given segment, up to a given distance in the graph.
- void findDescendants(
- uint64_t segmentId,
- uint64_t maxDistance,
- vector<uint64_t>& segmentIds
- ) const;
-
- // BFS with given begin/end.
- // Does a BFS which starts at segmentIdA.
- // and ends when segmentIdB is encountered.
- // The BFS if forward if direction is 0
- // and backward if direction is 1.
- // Computes a vector of all the segments encountered,
- // excluding segmentIdA and segmentIdB,
- // in the order in which they are encountered in the BFS.
- void targetedBfs(
- uint64_t segmentIdA,
- uint64_t segmentIdB,
- uint64_t direction,
- vector<uint64_t>& segments
- ) const;
-
- void writeGfa(const string& baseName) const;
-
- // Find the distinct oriented reads that appear on the path
- // of a segment. Also return the average edge coverage for the path.
- double findOrientedReadsOnSegment(
- uint64_t segmentId,
- vector<OrientedReadId>&) const;
-
-
-
- // Get information about the oriented reads that appear on the
- // marker graph path of a segment.
- class SegmentOrientedReadInformation {
- public:
-
- // The oriented reads on this segment,
- // each storage with an average offset relative to the segment.
- class Info {
- public:
- OrientedReadId orientedReadId;
-
- // The average offset, in markers, between the
- // beginning of this oriented read and the
- // beginning of the segment.
- int32_t averageOffset;
- };
- vector<Info> infos;
- };
- void getOrientedReadsOnSegment(
- uint64_t segmentId,
- SegmentOrientedReadInformation&) const;
-
- // Oriented read information for each segment.
- // This is only stored when needed.
- vector<SegmentOrientedReadInformation> segmentOrientedReadInformation;
- void storeSegmentOrientedReadInformation(size_t threadCount);
- void storeSegmentOrientedReadInformationThreadFunction(size_t threadId);
-
-
-
- // Estimate the offset between two segments.
- // Takes as input SegmentOrientedReadInformation objects
- // for the two segments.
- // Common oriented reads between the two segments are used
- // to estimate the average offset, in markers,
- // between the beginning of the segments.
- // The number of common oriented reads
- // is computed and stored in the last argument.
- // If that is zero, the computed offset is not valid.
- void estimateOffset(
- const SegmentOrientedReadInformation& info0,
- const SegmentOrientedReadInformation& info1,
- int64_t& offset,
- uint64_t& commonOrientedReadCount
- ) const;
-
-
-
- // Analyze a pair of segments for common oriented reads,
- // offsets, missing reads, etc.
- void analyzeSegmentPair(
- uint64_t segmentId0,
- uint64_t segmentId1,
- const SegmentOrientedReadInformation& info0,
- const SegmentOrientedReadInformation& info1,
- const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
- SegmentPairInformation&
- ) const;
-
- // Count the number of common oriented reads between a segment and a link,
- // without counting oriented reads that appear more than once on the
- // segment or on the link.
- void analyzeSegmentLinkPair(
- uint64_t segmentId,
- uint64_t linkId,
- uint64_t& commonOrientedReadCount
- ) const;
-
-
-
-#if 0
- // Find segment pairs a sufficient number of common reads
- // and with low unexplained fraction (in both directions)
- // between segmentId0 and one of its descendants within the specified distance.
- // This requires the vector segmentOrientedReadInformation above to be
- // available.
- void findSegmentPairs(
- uint64_t segmentId0,
- uint64_t maxDistance,
- uint64_t minCommonReadCount,
- double maxUnexplainedFraction,
- vector<uint64_t>& segmentIds1
- ) const;
-
-
- // Cluster the segments based on read composition.
- // We find segment pairs a sufficient number of common reads
- // and with low unexplained fraction (in both directions).
- void clusterSegments(size_t threadCount, uint64_t minClusterSize);
- class ClusterSegmentsData {
- public:
-
- // The segment pairs found by each thread.
- // In each pair, the lower number segment comes first.
- vector< vector< pair<uint64_t, uint64_t> > > threadPairs;
- };
- ClusterSegmentsData clusterSegmentsData;
- void clusterSegmentsThreadFunction1(size_t threadId);
- void addClusterPairs(size_t threadId, uint64_t segmentId0);
-#endif
-
- // The cluster that each segment belongs to.
- // Each connected component of the Jaccard graph corresponds to a cluster.
- MemoryMapped::Vector<uint64_t> clusterIds;
-
-
-
- // Analyze a subgraph of the assembly graph.
-
- // Classes used in analyzeSubgraph.
- class AnalyzeSubgraphClasses {
- public:
-
- // A JourneySnippet describes a sequence of consecutive positions
- // of the assembly graph journey of an oriented read.
- // An OrientedReadId can have than more one JourneySnippet in a given subgraph,
- // but this is not common. It can happen if the assembly graph contains a cycle.
- class JourneySnippet {
- public:
-
- // The OrientedReadId this refers to.
- OrientedReadId orientedReadId;
-
- // The sequence of segments encountered.
- vector<uint64_t> segmentIds;
-
- // The first and last position of this snippet
- // in the assembly graph journey of this OrientedReadId.
- uint64_t firstPosition;
- uint64_t lastPosition() const
- {
- return firstPosition + segmentIds.size() - 1;
- }
- };
-
- // A Cluster is a set of JourneySnippet's.
- class Cluster {
- public:
-
- // The snippets in this cluster.
- vector<JourneySnippet> snippets;
- uint64_t coverage() const
- {
- return snippets.size();
- }
-
- // The segments visited by the snippets of this cluster,
- // each stored with its coverage (number of snippets);
- vector< pair<uint64_t, uint64_t > > segments;
- vector<uint64_t> getSegments() const;
-
- // Remove segments with coverage less than the specified value.
- void cleanupSegments(uint64_t minClusterCoverage);
-
- // Construct the segments given the snippets.
- void constructSegments();
- };
-
-
-
- // The SnippetGraph is used by analyzeSubgraph2.
- // A vertex represents a set of snippets and stores
- // the corresponding snippet indexes.
- // An edge x->y is created if there is at least one snippet in y
- // that is an approximate subset of a snippet in x.
- // Strongly connected components are condensed, so after that
- //the graph is guaranteed to have no cycles.
- class SnippetGraphVertex {
- public:
- vector<uint64_t> snippetIndexes;
- uint64_t clusterId = invalid<uint64_t>;
- SnippetGraphVertex() {}
- SnippetGraphVertex(uint64_t snippetIndex) :
- snippetIndexes(1, snippetIndex) {}
- };
- using SnippetGraphBaseClass =
- boost::adjacency_list<boost::setS, boost::listS, boost::bidirectionalS, SnippetGraphVertex>;
- class SnippetGraph : public SnippetGraphBaseClass {
- public:
- uint64_t clusterCount = 0;
- void findDescendants(const vertex_descriptor, vector<vertex_descriptor>&) const;
- void writeGraphviz(const string& fileName) const;
- };
- };
-
-
-
- void analyzeSubgraph(
- const vector<uint64_t>& segmentIds,
- vector<AnalyzeSubgraphClasses::Cluster>&,
- bool debug) const;
- template<uint64_t N> void analyzeSubgraphTemplate(
- const vector<uint64_t>& segmentIds,
- vector<AnalyzeSubgraphClasses::Cluster>&,
- bool debug) const;
-
- // Given a segment, use a BFS to move in the specified direction until
- // we find a segment with sufficiently high Jaccard similarity
- // and number of common reads.
- // This returns invalid<uint64_t> if no such segment is found
- // within the specified distance.
- uint64_t findSimilarSegmentBfs(
- uint64_t segmentId,
- uint64_t direction, // 0 = forward, 1 = backward
- uint64_t maxDistance,
- uint64_t minCommon,
- double minJaccard) const;
-
- // Given a segment, move in the specified direction,
- // in order of increasing distance in markers, until
- // we find a segment with sufficiently high Jaccard similarity
- // and number of common reads.
- // This returns invalid<uint64_t> if no such segment is found
- // within the specified distance.
- uint64_t findSimilarSegment(
- uint64_t segmentId,
- uint64_t direction, // 0 = forward, 1 = backward
- uint64_t maxDistance, // In markers
- uint64_t minLinkCoverage,
- int32_t minLinkSeparation,
- uint64_t minCommon,
- double maxUnexplainedFraction,
- double minJaccard,
- vector<uint64_t>& segments) const;
-
- // Create an assembly path starting at a given segment.
- void createAssemblyPath(
- uint64_t segmentId,
- uint64_t direction, // 0 = forward, 1 = backward
- AssemblyPath&
- ) const;
-
- // Compute link separation given a set of Transitions.
- template<class Container> static double linkSeparation(
- const Container& transitions,
- uint64_t pathLength0)
- {
- double averageLinkSeparation = 0.;
-
- for(const pair<OrientedReadId, Transition>& p: transitions) {
- const Transition& transition = p.second;
- const MarkerGraphJourneyEntry& entry0 = transition[0];
- const MarkerGraphJourneyEntry& entry1 = transition[1];
-
- SHASTA_ASSERT(entry1.ordinals[0] >= entry0.ordinals[1]);
-
- const int64_t linkSeparation =
- int64_t(entry1.ordinals[0] - entry0.ordinals[1]) -
- int64_t(pathLength0 - 1 - entry0.position) -
- int64_t(entry1.position);
- averageLinkSeparation += double(linkSeparation);
- }
- averageLinkSeparation /= double(transitions.size());
-
- return averageLinkSeparation;
- }
-
- // Jaccard graph.
- shared_ptr<JaccardGraph> jaccardGraphPointer;
- void createJaccardGraph(size_t threadCount);
- void createJaccardGraphThreadFunction(size_t threadId);
- void createJaccardGraphEdges(
- uint64_t segmentId,
- vector<JaccardGraphEdgeInfo>& edges);
- void createJaccardGraphEdges(
- uint64_t segmentId,
- uint64_t direction,
- vector<JaccardGraphEdgeInfo>& edges);
-
- // Assemble the assembly paths stored in the JaccardGraph.
- void assembleJaccardGraphPaths();
- void assembleJaccardGraphPath(const vector<uint64_t>& primarySegments, AssemblyPath&);
-
- // De Bruijn graph of the assembly graph journeys of all oriented reads.
- // Each assembly graph journey is interpreted as a sequence of segment ids.
- void createDeBruijnGraph() const;
- template<uint64_t K> void createDeBruijnGraphTemplated() const;
-};
-
-
-
-
-#endif
-
diff --git a/src/removeReciprocalEdges.hpp b/src/removeReciprocalEdges.hpp
new file mode 100644
index 0000000..4dbd1ac
--- /dev/null
+++ b/src/removeReciprocalEdges.hpp
@@ -0,0 +1,34 @@
+#ifndef SHASTA_REMOVE_RECIPROCAL_EDGES_HPP
+#define SHASTA_REMOVE_RECIPROCAL_EDGES_HPP
+
+#include <boost/graph/iteration_macros.hpp>
+#include "vector.hpp"
+
+namespace shasta {
+ template<class Graph> void removeReciprocalEdges(Graph&);
+}
+
+
+
+template<class Graph> void shasta::removeReciprocalEdges(Graph& graph)
+{
+ vector<typename Graph::edge_descriptor> edgesTobeRemoved;
+
+ BGL_FORALL_EDGES_T(e, graph, Graph) {
+ const typename Graph::vertex_descriptor v0 = source(e, graph);
+ const typename Graph::vertex_descriptor v1 = target(e, graph);
+
+ bool reverseEdgeExists = false;
+ tie(ignore, reverseEdgeExists) = boost::edge(v1, v0, graph);
+ if(reverseEdgeExists) {
+ edgesTobeRemoved.push_back(e);
+ }
+ }
+
+ for(const typename Graph::edge_descriptor e: edgesTobeRemoved) {
+ boost::remove_edge(e, graph);
+ }
+
+}
+#endif
+
diff --git a/src/seqan.hpp b/src/seqan.hpp
index ffbb7d0..db56272 100644
--- a/src/seqan.hpp
+++ b/src/seqan.hpp
@@ -41,6 +41,20 @@ namespace shasta {
bool freeOnRight,
vector< pair<bool, bool> >& alignment);
+ // Same, banded.
+ template<class Iterator>
+ int64_t seqanAlign(
+ Iterator begin0, Iterator end0,
+ Iterator begin1, Iterator end1,
+ int64_t matchScore,
+ int64_t mismatchScore,
+ int64_t gapScore,
+ int64_t bandMin,
+ int64_t bandMax,
+ bool freeOnLeft,
+ bool freeOnRight,
+ vector< pair<bool, bool> >& alignment);
+
// Find out if the alignment computed by seqanAlign contains mismatches.
template<class Iterator>
bool containsMismatches(
@@ -164,6 +178,117 @@ template<class Iterator>
+// Same, banded.
+template<class Iterator>
+ int64_t shasta::seqanAlign(
+ Iterator begin0, Iterator end0,
+ Iterator begin1, Iterator end1,
+ int64_t matchScore,
+ int64_t mismatchScore,
+ int64_t gapScore,
+ int64_t bandMin,
+ int64_t bandMax,
+ bool freeOnLeft,
+ bool freeOnRight,
+ vector< pair<bool, bool> >& alignment)
+{
+ // SeqAn does not handle empty sequences.
+ SHASTA_ASSERT(begin0 != end0);
+ SHASTA_ASSERT(begin1 != end1);
+
+ // SeqAn types used below.
+ using namespace seqan;
+ using Int = typename Iterator::value_type;
+ using Sequence = String<Int>;
+ using StringSet = seqan::StringSet<Sequence>;
+ using DepStringSet = seqan::StringSet<Sequence, Dependent<> >;
+ using AlignGraph = Graph<seqan::Alignment<DepStringSet> >;
+
+ // Fill in the sequences, adding 100 to all values
+ // because SeqAn uses 45 to represent gaps.
+ Sequence sequence0;
+ for(Iterator it=begin0; it!=end0; ++it) {
+ appendValue(sequence0, *it + 100);
+ }
+ Sequence sequence1;
+ for(Iterator it=begin1; it!=end1; ++it) {
+ appendValue(sequence1, *it + 100);
+ }
+ // Store them in a SeqAn string set.
+ StringSet sequences;
+ appendValue(sequences, sequence0);
+ appendValue(sequences, sequence1);
+
+
+
+ // Compute the alignment.
+ // See https://docs.seqan.de/seqan/2.1.0/class_AlignConfig.html
+ // for meaning of AlignConfig.
+ AlignGraph graph(sequences);
+ int64_t alignmentScore = 0;
+ if(freeOnLeft) {
+ if(freeOnRight) {
+ // Free on both sides.
+ alignmentScore = globalAlignment(
+ graph,
+ Score<int64_t, seqan::Simple>(matchScore, mismatchScore, gapScore),
+ AlignConfig<true, true, true, true>(),
+ int32_t(bandMin), int32_t(bandMax),
+ LinearGaps());
+ } else {
+ // Free on left only.
+ alignmentScore = globalAlignment(
+ graph,
+ Score<int64_t, seqan::Simple>(matchScore, mismatchScore, gapScore),
+ AlignConfig<true, true, false, false>(),
+ int32_t(bandMin), int32_t(bandMax),
+ LinearGaps());
+ }
+ }else {
+ if(freeOnRight) {
+ // Free on right only.
+ alignmentScore = globalAlignment(
+ graph,
+ Score<int64_t, seqan::Simple>(matchScore, mismatchScore, gapScore),
+ AlignConfig<false, false, true, true>(),
+ int32_t(bandMin), int32_t(bandMax),
+ LinearGaps());
+ } else {
+ // Free on neither side.
+ alignmentScore = globalAlignment(
+ graph,
+ Score<int64_t, seqan::Simple>(matchScore, mismatchScore, gapScore),
+ AlignConfig<false, false, false, false>(),
+ int32_t(bandMin), int32_t(bandMax),
+ LinearGaps());
+ }
+ }
+
+
+
+ // Extract the alignment from the graph.
+ // This creates a single sequence consisting of the two rows
+ // of the alignment, concatenated.
+ Sequence align;
+ convertAlignment(graph, align);
+ const uint64_t totalAlignmentLength = seqan::length(align);
+ SHASTA_ASSERT((totalAlignmentLength % 2) == 0);
+ const uint64_t alignmentLength = totalAlignmentLength / 2;
+
+ // Fill in the bool pairs representing the alignment.
+ alignment.resize(alignmentLength);
+ for(uint64_t i=0; i<alignmentLength; i++) {
+ auto& p = alignment[i];
+ p.first = not (align[i] == 45);
+ p.second = not (align[i+alignmentLength] == 45);
+ }
+
+
+ return alignmentScore;
+}
+
+
+
// Find out if the alignment computed by seqanAlign contains mismatches.
template<class Iterator>
bool shasta::containsMismatches(
diff --git a/src/shastaTypes.hpp b/src/shastaTypes.hpp
index 53f1272..b837c18 100644
--- a/src/shastaTypes.hpp
+++ b/src/shastaTypes.hpp
@@ -5,7 +5,9 @@
namespace shasta {
- using KmerId = uint32_t;
+ using KmerId16 = uint32_t;
+ using KmerId32 = uint64_t;
+ using KmerId = KmerId32;
using ReadId = uint32_t;
using Strand = ReadId;
diff --git a/src/transitiveReduction.hpp b/src/transitiveReduction.hpp
index 0720920..5f5764d 100644
--- a/src/transitiveReduction.hpp
+++ b/src/transitiveReduction.hpp
@@ -11,25 +11,32 @@
// Standard library.
#include "iterator.hpp"
-using std::back_inserter;
+#include <map>
#include <queue>
+#include <set>
#include "vector.hpp"
namespace shasta {
+
+ // Version that requires the graph to use vecS.
template<class Graph> void transitiveReduction(Graph&);
+
+ // Less performant version without the above requirement.
+ template<class Graph> void transitiveReductionAny(Graph&);
}
// Transitive reduction of a directed graph without cycles.
// Class Graph must be a boost::adjacency_list with
-// the first three template arguments set to <listS, vecS, directedS>.
+// the first three template arguments set to <listS, vecS, directedS or bidirectionalS>.
// If the graph has cycles, this throws boost::not_a_dag.
template<class Graph> void shasta::transitiveReduction(Graph &graph)
{
using namespace boost;
using vertex_descriptor = typename Graph::vertex_descriptor;
using edge_descriptor = typename Graph::edge_descriptor;
+ using edge_iterator = typename Graph::edge_iterator;
// Check the Graph type.
// Use C++20 concepts instead.
@@ -42,16 +49,18 @@ template<class Graph> void shasta::transitiveReduction(Graph &graph)
"shasta::transitiveReduction requires an adjacency_list "
"with the second template argument set to boost::vecS.");
static_assert(
+ std::is_same<typename Graph::directed_selector, directedS>::value
+ or
std::is_same<typename Graph::directed_selector, bidirectionalS>::value,
"shasta::transitiveReduction requires an adjacency_list "
- "with the third template argument set to boost::bidirectionalS.");
+ "with the third template argument set to boost::directedS or boost::bidirectionalS.");
// Use boost topological_sort to get a vector of vertex descriptors
// in reverse toplogical order.
vector<vertex_descriptor> sortedVertices;
topological_sort(graph, back_inserter(sortedVertices));
- // Now construct a vector containg the rank of each vertex in topological order.
+ // Now construct a vector containing the rank of each vertex in topological order.
vector<uint64_t> vertexRank(num_vertices(graph));
uint64_t rank = num_vertices(graph);
for (const vertex_descriptor v : sortedVertices) {
@@ -60,9 +69,14 @@ template<class Graph> void shasta::transitiveReduction(Graph &graph)
// Find the edges that should be removed.
vector<edge_descriptor> edgesToBeRemoved;
- vector<bool> wasVisited(num_vertices(graph));
- BGL_FORALL_EDGES_T(e, graph, Graph)
- {
+ vector<bool> wasVisited(num_vertices(graph), false);
+ vector<vertex_descriptor> visitedVertices;
+ edge_iterator it, itEnd;
+ tie(it, itEnd) = edges(graph);
+ while(it != itEnd) {
+ edge_iterator itNext = it;
+ ++itNext;
+ const edge_descriptor e = *it;
const vertex_descriptor v0 = source(e, graph);
const vertex_descriptor v1 = target(e, graph);
@@ -80,11 +94,11 @@ template<class Graph> void shasta::transitiveReduction(Graph &graph)
// Initialize the BFS.
std::queue<vertex_descriptor> q;
q.push(v0);
- std::fill(wasVisited.begin(), wasVisited.end(), false);
wasVisited[v0] = true;
+ visitedVertices.push_back(v0);
// BFS loop.
- while (not q.empty()) {
+ while(not q.empty()) {
// Dequeue a vertex.
const vertex_descriptor vv0 = q.front();
@@ -114,24 +128,136 @@ template<class Graph> void shasta::transitiveReduction(Graph &graph)
if (vv1 == v1) {
// We reached v1. Edge e can be removed and we can stop the BFS.
- edgesToBeRemoved.push_back(e);
+ boost::remove_edge(e, graph);
q = { };
+ break;
} else {
// Continue the BFS.
wasVisited[vv1] = true;
+ visitedVertices.push_back(vv1);
q.push(vv1);
}
}
}
- }
- // Remove the edges.
- deduplicate(edgesToBeRemoved);
- for (const edge_descriptor e : edgesToBeRemoved) {
- remove_edge(e, graph);
+ // Prepare for the next iteration.
+ it = itNext;
+
+ // Clean up.
+ for(const vertex_descriptor v: visitedVertices) {
+ wasVisited[v] = false;
+ }
+ visitedVertices.clear();
}
+
}
+// Less performant version which works on any acyclic boost directed graph.
+template<class Graph> void shasta::transitiveReductionAny(Graph &graph)
+ {
+ using namespace boost;
+ using vertex_descriptor = typename Graph::vertex_descriptor;
+ using edge_descriptor = typename Graph::edge_descriptor;
+ using edge_iterator = typename Graph::edge_iterator;
+
+ // Map vertices to integers.
+ std::map<vertex_descriptor, uint64_t> vertexIndexMap;
+ uint64_t vertexIndex = 0;
+ BGL_FORALL_VERTICES_T(v, graph, Graph) {
+ vertexIndexMap.insert({v, vertexIndex++});
+ }
+
+ // Use boost topological_sort to get a vector of vertex descriptors
+ // in reverse topological order.
+ vector<vertex_descriptor> sortedVertices;
+ topological_sort(
+ graph,
+ back_inserter(sortedVertices),
+ boost::vertex_index_map(boost::make_assoc_property_map(vertexIndexMap)));
+
+ // Store the rank of each vertex in topological order.
+ std::map<vertex_descriptor, uint64_t> vertexRank;
+ uint64_t rank = num_vertices(graph);
+ for (const vertex_descriptor v : sortedVertices) {
+ vertexRank.insert({v, --rank});
+ }
+
+ // Find the edges that should be removed.
+ edge_iterator it, itEnd;
+ tie(it, itEnd) = edges(graph);
+ while(it != itEnd) {
+ edge_iterator itNext = it;
+ ++itNext;
+ const edge_descriptor e = *it;
+ const vertex_descriptor v0 = source(e, graph);
+ const vertex_descriptor v1 = target(e, graph);
+
+ // Edge e should be removed if there is a path
+ // from v0 to v1 that does not use edge e.
+
+ // Do a forward BFS starting at v0 and ending at v1 but:
+ // - Don't use edge e in the BFS.
+ // - Don't use any vertices that have topological order
+ // greater than the topological order of v1,
+ // because there can be no paths ending at v1
+ // that use these vertices.
+ // If the BFS encounters v1, edge e can be removed.
+
+ // Initialize the BFS.
+ std::queue<vertex_descriptor> q;
+ q.push(v0);
+ std::set<vertex_descriptor> visitedVertices;
+ visitedVertices.insert(v0);
+
+ // BFS loop.
+ while(not q.empty()) {
+
+ // Dequeue a vertex.
+ const vertex_descriptor vv0 = q.front();
+ q.pop();
+
+ // Loop over its out-edges.
+ BGL_FORALL_OUTEDGES_T(vv0, ee, graph, Graph)
+ {
+
+ // Don't use edge e in the BFS.
+ if (ee == e) {
+ continue;
+ }
+
+ // Get the other vertex in edge ee.
+ const vertex_descriptor vv1 = target(ee, graph);
+
+ // If vv1 was already visited in this BFS, skip it.
+ if(visitedVertices.contains(vv1)) {
+ continue;
+ }
+
+ // If vv1 follows v1 in topological order, skip it.
+ if (vertexRank[vv1] > vertexRank[v1]) {
+ continue;
+ }
+
+ if (vv1 == v1) {
+ // We reached v1. Edge e can be removed and we can stop the BFS.
+ boost::remove_edge(e, graph);
+ q = { };
+ break;
+ } else {
+ // Continue the BFS.
+ visitedVertices.insert(vv1);
+ q.push(vv1);
+ }
+ }
+ }
+
+ // Prepare for the next iteration.
+ it = itNext;
+
+ }
+
+}
+
#endif
diff --git a/srcMain/main.cpp b/srcMain/main.cpp
index d8968f6..cee1120 100644
--- a/srcMain/main.cpp
+++ b/srcMain/main.cpp
@@ -1,4 +1,3 @@
-// Main program for the Shasta static executable.
// The static executable provides
// basic functionality and reduced performance.
// For full functionality use the shared library built
@@ -220,6 +219,18 @@ void shasta::main::assemble(
"and is now required to run an assembly.");
}
+ // Check --Kmers.k.
+ // Using Kmer=ShortBaseSequence16 limits it to 16 bases.
+    // But alignment methods add 100 to KmerIds to deal
+ // with the Seqan gap value 45, so this means
+ // that we cannot use k=16.
+    // Therefore the maximum allowed value is 15 (NOTE(review): the check below allows up to 31 - confirm which of the two is stale).
+ // We also reject values that are grossly too low.
+ if(assemblerOptions.kmersOptions.k > 31 or assemblerOptions.kmersOptions.k < 6) {
+ throw runtime_error("Invalid value specified for --Kmers.k. "
+ "Must be between 6 and 31");
+ }
+
// Check that we have at least one input file.
if(assemblerOptions.commandLineOnlyOptions.inputFileNames.empty()) {
cout << assemblerOptions.allOptionsDescription << endl;
@@ -228,17 +239,19 @@ void shasta::main::assemble(
}
// Check assemblerOptions.minHashOptions.version.
- if( assemblerOptions.minHashOptions.version!=0 and
- assemblerOptions.minHashOptions.version!=1) {
+ if( assemblerOptions.minHashOptions.version!=0) {
throw runtime_error("Invalid value " +
to_string(assemblerOptions.minHashOptions.version) +
- " specified for --MinHash.version. Must be 0 or 1.");
+ " specified for --MinHash.version. Must be 0.");
}
// Check assemblerOptions.minHashOptions minimum/maximum bucket size.
- if( assemblerOptions.minHashOptions.maxBucketSize <=
- assemblerOptions.minHashOptions.minBucketSize) {
- throw runtime_error("MinHash maximum bucket size must be greater than minimum bucket size. "
+ const bool dynamicMinHashBucketRange =
+ (assemblerOptions.minHashOptions.minBucketSize == 0) and
+ (assemblerOptions.minHashOptions.maxBucketSize == 0);
+ if((not dynamicMinHashBucketRange) and (assemblerOptions.minHashOptions.maxBucketSize <=
+ assemblerOptions.minHashOptions.minBucketSize)) {
+ throw runtime_error("Invalid MinHash min/max bucket sizes specified. "
"The following values were specified:"
" minimum bucket size " +
to_string(assemblerOptions.minHashOptions.minBucketSize) +
@@ -258,9 +271,9 @@ void shasta::main::assemble(
if( assemblerOptions.alignOptions.alignMethod < 0 or
assemblerOptions.alignOptions.alignMethod == 2 or
- assemblerOptions.alignOptions.alignMethod > 4) {
+ assemblerOptions.alignOptions.alignMethod > 5) {
throw runtime_error("Align method " + to_string(assemblerOptions.alignOptions.alignMethod) +
- " is not valid. Valid options are 0, 1, 3, and 4.");
+ " is not valid. Valid options are 0, 1, 3, 4, and 5.");
}
if(assemblerOptions.readGraphOptions.creationMethod != 0 and
@@ -284,6 +297,10 @@ void shasta::main::assemble(
assemblerOptions.readGraphOptions.strandSeparationMethod != 2) {
throw runtime_error("--Assembly.mode 2 requires --ReadGraph.strandSeparationMethod 2.");
}
+ if(assemblerOptions.assemblyOptions.mode == 3 and
+ assemblerOptions.readGraphOptions.strandSeparationMethod != 2) {
+ throw runtime_error("--Assembly.mode 3 requires --ReadGraph.strandSeparationMethod 2.");
+ }
// Find absolute paths of the input files.
// We will use them below after changing directory to the output directory.
@@ -370,11 +387,12 @@ void shasta::main::assemble(
assemblerOptions.commandLineOnlyOptions.memoryMode != "filesystem") {
cout << "This run uses options \"--memoryBacking " << assemblerOptions.commandLineOnlyOptions.memoryBacking <<
" --memoryMode " << assemblerOptions.commandLineOnlyOptions.memoryMode << "\".\n"
- "This could result in performance degradation.\n"
- "For full performance, use \"--memoryBacking 2M --memoryMode filesystem\"\n"
+ "This could result in longer run time.\n"
+ "For faster assembly, use \"--memoryBacking 2M --memoryMode filesystem\"\n"
"(root privilege via sudo required).\n"
"Therefore the results of this run should not be used\n"
- "for benchmarking purposes." << endl;
+ "for the purpose of benchmarking assembly time.\n"
+ "However the memory options don't affect assembly results in any way." << endl;
}
// Create the Assembler.
@@ -391,11 +409,12 @@ void shasta::main::assemble(
assemblerOptions.commandLineOnlyOptions.memoryMode != "filesystem") {
cout << "This run used options \"--memoryBacking " << assemblerOptions.commandLineOnlyOptions.memoryBacking <<
" --memoryMode " << assemblerOptions.commandLineOnlyOptions.memoryMode << "\".\n"
- "This could have resulted in performance degradation.\n"
- "For full performance, use \"--memoryBacking 2M --memoryMode filesystem\"\n"
+ "This could result in longer run time.\n"
+ "For faster assembly, use \"--memoryBacking 2M --memoryMode filesystem\"\n"
"(root privilege via sudo required).\n"
"Therefore the results of this run should not be used\n"
- "for benchmarking purposes." << endl;
+ "for the purpose of benchmarking assembly time.\n"
+ "However the memory options don't affect assembly results in any way." << endl;
}
// Write out the build id again.
@@ -530,8 +549,10 @@ void shasta::main::assemble(
cout << "This assembly will use " << threadCount << " threads." << endl;
// Set up the consensus caller.
- cout << "Setting up consensus caller " <<
- assemblerOptions.assemblyOptions.consensusCaller << endl;
+ if(assembler.getReads().representation == 1) {
+ cout << "Setting up consensus caller " <<
+ assemblerOptions.assemblyOptions.consensusCaller << endl;
+ }
assembler.setupConsensusCaller(assemblerOptions.assemblyOptions.consensusCaller);
@@ -588,71 +609,25 @@ void shasta::main::assemble(
performanceLog << timestamp << "Done loading reads from " << inputFileNames.size() << " files." << endl;
performanceLog << "Read loading took " << seconds(t1-t0) << "s." << endl;
+ // Find duplicate reads and handle them according to the setting
+ // of --Reads.handleDuplicates.
+ assembler.findDuplicateReads(assemblerOptions.readsOptions.handleDuplicates);
-
- // Select the k-mers that will be used as markers.
- switch(assemblerOptions.kmersOptions.generationMethod) {
- case 0:
- assembler.randomlySelectKmers(
- assemblerOptions.kmersOptions.k,
- assemblerOptions.kmersOptions.probability, 231);
- break;
-
- case 1:
- // Randomly select the k-mers to be used as markers, but
- // excluding those that are globally overenriched in the input reads,
- // as measured by total frequency in all reads.
- assembler.selectKmersBasedOnFrequency(
- assemblerOptions.kmersOptions.k,
- assemblerOptions.kmersOptions.probability, 231,
- assemblerOptions.kmersOptions.enrichmentThreshold, threadCount);
- break;
-
- case 2:
- // Randomly select the k-mers to be used as markers, but
- // excluding those that are overenriched even in a single oriented read.
- assembler.selectKmers2(
- assemblerOptions.kmersOptions.k,
- assemblerOptions.kmersOptions.probability, 231,
- assemblerOptions.kmersOptions.enrichmentThreshold, threadCount);
- break;
-
- case 3:
- // Read the k-mers to be used as markers from a file.
- if(assemblerOptions.kmersOptions.file.empty() or
- assemblerOptions.kmersOptions.file[0] != '/') {
- throw runtime_error("Option --Kmers.file must specify an absolute path. "
- "A relative path is not accepted.");
- }
- assembler.readKmersFromFile(
- assemblerOptions.kmersOptions.k,
- assemblerOptions.kmersOptions.file);
- break;
-
- case 4:
- // Randomly select the k-mers to be used as markers, but
- // excluding those that appear in two copies close to each other
- // even in a single oriented read.
- assembler.selectKmers4(
- assemblerOptions.kmersOptions.k,
- assemblerOptions.kmersOptions.probability, 231,
- assemblerOptions.kmersOptions.distanceThreshold, threadCount);
- break;
-
- default:
- throw runtime_error("Invalid --Kmers generationMethod. "
- "Specify a value between 0 and 4, inclusive.");
- }
-
-
+ // Initialize the KmerChecker, which has the information needed
+ // to decide if a k-mer is a marker.
+ assembler.createKmerChecker(assemblerOptions.kmersOptions, threadCount);
// Find the markers in the reads.
assembler.findMarkers(0);
- if(!assemblerOptions.readsOptions.palindromicReads.skipFlagging) {
+ // Gather marker KmerIds for all markers.
+ // They are used by LowHash and alignment computation.
+ // These will be kept until we are done computing alignments.
+ assembler.computeMarkerKmerIds(threadCount);
- // Flag palindromic reads.
- // These will be excluded from further processing.
+ // Flag palindromic reads.
+ // These will be excluded from further processing.
+ if(!assemblerOptions.readsOptions.palindromicReads.skipFlagging) {
assembler.flagPalindromicReads(
assemblerOptions.readsOptions.palindromicReads.maxSkip,
assemblerOptions.readsOptions.palindromicReads.maxDrift,
@@ -663,12 +638,11 @@ void shasta::main::assemble(
threadCount);
}
-
-
// Find alignment candidates.
if(assemblerOptions.minHashOptions.allPairs) {
assembler.markAlignmentCandidatesAllPairs();
- } else if(assemblerOptions.minHashOptions.version == 0) {
+ } else {
+ SHASTA_ASSERT(assemblerOptions.minHashOptions.version == 0); // Already checked for that.
assembler.findAlignmentCandidatesLowHash0(
assemblerOptions.minHashOptions.m,
assemblerOptions.minHashOptions.hashFraction,
@@ -679,17 +653,6 @@ void shasta::main::assemble(
assemblerOptions.minHashOptions.maxBucketSize,
assemblerOptions.minHashOptions.minFrequency,
threadCount);
- } else {
- SHASTA_ASSERT(assemblerOptions.minHashOptions.version == 1); // Already checked for that.
- assembler.findAlignmentCandidatesLowHash1(
- assemblerOptions.minHashOptions.m,
- assemblerOptions.minHashOptions.hashFraction,
- assemblerOptions.minHashOptions.minHashIterationCount,
- 0,
- assemblerOptions.minHashOptions.minBucketSize,
- assemblerOptions.minHashOptions.maxBucketSize,
- assemblerOptions.minHashOptions.minFrequency,
- threadCount);
}
@@ -711,6 +674,9 @@ void shasta::main::assemble(
assemblerOptions.alignOptions,
threadCount);
+ // Marker KmerIds are freed here.
+ // They can always be recomputed from the reads when needed.
+ assembler.cleanupMarkerKmerIds();
// Create the read graph.
@@ -943,14 +909,6 @@ void shasta::main::mode0Assembly(
assemblerOptions.markerGraphOptions.highCoverageThreshold,
assemblerOptions.markerGraphOptions.maxDistance,
assemblerOptions.markerGraphOptions.edgeMarkerSkipThreshold);
- if(assemblerOptions.markerGraphOptions.reverseTransitiveReduction) {
- assembler.reverseTransitiveReduction(
- assemblerOptions.markerGraphOptions.lowCoverageThreshold,
- assemblerOptions.markerGraphOptions.highCoverageThreshold,
- assemblerOptions.markerGraphOptions.maxDistance);
- }
-
-
// Prune the marker graph.
assembler.pruneMarkerGraphStrongSubgraph(
@@ -1118,52 +1076,54 @@ void shasta::main::mode3Assembly(
const AssemblerOptions& assemblerOptions,
uint32_t threadCount)
{
+ // Mode 3 assembly requires reads in raw representation (not RLE).
+ SHASTA_ASSERT(assemblerOptions.readsOptions.representation == 0);
+
+ // The marker length must be even.
+ SHASTA_ASSERT((assembler.assemblerInfo->k %2) == 0);
+
// Create marker graph vertices.
+ // To create a complete marker graph, generate all vertices
+ // regardless of coverage, and allow duplicate markers on vertices.
assembler.createMarkerGraphVertices(
- assemblerOptions.markerGraphOptions.minCoverage,
- assemblerOptions.markerGraphOptions.maxCoverage,
- assemblerOptions.markerGraphOptions.minCoveragePerStrand,
- assemblerOptions.markerGraphOptions.allowDuplicateMarkers,
- assemblerOptions.markerGraphOptions.peakFinderMinAreaFraction,
- assemblerOptions.markerGraphOptions.peakFinderAreaStartIndex,
+ 1, // minVertexCoverage
+ std::numeric_limits<uint64_t>::max(), // maxVertexCoverage
+ 0, // minVertexCoveragePerStrand
+ true, // allowDuplicateMarkers
+ std::numeric_limits<double>::signaling_NaN(), // For peak finder, unused because minVertexCoverage is not 0.
+ invalid<uint64_t>, // For peak finder, unused because minVertexCoverage is not 0.
threadCount);
assembler.findMarkerGraphReverseComplementVertices(threadCount);
// Create marker graph edges.
- // For assembly mode 1 we use createMarkerGraphEdgesStrict
- // with minimum edge coverage (total and per strand).
+ // Use createMarkerGraphEdgesStrict so all oriented reads on an edge
+ // have exactly the same sequence.
+ // To create a complete marker graph, generate all edges
+ // regardless of coverage.
assembler.createMarkerGraphEdgesStrict(
- assemblerOptions.markerGraphOptions.minEdgeCoverage,
- assemblerOptions.markerGraphOptions.minEdgeCoveragePerStrand, threadCount);
+ 0, // minEdgeCoverage
+ 0, // minEdgeCoveragePerStrand
+ threadCount);
assembler.findMarkerGraphReverseComplementEdges(threadCount);
// Coverage histograms for vertices and edges of the marker graph.
assembler.computeMarkerGraphCoverageHistogram();
- // In mode 3 assembly, we don't add secondary edges.
-
- // Coverage histograms for vertices and edges of the marker graph.
- assembler.computeMarkerGraphCoverageHistogram();
-
- // Compute optimal repeat counts for each vertex of the marker graph.
- if(assemblerOptions.readsOptions.representation == 1) {
- assembler.assembleMarkerGraphVertices(threadCount);
- }
-
- // Compute consensus sequence for all marker graph edges.
- assembler.assembleMarkerGraphEdges(
- threadCount,
- assemblerOptions.assemblyOptions.markerGraphEdgeLengthThresholdForConsensus,
- assemblerOptions.assemblyOptions.storeCoverageData or
- assemblerOptions.assemblyOptions.storeCoverageDataCsvLengthThreshold>0,
- true
- );
-
- // Run mode 3 assembly.
- assembler.mode3Assembly(
+ // Assemble sequence for marker graph edges.
+ // This assembles MarkerGraph::edgeSequence which is
+ // different from what happens in other assembly modes.
+ // See the comments before MarkerGraph::edgeSequence
+ // for more information.
+ assembler.assembleMarkerGraphEdgesMode3();
+
+ // Flag primary marker graph edges.
+ assembler.flagPrimaryMarkerGraphEdges(
+ assemblerOptions.assemblyOptions.mode3Options.minPrimaryCoverage,
+ assemblerOptions.assemblyOptions.mode3Options.maxPrimaryCoverage,
threadCount);
-
+ // Run Mode 3 assembly.
+ assembler.mode3Assembly(threadCount, assemblerOptions.assemblyOptions.mode3Options, false);
}
@@ -1321,8 +1281,10 @@ void shasta::main::explore(
Assembler assembler("Data/", false, 1, 0);
// Set up the consensus caller.
- cout << "Setting up consensus caller " <<
- assemblerOptions.assemblyOptions.consensusCaller << endl;
+ if(assembler.getReads().representation == 1) {
+ cout << "Setting up consensus caller " <<
+ assemblerOptions.assemblyOptions.consensusCaller << endl;
+ }
assembler.setupConsensusCaller(assemblerOptions.assemblyOptions.consensusCaller);
// Access all available binary data.
diff --git a/staticExecutable/CMakeLists.txt b/staticExecutable/CMakeLists.txt
index 824eb8f..7926d84 100644
--- a/staticExecutable/CMakeLists.txt
+++ b/staticExecutable/CMakeLists.txt
@@ -5,7 +5,7 @@ project(shastaStaticExecutable)
add_definitions(-std=c++20)
# Compilation warnings.
-add_definitions(-Wall -Wconversion -Wno-unused-result)
+add_definitions(-Wall -Wconversion -Wno-unused-result -Wno-trigraphs -Wno-psabi)
# Optimization and debug options.
if(BUILD_DEBUG)
@@ -62,14 +62,14 @@ if(X86_64)
target_link_libraries(
shastaStaticExecutable
shastaStaticLibrary
- atomic boost_system boost_program_options boost_chrono spoa png z
+ atomic boost_system boost_program_options boost_chrono boost_serialization spoa png z
lapack blas gfortran quadmath
-Wl,--whole-archive -lpthread -Wl,--no-whole-archive)
else(X86_64)
target_link_libraries(
shastaStaticExecutable
shastaStaticLibrary
- atomic boost_system boost_program_options boost_chrono spoa png z
+ atomic boost_system boost_program_options boost_chrono boost_serialization spoa png z
lapack blas gfortran
-Wl,--whole-archive -lpthread -Wl,--no-whole-archive)
endif(X86_64)
diff --git a/staticLibrary/CMakeLists.txt b/staticLibrary/CMakeLists.txt
index 8e001eb..afa9548 100644
--- a/staticLibrary/CMakeLists.txt
+++ b/staticLibrary/CMakeLists.txt
@@ -5,7 +5,7 @@ project(shastaStaticLibrary)
add_definitions(-std=c++20)
# Compilation warnings.
-add_definitions(-Wall -Wconversion -Wno-unused-result -Wno-trigraphs)
+add_definitions(-Wall -Wconversion -Wno-unused-result -Wno-trigraphs -Wno-psabi)
# Optimization and debug options.
if(BUILD_DEBUG)