diff options
174 files changed, 24260 insertions, 15146 deletions
diff --git a/.github/workflows/Build.yml b/.github/workflows/Build.yml index 0506cc6..1014563 100644 --- a/.github/workflows/Build.yml +++ b/.github/workflows/Build.yml @@ -19,8 +19,8 @@ jobs: tar -cvf shasta-docs.tar --transform='s/docs/shastaDocs/' docs mkdir shasta-build cd shasta-build - # cmake .. -DBUILD_ID="Shasta unreleased test build newer than release 0.11.0 at commit "$GITHUB_SHA - cmake .. -DBUILD_ID="Shasta Release 0.11.1" + # cmake .. -DBUILD_ID="Shasta unreleased test build newer than release 0.11.1 at commit "$GITHUB_SHA + cmake .. -DBUILD_ID="Shasta Release 0.12.0" make -j 2 all make install/strip mv shasta-install shasta-Ubuntu-22.04 @@ -55,8 +55,8 @@ jobs: lsb_release -a mkdir shasta-build cd shasta-build - # cmake .. -DBUILD_DYNAMIC_EXECUTABLE=OFF -DBUILD_DYNAMIC_LIBRARY=OFF -DBUILD_ID="Shasta unreleased test build newer than release 0.11.0 at commit "$GITHUB_SHA - cmake .. -DBUILD_DYNAMIC_EXECUTABLE=OFF -DBUILD_DYNAMIC_LIBRARY=OFF -DBUILD_ID="Shasta Release 0.11.1 minimal build" + # cmake .. -DBUILD_DYNAMIC_EXECUTABLE=OFF -DBUILD_DYNAMIC_LIBRARY=OFF -DBUILD_ID="Shasta unreleased test build newer than release 0.11.1 at commit "$GITHUB_SHA + cmake .. -DBUILD_DYNAMIC_EXECUTABLE=OFF -DBUILD_DYNAMIC_LIBRARY=OFF -DBUILD_ID="Shasta Release 0.12.0 minimal build" make -j 2 all make install/strip mv shasta-install shasta-Ubuntu-22.04 @@ -1,6 +1,11 @@ # Shasta long read assembler De novo assembler for long reads, optimized for Oxford Nanopore (ONT) reads. 
+ +🆕 [Mode 3 assembly: presentation of assembly results](https://paoloshasta.github.io/shasta/Shasta-0.12.0.pdf) + +🆕 [Mode 3 assembly: usage notes](https://paoloshasta.github.io/shasta/Mode3-0.12.0.html) + ___ **Shasta development continues in this fork.** diff --git a/conf/Nanopore-ncm23-May2024.conf b/conf/Nanopore-ncm23-May2024.conf new file mode 100644 index 0000000..3569298 --- /dev/null +++ b/conf/Nanopore-ncm23-May2024.conf @@ -0,0 +1,44 @@ +# This assembly configuration is for nanopore reads generated using the +# "Experimental extremely high-accuracy, ultra-long sequencing kit" +# from the ONT December 2023 data release: +# https://labs.epi2me.io/gm24385_ncm23_preview/ + +# It uses Mode 3 assembly to create a phased assembly. +# It was only tested for a human genome at coverage 40x to 60x, +# but it should work at lower or higher coverage, +# within reasonable limits, because it includes some +# provisions for coverage adaptivity. + +[Reads] +representation = 0 +minReadLength = 10000 +noCache = True +palindromicReads.deltaThreshold = 300 + +[Kmers] +k = 30 +probability = 0.05 + +[MinHash] +minHashIterationCount = 50 +minBucketSize = 0 +maxBucketSize = 0 +minFrequency = 5 + +[Align] +alignMethod = 5 +sameChannelReadAlignment.suppressDeltaThreshold = 30 +minAlignedMarkerCount = 1000 +minAlignedFraction = 0.9 +maxSkip = 20 +maxDrift = 10 +maxTrim = 20 + +[ReadGraph] +maxAlignmentCount = 20 +strandSeparationMethod = 2 + +[Assembly] +mode = 3 + + diff --git a/debian/changelog b/debian/changelog index a17c104..223e637 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,18 @@ +shasta (0.12.0-1) unstable; urgency=medium + + * New upstream version 0.12.0 + * gcc-13.patch: delete: applied upstream. + * gcc-13-bis.patch: delete: applied upstream. + * d/control: declare compliance to standards version 4.7.0. 
+ + -- Étienne Mollier <emollier@debian.org> Sun, 26 May 2024 18:05:55 +0200 + +shasta (0.11.1-5) unstable; urgency=medium + + * d/rules: meddle with RPATH only after dh_shlibdeps. (Closes: #1069370) + + -- Étienne Mollier <emollier@debian.org> Wed, 08 May 2024 22:02:57 +0200 + shasta (0.11.1-4) unstable; urgency=medium * gcc-13-bis.patch: new: fix ftbfs with gcc 13.2.0. (Closes: #1059139) diff --git a/debian/control b/debian/control index fa214cb..e5779d8 100644 --- a/debian/control +++ b/debian/control @@ -24,7 +24,7 @@ Build-Depends: debhelper-compat (= 13), libblas-dev, liblapack-dev, gfortran -Standards-Version: 4.6.2 +Standards-Version: 4.7.0 Vcs-Browser: https://salsa.debian.org/med-team/shasta Vcs-Git: https://salsa.debian.org/med-team/shasta.git Homepage: https://github.com/chanzuckerberg/shasta diff --git a/debian/patches/gcc-13-bis.patch b/debian/patches/gcc-13-bis.patch deleted file mode 100644 index 30d8e9c..0000000 --- a/debian/patches/gcc-13-bis.patch +++ /dev/null @@ -1,37 +0,0 @@ -Description: fix a new wave of ftbfs with gcc-13. -Author: Étienne Mollier <emollier@debian.org> -Bug-Debian: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1059139 -Forwarded: https://github.com/paoloshasta/shasta/pull/20 -Last-Update: 2023-12-20 ---- -This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ ---- shasta.orig/src/CompactUndirectedGraph.hpp -+++ shasta/src/CompactUndirectedGraph.hpp -@@ -26,6 +26,7 @@ - #include <boost/graph/iteration_macros.hpp> - - // Standard library. -+#include "algorithm.hpp" - #include "array.hpp" - #include "iostream.hpp" - #include <limits> ---- shasta.orig/src/shortestPath.hpp -+++ shasta/src/shortestPath.hpp -@@ -32,6 +32,7 @@ - #include <boost/graph/iteration_macros.hpp> - - // Standard library. 
-+#include "algorithm.hpp" - #include "cstddef.hpp" - #include "cstdint.hpp" - #include <queue> ---- shasta.orig/src/mode3-PathGraph.cpp -+++ shasta/src/mode3-PathGraph.cpp -@@ -13,6 +13,7 @@ - #include <boost/icl/interval_set.hpp> - - // Standard library. -+#include <bitset> - #include "fstream.hpp" - #include "iostream.hpp" - #include <queue> diff --git a/debian/patches/gcc-13.patch b/debian/patches/gcc-13.patch deleted file mode 100644 index 89290b5..0000000 --- a/debian/patches/gcc-13.patch +++ /dev/null @@ -1,77 +0,0 @@ -Description: fix build failure with gcc 13. -Author: Étienne Mollier <emollier@debian.org> -Bug-Debian: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1042196 -Forwarded: https://github.com/paoloshasta/shasta/pull/15 -Last-Update: 2023-08-23 ---- -This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ ---- shasta.orig/src/Base.hpp -+++ shasta/src/Base.hpp -@@ -11,6 +11,7 @@ - #include "SHASTA_ASSERT.hpp" - - #include "array.hpp" -+#include "cstdint.hpp" - #include "iostream.hpp" - #include "stdexcept.hpp" - #include "string.hpp" ---- shasta.orig/src/PeakFinder.hpp -+++ shasta/src/PeakFinder.hpp -@@ -50,6 +50,7 @@ - ***********************************************************************************************************************/ - - -+#include "cstdint.hpp" - #include "stdexcept.hpp" - #include "iostream.hpp" - #include "utility.hpp" ---- shasta.orig/src/PngImage.hpp -+++ shasta/src/PngImage.hpp -@@ -2,6 +2,7 @@ - #define SHASTA_PNG_IMAGE_HPP - - #include <png.h> -+#include "cstdint.hpp" - #include "string.hpp" - #include "vector.hpp" - ---- shasta.orig/src/dset64-gccAtomic.hpp -+++ shasta/src/dset64-gccAtomic.hpp -@@ -1,6 +1,7 @@ - #if !defined(__DSET64_GCC_ATOMIC_HPP) - #define __DSET64_GCC_ATOMIC_HPP - -+#include <cstdint> - #include <stdexcept> - - /** ---- shasta.orig/src/platformDependent.hpp -+++ shasta/src/platformDependent.hpp -@@ -1,6 +1,7 @@ - #ifndef SHASTA_PLATFORM_DEPENDENT_HPP - #define 
SHASTA_PLATFORM_DEPENDENT_HPP - -+#include "cstdint.hpp" - #include "string.hpp" - - namespace shasta { ---- shasta.orig/src/shortestPath.hpp -+++ shasta/src/shortestPath.hpp -@@ -33,6 +33,7 @@ - - // Standard library. - #include "cstddef.hpp" -+#include "cstdint.hpp" - #include <queue> - #include "vector.hpp" - ---- shasta.orig/src/span.hpp -+++ shasta/src/span.hpp -@@ -2,6 +2,7 @@ - #define SHASTA_SPAN_HPP - - #include "algorithm.hpp" -+#include "cstdint.hpp" - #include "iostream.hpp" - #include "iterator.hpp" - #include <span> diff --git a/debian/patches/series b/debian/patches/series deleted file mode 100644 index 9073569..0000000 --- a/debian/patches/series +++ /dev/null @@ -1,2 +0,0 @@ -gcc-13.patch -gcc-13-bis.patch diff --git a/debian/rules b/debian/rules index 1dc623c..7734717 100755 --- a/debian/rules +++ b/debian/rules @@ -31,13 +31,6 @@ override_dh_install-arch: dh_install -a # Rename shastaDynamic to shasta for ease mv debian/shasta/usr/bin/shastaDynamic debian/shasta/usr/bin/shasta - # The library is in a more unusual place (nested within /usr/lib), so modify - # the ELF - chrpath -r /usr/lib/python3/dist-packages \ - debian/shasta/usr/bin/shasta - patchelf \ - --replace-needed shasta.so $(MULTIARCH_SONAME) \ - debian/shasta/usr/bin/shasta execute_after_dh_python3-arch: patchelf \ @@ -57,3 +50,12 @@ override_dh_missing: # Remove rest of files which have already been installed rm -rf debian/tmp/${CURDIR} dh_missing --list-missing + +execute_after_dh_shlibdeps: + # The library is in a more unusual place (nested within /usr/lib), so + # modify the ELF + chrpath -r /usr/lib/python3/dist-packages \ + debian/shasta/usr/bin/shasta + patchelf \ + --replace-needed shasta.so $(MULTIARCH_SONAME) \ + debian/shasta/usr/bin/shasta diff --git a/docs/CommandLineOptions.html b/docs/CommandLineOptions.html index 29afe05..7ce8524 100644 --- a/docs/CommandLineOptions.html +++ b/docs/CommandLineOptions.html @@ -219,6 +219,33 @@ Implemented for Linux only (uses the <a 
href='http://man7.org/linux/man-pages/ma Can help performance, but only use it if you know you will not need to access the input files again soon. + +<tr id='Reads.handleDuplicates'> +<td><code>--Reads.handleDuplicates</code><td class=centered><code>useOneCopy</code><td> +Specifies how to handle reads with duplicate names (the name of a read is its +id in an input fasta or fastq file). +These can occasionally occur, typically due to glitches in the basecalling or subsequent +pipelines before assembly starts. +Can be one of the following: +<ul> +<li><code>useAllCopies</code>: +All copies of reads with duplicate names are used in the assembly. +This can cause artifacts in some cases. +This was the Shasta behavior before this option was introduced. +<li><code>useOneCopy</code>: +For each set of reads with duplicate names, only one read is used in the assembly. +This is the default. +<li><code>useNone</code>: +None of the reads with duplicate names are used in the assembly. +<li><code>forbid</code>: +If any reads with duplicate names are found, the assembly stops. +</ul> +In all cases, a message is written with the number of reads with duplicate names +found, and the number of reads that were discarded for that reason. +A file <code>DuplicateReads.csv</code>, +listing details for all reads with duplicate names, +is also written to the assembly directory. + <tr id='Reads.palindromicReads.skipFlagging'> <td><code>--Reads.palindromicReads.skipFlagging</code><td class=centered><code>False</code><td> Skip flagging palindromic reads. Oxford Nanopore reads should be flagged for better results. @@ -263,21 +290,26 @@ Can be one of the following: <li>1: Random selection, excluding k-mers that are globally overenriched, as defined by their global frequency in input reads, and by the value specified as <code>--Kmers.enrichmentThreshold</code>. +Only supported when <code>--Kmers.k</code> is less than 16. 
<li>2: Random selection, excluding k-mers that are overenriched even in a single read, as defined by the value specified as <code>--Kmers.enrichmentThreshold</code>. +Only supported when <code>--Kmers.k</code> is less than 16. <li>3: Read from file. Use <code>--Kmers.file</code> to specify the file. +Only supported when <code>--Kmers.k</code> is less than 16. <li>4: Random selection, excluding k-mers that appear in two copies close to each other, even in a single read. The two k-mer copies are considered close if they occur at a distance from each other less than <code>--Kmers.distanceThreshold</code> RLE bases. +Only supported when <code>--Kmers.k</code> is less than 16. </ul> <tr id='Kmers.k'> <td><code>--Kmers.k</code><td class=centered><code>10</code><td> Length of marker <i>k</i>-mers (in run-length representation). +Can be up to 31 for Mode 0 assembly, 30 for Mode 2 assembly. <a class=qm href='ComputationalMethods.html#Markers'/> <tr id='Kmers.probability'> @@ -306,7 +338,7 @@ Only used if <code>--Kmers.generationMethod</code> is 3. <tr id='MinHash.version'> <td><code>--MinHash.version</code><td class=centered><code>0</code><td> The version of the MinHash/LowHash algorithm to be used. -Can be 0 (default) or 1 (experimental). +Must be 0 (default). <tr id='MinHash.m'> <td><code>--MinHash.m</code><td class=centered><code>4</code><td> @@ -335,11 +367,15 @@ If <code>--MinHash.minHashIterationCount</code> is not 0, this is not used. <tr id='MinHash.minBucketSize'> <td><code>--MinHash.minBucketSize</code><td class=centered><code>0</code><td> The minimum size for a bucket to be used by the MinHash/LowHash algoritm. +If minBucketSize and maxBucketSize are both 0, they are adjusted automatically +at each iteration using simple heuristics. 
<a class=qm href='ComputationalMethods.html#FindingOverlappingReads'/> <tr id='MinHash.maxBucketSize'> <td><code>--MinHash.maxBucketSize</code><td class=centered><code>10</code><td> The maximum size for a bucket to be used by the MinHash/LowHash algoritm. +If minBucketSize and maxBucketSize are both 0, they are adjusted automatically +at each iteration using simple heuristics. <a class=qm href='ComputationalMethods.html#FindingOverlappingReads'/> <tr id='MinHash.minFrequency'> @@ -364,7 +400,7 @@ The alignment method to be used to compute marker alignments between reads: <li>0 = Old Shasta alignment method. Use this to reproduce Shasta behavior before release 0.5.0. <li>1 = SeqAn. This gives the best alignment results but it is slow and should only be used for testing. <li>3 = Banded SeqAn. -<li>4 = New Shasta alignment method (experimental). +<li>4 and 5 = experimental. </ul> <a class=qm href='ComputationalMethods.html#OptimalAlignments'/> @@ -471,6 +507,14 @@ Only used for alignment method 4 (experimental). <td><code>--Align.align4.maxDistanceFromBoundary</code><td class=centered><code>100</code><td> Only used for alignment method 4 (experimental). +<tr id='Align.align5.driftRateTolerance'> +<td><code>--Align.align5.driftRateTolerance</code><td class=centered><code>0.02</code><td> +Maximum allowed drift rate for alignment method 5. + +<tr id='Align.align5.minBandExtend'> +<td><code>--Align.align5.minBandExtend</code><td class=centered><code>10</code><td> +Minimum band extension, in markers, for alignment method 5. + <tr id='ReadGraph.creationMethod'> <td><code>--ReadGraph.creationMethod</code><td class=centered><code>0</code><td> The method used to create the read graph (0 or 2). @@ -667,14 +711,6 @@ with average edge coverage less than this value are removed, together with the corresponding marker graph edges. A cross edge is defined as an edge v0->v1 with out-degree(v0)>1, in-degree(v1)>1. 
- -<tr id='MarkerGraph.reverseTransitiveReduction'> -<td><code>--MarkerGraph.reverseTransitiveReduction</code><td class=centered><code>False</code><td> -This is a -<a href="#BooleanSwitches">Boolean switch</a>. -If set, approximate reverse transitive reduction of the marker -graph in the reverse direction is also performed. - <tr id='MarkerGraph.peakFinder.minAreaFraction'> <td><code>--MarkerGraph.peakFinder.minAreaFraction</code><td class=centered><code>0.08</code><td> Used in the automatic selection of @@ -927,8 +963,181 @@ This is a If set, output of the haploid representation of the assembly is suppressed. Mode 2 assembly only. <a class=qm href='ComputationalMethods.html#Mode2Assembly'/> +<tr id='Assembly.mode3.minPrimaryCoverage'> +<td><code>--Assembly.mode3.minPrimaryCoverage</code><td class=centered><code>0</code><td> +Minimum primary coverage. +If <code>minPrimaryCoverage</code> and <code>maxPrimaryCoverage</code> are both 0, +they are set automatically to appropriate values using a simple heuristic. +Only used with <code>--Assembly.mode 3</code>. + +<tr id='Assembly.mode3.maxPrimaryCoverage'> +<td><code>--Assembly.mode3.maxPrimaryCoverage</code><td class=centered><code>0</code><td> +Maximum primary coverage. +If <code>minPrimaryCoverage</code> and <code>maxPrimaryCoverage</code> are both 0, +they are set automatically to appropriate values using a simple heuristic. +Only used with <code>--Assembly.mode 3</code>. + +<tr id='Assembly.mode3.primaryGraph.maxLoss'> +<td><code>--Assembly.mode3.primaryGraph.maxLoss</code> +<td class=centered><code>0.1</code><td> +Used for weak edge removal in the primary graph. +Mode 3 assembly only. + +<tr id='Assembly.mode3.primaryGraph.crossEdgesLowCoverageThreshold'> +<td><code>--Assembly.mode3.primaryGraph.crossEdgesLowCoverageThreshold</code> +<td class=centered><code>1</code><td> +Low coverage threshold for cross edge removal in the primary graph. +Mode 3 assembly only. 
+ +<tr id='Assembly.mode3.primaryGraph.crossEdgesHighCoverageThreshold'> +<td><code>--Assembly.mode3.primaryGraph.crossEdgesHighCoverageThreshold</code> +<td class=centered><code>3</code><td> +High coverage threshold for cross edge removal in the primary graph. +Mode 3 assembly only. + +<tr id='Assembly.mode3.assemblyGraph.detangleToleranceLow'> +<td><code>--Assembly.mode3.assemblyGraph.detangleToleranceLow</code> +<td class=centered><code>0</code><td> +Used for detangling of the assembly graph. +Mode 3 assembly only. + +<tr id='Assembly.mode3.assemblyGraph.detangleToleranceHigh'> +<td><code>--Assembly.mode3.assemblyGraph.detangleToleranceHigh</code> +<td class=centered><code>2</code><td> +Used for detangling of the assembly graph. +Mode 3 assembly only. + +<tr id='Assembly.mode3.assemblyGraph.epsilon'> +<td><code>--Assembly.mode3.assemblyGraph.epsilon</code> +<td class=centered><code>0.1</code><td> +ε value for the Bayesian model used for detangling the assembly graph. +Mode 3 assembly only. + +<tr id='Assembly.mode3.assemblyGraph.minLogP'> +<td><code>--Assembly.mode3.assemblyGraph.minLogP</code> +<td class=centered><code>20</code><td> +<code>MinLogP</code> value (in dB) for the Bayesian model used for detangling the assembly graph. +Mode 3 assembly only. + +<tr id='Assembly.mode3.assemblyGraph.longBubbleThreshold'> +<td><code>--Assembly.mode3.assemblyGraph.longBubbleThreshold</code> +<td class=centered><code>5000</code><td> +Long bubble threshold. +Mode 3 assembly only. + +<tr id='Assembly.mode3.assemblyGraph.phaseErrorThreshold'> +<td><code>--Assembly.mode3.assemblyGraph.phaseErrorThreshold</code> +<td class=centered><code>0.1</code><td> +Phase error threshold for phasing. +Mode 3 assembly only. + +<tr id='Assembly.mode3.assemblyGraph.bubbleErrorThreshold'> +<td><code>--Assembly.mode3.assemblyGraph.bubbleErrorThreshold</code> +<td class=centered><code>0.03</code><td> +Bubble error threshold for bubble cleanup. +Mode 3 assembly only. 
+ +<tr id='Assembly.mode3.assemblyGraph.bubbleCleanupMaxOffset'> +<td><code>--Assembly.mode3.assemblyGraph.bubbleCleanupMaxOffset</code> +<td class=centered><code>1000</code><td> +Maximum bubble offset for bubble cleanup. +Mode 3 assembly only. + +<tr id='Assembly.mode3.assemblyGraph.chainTerminalCommonThreshold'> +<td><code>--Assembly.mode3.assemblyGraph.chainTerminalCommonThreshold</code> +<td class=centered><code>3</code><td> +Used for bubble cleanup. +Mode 3 assembly only. + +<tr id='Assembly.mode3.assemblyGraph.superbubbleLengthThreshold1'> +<td><code>--Assembly.mode3.assemblyGraph.superbubbleLengthThreshold1</code> +<td class=centered><code>30000</code><td> +Length threshold used for superbubble cleanup. +Mode 3 assembly only. + +<tr id='Assembly.mode3.assemblyGraph.superbubbleLengthThreshold2'> +<td><code>--Assembly.mode3.assemblyGraph.superbubbleLengthThreshold2</code> +<td class=centered><code>10000</code><td> +Low length threshold used for superbubble removal. +Mode 3 assembly only. + +<tr id='Assembly.mode3.assemblyGraph.superbubbleLengthThreshold3'> +<td><code>--Assembly.mode3.assemblyGraph.superbubbleLengthThreshold3</code> +<td class=centered><code>30000</code><td> +High length threshold used for superbubble removal. +Mode 3 assembly only. + +<tr id='Assembly.mode3.assemblyGraph.superbubbleLengthThreshold4'> +<td><code>--Assembly.mode3.assemblyGraph.superbubbleLengthThreshold4</code> +<td class=centered><code>30000</code><td> +Length threshold used for superbubble detangling. +Mode 3 assembly only. + +<tr id='Assembly.mode3.localAssembly.estimatedOffsetRatio'> +<td><code>--Assembly.mode3.localAssembly.estimatedOffsetRatio</code> +<td class=centered><code>1.1</code><td> +For local assembly, the estimated offset between the left and right gets +extended by this ratio to decide how much to extend reads that appear on one side only. +Mode 3 assembly only. 
+ +<tr id='Assembly.mode3.localAssembly.vertexSamplingRate'> +<td><code>--Assembly.mode3.localAssembly.vertexSamplingRate</code> +<td class=centered><code>0.8</code><td> +Vertex sampling rate for local assembly, used to set minVertexCoverage. +Mode 3 assembly only. + +<tr id='Assembly.mode3.localAssembly.matchScore'> +<td><code>--Assembly.mode3.localAssembly.matchScore</code> +<td class=centered><code>6</code><td> +Match score for alignment computation in local assembly. +Mode 3 assembly only. + +<tr id='Assembly.mode3.localAssembly.mismatchScore'> +<td><code>--Assembly.mode3.localAssembly.mismatchScore</code> +<td class=centered><code>-1</code><td> +Mismatch score for alignment computation in local assembly. +Mode 3 assembly only. + +<tr id='Assembly.mode3.localAssembly.gapScore'> +<td><code>--Assembly.mode3.localAssembly.gapScore</code> +<td class=centered><code>-1</code><td> +Gap score for alignment computation in local assembly. +Mode 3 assembly only. + +<tr id='Assembly.mode3.localAssembly.maxSkipBases'> +<td><code>--Assembly.mode3.localAssembly.maxSkipBases</code> +<td class=centered><code>500</code><td> +Number of bases (not markers) that can be skipped by an alignment in local assembly. +Mode 3 assembly only. + +<tr id='Assembly.mode3.localAssembly.maxDrift'> +<td><code>--Assembly.mode3.localAssembly.maxDrift</code> +<td class=centered><code>0.005</code><td> +The maximum tolerated length drift of each read. +Used to compute the band for banded alignments in local assembly. +Mode 3 assembly only. + +<tr id='Assembly.mode3.localAssembly.minHalfBand'> +<td><code>--Assembly.mode3.localAssembly.minHalfBand</code> +<td class=centered><code>100</code><td> +Minimum half band, in markers, for alignment computations in local assembly. +Mode 3 assembly only. 
+ +<tr id='Assembly.mode3.localAssembly.minScoreRatio'> +<td><code>--Assembly.mode3.localAssembly.minScoreRatio</code> +<td class=centered><code>0.7</code><td> +Score threshold for discarding alignments in local assembly. +Mode 3 assembly only. + +<tr id='Assembly.mode3.localAssembly.maxMsaLength'> +<td><code>--Assembly.mode3.localAssembly.maxMsaLength</code> +<td class=centered><code>5000</code><td> +Maximum allowed length of a multiple sequence alignment computation for local assembly. +Mode 3 assembly only. + </table> + <div class="goto-index"><a href="index.html">Table of contents</a></div> </main> diff --git a/docs/Configurations.html b/docs/Configurations.html index 0571ab9..7b4f6f2 100644 --- a/docs/Configurations.html +++ b/docs/Configurations.html @@ -92,7 +92,7 @@ use Shasta command <code>listConfigurations</code> as follows: shasta --command listConfigurations </pre> -At the time of writing (May 2022), this outputs the following +At the time of writing (May 2024), this outputs the following list of built-in configurations: <pre> @@ -120,6 +120,7 @@ Nanopore-R10-Fast-Nov2022 Nanopore-R10-Slow-Nov2022 Nanopore-Phased-R10-Fast-Nov2022 Nanopore-Phased-R10-Slow-Nov2022 +Nanopore-ncm23-May2024 </pre> <p> @@ -156,11 +157,21 @@ under the following conditions: <td class=centered><code>Nanopore-R10-Fast-Nov2022</code> <td class=centered><code>Nanopore-Phased-R10-Fast-Nov2022</code> -<tr><th>R10, slow mode<th>Standard<th>Human genome with two flowcells +<tr><th>R10, slow mode<br>(no longer in use)<th>Standard<th>Human genome with two flowcells (about 45x) <td class=centered><code>Nanopore-R10-Slow-Nov2022</code> <td class=centered><code>Nanopore-Phased-R10-Slow-Nov2022</code> +<tr> +<th><a href='https://labs.epi2me.io/gm24385_ncm23_preview/'> +ONT December 2023 Data release</a><br> +(<i>"Experimental extremely high-accuracy, ultra-long +sequencing kit"</i>) +<th>Ultra-Long (UL) +<th>Tested at 40x to 60x but may be functional outside this range +<td> +<td 
class=centered><code>Nanopore-ncm23-May2024</code> + </table> diff --git a/docs/MakeRelease.html b/docs/MakeRelease.html index 5432e61..1381293 100644 --- a/docs/MakeRelease.html +++ b/docs/MakeRelease.html @@ -38,7 +38,7 @@ Wait for that build to complete. <li>Download the 3 artifacts. Unzip them and rename them to the following: <ul> <li><code>shasta-Linux-X.Y.Z</code> -<li><code>shasta-Ubuntu-20.04-X.Y.Z.tar</code> +<li><code>shasta-Ubuntu-22.04-X.Y.Z.tar</code> <li><code>shasta-docs-X.Y.Z.tar</code> </ul> <li>Make sure <code>shasta-Linux-X.Y.Z</code> is executable. @@ -51,7 +51,7 @@ cd shastaBuild cmake ../shasta -DBUILD_ID="Shasta Release X.Y.Z for 64 bit ARM" make install/strip -j </code></li> -<li>Download the <code>aarch64</code> Shasta binary (using <code>scp</code>) and +<li>Download the <code>aarch64</code> Shasta binary and rename it to <code>shasta-Linux-ARM-X.Y.Z</code>, then make sure it is executable. </li> </ul> diff --git a/docs/Mode3-0.12.0.html b/docs/Mode3-0.12.0.html new file mode 100644 index 0000000..67c4b33 --- /dev/null +++ b/docs/Mode3-0.12.0.html @@ -0,0 +1,132 @@ +<!DOCTYPE html> +<html> + +<head> +<link rel=stylesheet href=style.css /> +</head> + +<body> +<main> +<div class="goto-index"><a href="index.html">Table of contents</a></div> + +<h1>Shasta Mode 3 assembly</h1> +<h2>Summary</h2> +<ul> + +<li>Uses new computational techniques to extract phased sequence from the marker graph. + +<li>Preliminary version released with Shasta 0.12.0, despite known issues, to encourage experimentation. +Please share your experiences by filing +<a href='https://github.com/paoloshasta/shasta/issues'>issues on the Shasta GitHub repository</a>. + +<li>Initially only supported for the new high accuracy Oxford Nanopore reads from the +<a href='https://labs.epi2me.io/gm24385_ncm23_preview/'>2023.12 data release</a>. +It is possible that additional future releases will also support ONT R10 reads. 
+ +<li>Despite the known issues, it produces useful phased assemblies. +See <a href='Shasta-0.12.0.pdf'>this presentation</a> for an analysis of assembly results. + +<li>Released with minimal usage documentation (this page). +A description of computational techniques is not yet available. + +<li>Invoke using <code>--config Nanopore-ncm23-May2024</code>. +This assembly configuration was only tested on human genomes +at coverage 40x to 60x, but may be functional at higher or lower coverage, +within reasonable limits. +It includes limited adaptivity to coverage. + +</ul> + + + +<h2>Output files</h2> + +<p> +Shasta uses <a href='https://github.com/GFA-spec/GFA-spec'>GFA</a> terminology. +A contiguous piece of assembled sequence is a <i>Segment</i>. +<i>Links</i> define adjacency between segments. + +<table> +<tr> +<td><code>Assembly.gfa</code> +<td>The assembly graph in GFA 1.0 format. +All link records include a Cigar string defining an exact overlap of a small +but variable number of bases between adjacent segments. + +<tr> +<td><code>Assembly-NoSequence.gfa</code> +<td>Identical to <code>Assembly.gfa</code>, but does not contain any sequence. +Faster to download, manipulate, and visualize in +<a href='https://github.com/asl/BandageNG'>Bandage</a>. + +<tr> +<td><code>Assembly.fasta</code> +<td>The sequences of all assembled segments, in FASTA format. + +<tr> +<td><code>Assembly.csv</code> +<td>Contains one line of information for each assembled segment. +It can be loaded in Bandage and also provides custom coloring of segments. +</table> + + + +<h2>Naming of assembled segments</h2> +<p> +Assembled segments are organized in bubble chains. +A bubble chain is a linear sequence of bubbles of any ploidy +without any incoming/outgoing connections to/from +the middle of the bubble chain. +Some of the bubbles have ploidy 1 (haploid) and usually correspond +to low heterozygosity regions where haplotypes could not be separated. 
+ +<p> +Assembled segment names are of the form <code>a-b-c-d-Pn</code>, +where: +<ul> +<li><code>a-b</code> identifies the bubble chain. +<li><code>c</code> is the position of the bubble in the bubble chain. +<li><code>d</code> identifies the haplotype in the bubble. +<li><code>n</code> is the ploidy of the bubble. +</ul> +For example, the figure below illustrates segment naming for bubble chain +<code>1-341</code>. Segment lengths are not to scale. +This bubble chain consists of 7 bubbles, numbered from 0 to 6. +Bubbles 0, 2, 4, and 6 are haploid. +Bubbles 1, 3, and 5 are diploid. + +Segment <code>1-341-3-1-P2</code> is haplotype <code>1</code> of the diploid +bubble at position <code>3</code> in bubble chain <code>1-341</code>. + +<img src='Mode3Chain.png'> + +<p> +The assembly will contain trivial bubble chains consisting of a single haploid bubble, +that is, a single assembled segment. +These segments have similar naming, but <code>c</code>, <code>d</code>, and <code>n</code> are always +<code>0</code>. For example, <code>1-136-0-0-P0</code>. + +<p> +If <code>Assembly.csv</code> is loaded in Bandage, segments are displayed +with custom colors as follows: +<ul> +<li>Segments of haploid bubbles of non-trivial bubble chains (names ending with <code>-P1</code>): red. +<li>Segments of diploid bubbles of non-trivial bubble chains (names ending with <code>-P2</code>): green. +<li>Segments of higher ploidy bubbles of non-trivial bubble chains +(names ending with <code>-Pn</code>) with <code>n > 2</code> : yellow. +<li>Segments of trivial bubble chains consisting of a single haploid bubble +(names ending with <code>-P0</code>): +<ul> +<li>If isolated (two free ends): blue. +<li>If dangling (one free end): cyan. +<li>All others: purple. 
+</ul> +</ul> + + +<p> +<div class="goto-index"><a href="index.html">Table of contents</a></div> +</main> +</body> +</html> + diff --git a/docs/Mode3Chain.png b/docs/Mode3Chain.png Binary files differnew file mode 100644 index 0000000..65c1282 --- /dev/null +++ b/docs/Mode3Chain.png diff --git a/docs/QuickStart.html b/docs/QuickStart.html index e11992f..e2eb2db 100644 --- a/docs/QuickStart.html +++ b/docs/QuickStart.html @@ -12,7 +12,7 @@ <h1>Quick start</h1> Note that the Shasta executable has no dependencies and requires no installation -or set up. This means that you can use it immediately afterdownloading it and setting its execute permission. +or set up. This means that you can use it immediately after downloading it and setting its execute permission. See below for more information. @@ -21,26 +21,16 @@ See below for more information. You can use the following commands to download the executable from the latest release and run an assembly: <pre> # Download the executable for the latest release. -curl -O -L https://github.com/chanzuckerberg/shasta/releases/download/0.10.0/shasta-Linux-0.10.0 +curl -O -L https://github.com/paoloshasta/shasta/releases/download/0.11.1/shasta-Linux-0.11.1 # Grant execute permissions. -chmod ugo+x shasta-Linux-0.10.0 +chmod ugo+x shasta-Linux-0.11.1 # Run an assembly. -./shasta-Linux-0.10.0 --input input.fasta --config Nanopore-May2022 +./shasta-Linux-0.11.1 --input input.fasta --config Nanopore-May2022 </pre> <p> -<b>The above is valid for releases up to 0.10.0. Newer releases will appear in the <code>paoloshasta/shasta</code> -repository instead, so the download command would be:</b> - -<pre> -curl -O -L https://github.com/paoloshasta/shasta/releases/download/x.y.z/shasta-Linux-x.y.z -</pre> - -(Replace <code>x.y.z</code> with the identifier for the release you want to use). - -<p> You can specify multiple input FASTA files, if necessary. On a typical laptop, this will run in minutes for a bacterial genome. 
For a human size assembly, AWS instance type <code>x1.32xlarge</code> diff --git a/docs/Running.html b/docs/Running.html index 4d967bc..cbd9eef 100644 --- a/docs/Running.html +++ b/docs/Running.html @@ -72,6 +72,10 @@ including the output of this script would be helpful. <h2 id=MemoryRequirements>Memory requirements</h2> +<p><b><i> +Note that in this section "performance" refers to assembly time only. +</i></b> + <p> For best performance, the Shasta assembler uses a single large machine rather than a cluster of smaller machines, @@ -113,6 +117,11 @@ a compute cost of around $20 per genome. <h2 id=LowMemory>Running with less than optimal memory</h2> +<p><b><i> +Note that in this section "performance" refers to assembly time only. +The memory options discussed here don't affect assembly results in any way. +</i></b> + <p> Shasta also supports a mode of operation with data structures physically on disk @@ -277,8 +286,11 @@ there is one core for every virtual processor. <h2 id=MemoryModes>Memory modes</h2> -<p> -<i>(This section does not apply to macOS).</i> +<p><b><i> +Note that in this section "performance" refers to assembly time only. +The memory options described here don't affect assembly results in any way. +</i></b> + <p> For performance, the Shasta executable operates in memory, diff --git a/docs/Shasta-0.12.0.pdf b/docs/Shasta-0.12.0.pdf Binary files differnew file mode 100644 index 0000000..bfe92c1 --- /dev/null +++ b/docs/Shasta-0.12.0.pdf diff --git a/docs/SupportedPlatforms.html b/docs/SupportedPlatforms.html index 4986182..53226b1 100644 --- a/docs/SupportedPlatforms.html +++ b/docs/SupportedPlatforms.html @@ -20,16 +20,7 @@ platforms: <ul> <li> Most current 64-bit Linux distributions for the -<code>x86_64</code> architecture, including the following -on which it was actually tested: -<ul> -<li>Ubuntu 16.04 LTS -<li>Ubuntu 18.04 LTS -<li>Ubuntu 20.04 LTS -</ul> - -<li> -macOS, using the macOS specific version of the Shasta executable. 
+<code>x86_64</code> architecture. <li> Windows, using the Linux version of the Shasta executable and @@ -45,7 +36,7 @@ See <a href=Running.html>here</a> for more information. <h2>Extended functionality</h2> <p> Extended Shasta functionality (http server, Python API) -is only available on Ubuntu 16.04, Ubuntu 18.04 and Ubuntu 20.04 LTS. +is only available on Ubuntu 22.04 LTS. Porting to other Linux platforms is possible. diff --git a/docs/index.html b/docs/index.html index 7d89e77..bca4f8f 100644 --- a/docs/index.html +++ b/docs/index.html @@ -11,6 +11,12 @@ <h2>Shasta Documentation</h2> <p> +🆕 <a href=Shasta-0.12.0.pdf>Mode 3 assembly: presentation of assembly results</a></li> + +<p> +🆕 <a href=Mode3-0.12.0.html>Mode 3 assembly: usage notes</a></li> + +<p> If you are looking at this documentation on GitHub Pages (<code>https://paoloshasta.github.io/shasta/</code>), this documentation applies to the latest Shasta code on GitHub @@ -42,6 +48,7 @@ in the <code>docs</code> directory. <li><a href=Running.html#ScriptedApproaches>Scripting with Python</a></li> <li><a href=Running.html#Errors>Dealing with errors</a></li> </ul> +<li><a href=Mode3-0.12.0.html>Mode 3 assembly: usage notes</a></li> <li><a href=InspectingResults.html>Exploring assembly results</a></li> <li><a href=Performance.html>Maximizing assembly performance</a></li> <li><a href=Commands.html>Shasta commands</a></li> diff --git a/dynamicExecutable/CMakeLists.txt b/dynamicExecutable/CMakeLists.txt index 1afcbce..6a015c1 100644 --- a/dynamicExecutable/CMakeLists.txt +++ b/dynamicExecutable/CMakeLists.txt @@ -5,7 +5,7 @@ project(shastaDynamicExecutable) add_definitions(-std=c++20) # Compilation warnings. -add_definitions(-Wall -Wconversion -Wno-unused-result) +add_definitions(-Wall -Wconversion -Wno-unused-result -Wno-trigraphs -Wno-psabi) # Optimization and debug options. 
if(BUILD_DEBUG) @@ -67,13 +67,13 @@ if(X86_64) target_link_libraries( shastaDynamicExecutable shastaDynamicLibrary - atomic boost_system boost_program_options boost_chrono spoa png z + atomic boost_system boost_program_options boost_chrono boost_serialization spoa png z lapack blas gfortran quadmath pthread) else(X86_64) target_link_libraries( shastaDynamicExecutable shastaDynamicLibrary - atomic boost_system boost_program_options boost_chrono spoa png z + atomic boost_system boost_program_options boost_chrono boost_serialization spoa png z lapack blas gfortran pthread) endif(X86_64) diff --git a/dynamicLibrary/CMakeLists.txt b/dynamicLibrary/CMakeLists.txt index 60d7269..e378e07 100644 --- a/dynamicLibrary/CMakeLists.txt +++ b/dynamicLibrary/CMakeLists.txt @@ -6,7 +6,7 @@ project(shastaDynamicLibrary) add_definitions(-std=c++20) # Compilation warnings. -add_definitions(-Wall -Wconversion -Wno-unused-result -Wno-trigraphs) +add_definitions(-Wall -Wconversion -Wno-unused-result -Wno-trigraphs -Wno-psabi) # Optimization and debug options. if(BUILD_DEBUG) @@ -77,7 +77,7 @@ SET(CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} ${SHASTA_PYTHON_LIBRARIES}") # Libraries to link with. target_link_libraries( shastaDynamicLibrary - atomic png boost_program_options pthread z spoa lapack blas ${SHASTA_PYTHON_LIBRARIES}) + atomic png boost_program_options boost_serialization pthread z spoa lapack blas ${SHASTA_PYTHON_LIBRARIES}) # Install the shared library into the bin directory. 
install(TARGETS shastaDynamicLibrary DESTINATION shasta-install/bin) diff --git a/scripts/AlignPseudoPaths.py b/scripts/AlignPseudoPaths.py deleted file mode 100755 index 8a62712..0000000 --- a/scripts/AlignPseudoPaths.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/python3 - -import shasta - -import argparse - -parser = argparse.ArgumentParser() -parser.add_argument('readId0', type=int) -parser.add_argument('strand0', type=int, choices=range(2)) -parser.add_argument('readId1', type=int) -parser.add_argument('strand1', type=int, choices=range(2)) -arguments = parser.parse_args() - - -a = shasta.Assembler() -a.accessMarkers() -a.accessReadGraph() -a.accessMarkerGraphVertices() -a.accessMarkerGraphEdges() -a.accessAssemblyGraphVertices() -a.accessAssemblyGraphEdges() -a.accessAssemblyGraphEdgeLists() -a.alignPseudoPaths(arguments.readId0, arguments.strand0, arguments.readId1, arguments.strand1) - - - - - - - diff --git a/scripts/AnalyzeMode3Subgraph.py b/scripts/AnalyzeMode3Subgraph.py deleted file mode 100755 index 704470f..0000000 --- a/scripts/AnalyzeMode3Subgraph.py +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/python3 - -import shasta - -segmentIds = [int(token) for token in input('Enter segment ids on one line: ').split()] - -a = shasta.Assembler() -a.accessMode3AssemblyGraph() -a.analyzeMode3Subgraph(segmentIds) - - diff --git a/scripts/ComputeAlignments.py b/scripts/ComputeAlignments.py index 887a195..8f83ae6 100755 --- a/scripts/ComputeAlignments.py +++ b/scripts/ComputeAlignments.py @@ -37,7 +37,10 @@ alignOptions.align4MinEntryCountPerCell = int(config['Align']['align4.minEntryCo alignOptions.align4MaxDistanceFromBoundary = int(config['Align']['align4.maxDistanceFromBoundary']) # Do the computation. 
+shasta.openPerformanceLog('ComputeAlignments.log') +a.computeMarkerKmerIds(0); a.computeAlignments(alignOptions, 0) +a.cleanupMarkerKmerIds(); diff --git a/scripts/CreateConfigurationTable.py b/scripts/CreateConfigurationTable.py index a25d73c..f5eb144 100755 --- a/scripts/CreateConfigurationTable.py +++ b/scripts/CreateConfigurationTable.py @@ -51,6 +51,7 @@ configurations = [ 'Nanopore-R10-Slow-Nov2022', 'Nanopore-Phased-R10-Fast-Nov2022', 'Nanopore-Phased-R10-Slow-Nov2022', + 'Nanopore-ncm23-May2024', ] diff --git a/scripts/CreateMode3Detangler.py b/scripts/CreateMode3Detangler.py deleted file mode 100755 index d558763..0000000 --- a/scripts/CreateMode3Detangler.py +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/python3 - -import shasta - -a = shasta.Assembler() -a.accessMarkers() -a.accessMarkerGraphEdges() -a.accessMode3AssemblyGraph() - -path = a.createMode3Detangler() - diff --git a/scripts/CreateMode3PathGraph.py b/scripts/CreateMode3PathGraph.py deleted file mode 100755 index 06a6452..0000000 --- a/scripts/CreateMode3PathGraph.py +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/python3 - -import shasta - -a = shasta.Assembler() -a.accessMarkers() -a.accessMarkerGraphEdges() -a.accessMode3AssemblyGraph() - -path = a.createMode3PathGraph() - - diff --git a/scripts/FindAlignmentCandidatesLowHash1.py b/scripts/FindAlignmentCandidatesLowHash1.py deleted file mode 100755 index 87afa06..0000000 --- a/scripts/FindAlignmentCandidatesLowHash1.py +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/python3 - -import shasta -import GetConfig -import sys - -helpMessage=""" -Invoke without arguments. -""" - -# Check that there are no arguments. -if not len(sys.argv)==1: - print(helpMessage) - exit(1) - -# Read the config file. -config = GetConfig.getConfig() - -# Initialize the assembler and access what we need. -a = shasta.Assembler() -a.accessKmers() -a.accessMarkers() - -# Do the computation. 
-a.findAlignmentCandidatesLowHash1( - m = int(config['MinHash']['m']), - hashFraction = float(config['MinHash']['hashFraction']), - minHashIterationCount = int(config['MinHash']['minHashIterationCount']), - minBucketSize = int(config['MinHash']['minBucketSize']), - maxBucketSize = int(config['MinHash']['maxBucketSize']), - minFrequency = int(config['MinHash']['minFrequency'])) -# a.writeAlignmentCandidates() - - diff --git a/scripts/FlagPrimaryMarkerGraphEdges.py b/scripts/FlagPrimaryMarkerGraphEdges.py new file mode 100644 index 0000000..b9c9054 --- /dev/null +++ b/scripts/FlagPrimaryMarkerGraphEdges.py @@ -0,0 +1,18 @@ +#!/usr/bin/python3 + +import shasta +import GetConfig + +# Read the config file. +config = GetConfig.getConfig() + +a = shasta.Assembler() +a.accessMarkers() +a.accessMarkerGraphVertices() +a.accessMarkerGraphEdges(True) +a.accessDisjointSetsHistogram() +a.flagPrimaryMarkerGraphEdges( + int(config['Assembly']['mode3.minPrimaryCoverage']), + int(config['Assembly']['mode3.maxPrimaryCoverage']), + 0) + diff --git a/scripts/GenerateRandomHaplotypes.py b/scripts/GenerateRandomHaplotypes.py new file mode 100755 index 0000000..a29160b --- /dev/null +++ b/scripts/GenerateRandomHaplotypes.py @@ -0,0 +1,103 @@ +#!/usr/bin/python3 + +helpMessage = """ +Generate "random" haplotypes for all bubble chains +of a Shasta phased assembly. + +Each bubble chain generates two haplotypes obtained +by concatenating UR and PR contigs in Assembly-Phased.fasta +in the appropriate order. + +Because UR contigs are not phased relative to each other, +this will generate switch errors. + +Run this while in the assembly directory. +This uses as input PhasingRegions.csv and Assembly-Phased.fasta. +It generates output files Assembly-Random-Haplotype0.fasta +and Assembly-Random-Haplotype1.fasta + +This script has no dependencies other than python3 +and can invoked directly without any installation required. + +""" + +# Import what we need. 
+import argparse +import csv + +# Make sure we have a --help option. +parser = argparse.ArgumentParser(description=helpMessage) +parser.parse_args() + +# Read the bubble chains file. +csvFile = open('PhasingRegions.csv', 'r') +reader = csv.DictReader(csvFile) + +bubbleChains = {} +for row in reader: + bubbleChainId = int(row['Bubble chain id']) + if not bubbleChainId in bubbleChains: + bubbleChains[bubbleChainId] = [] + bubbleChains[bubbleChainId].append(row) +if not bubbleChains: + print("No bubble chains were found." + "Run this script from a Shasta phased assembly directory.") + + +# Read the Assembly-Phased.fasta file. +# Shasta writes each contig in a header line plus +# a single line containing sequence. +inputFastaFile = open('Assembly-Phased.fasta', 'r') +inputContigs = {} +while True: + header = inputFastaFile.readline() + if not header: + break; + if not header[0] == ">": + raise RuntimeError("Invalid FASTA header: " + header) + name = header[1:].split(" ")[0] + sequence = inputFastaFile.readline().rstrip("\n") + + # We only want to keep it if the name begins with "UR," or "PR.". + if len(name) < 3: + continue; + prefix = name[0:3] + if not (prefix == "UR." or prefix == "PR."): + continue; + + inputContigs[name] = sequence + + +# Open the output files, one for each haplotype. +outputFileNames = [("Assembly-Random-Haplotype%i.fasta" % haplotypeId) for haplotypeId in range(2)] +outputFiles = [open(outputFileName, "w") for outputFileName in outputFileNames] + + +# Loop over bubble chains. +for bubbleChainId, bubbleChain in bubbleChains.items(): + print("Working on bubble chain %i of %i" % (bubbleChainId, len(bubbleChains))) + + # Check the bubble chain id. + for x in bubbleChain: + assert not x["Bubble chain id"] == bubbleChainId + + # Generate the two haplotypes for this bubble chain. 
+ for haplotypeId in range(2): + sequence = "" + for position in range(len(bubbleChain)): + row = bubbleChain[position] + if (row["Phased"]) == "No": + name = "UR.%i.%i" % (bubbleChainId, position) + else: + component = int(row["Component"]) + name = "PR.%i.%i.%i.%i" % (bubbleChainId, position, component, haplotypeId) + assert name in inputContigs + sequence += inputContigs[name] + print("Bubble chain %i random haplotype %i has length %i" % (bubbleChainId, haplotypeId, len(sequence))) + outputFiles[haplotypeId].write(">BC.%i.%i %i\n%s\n" % (bubbleChainId, haplotypeId, len(sequence), sequence)) + +print("Generation of random haplotypes is complete.") +print("These haplotypes can contain a switch error at each phased region.") +print("Output is in %s and %s" % (outputFileNames[0], outputFileNames[1])) + + diff --git a/scripts/InstallPrerequisites-Ubuntu.sh b/scripts/InstallPrerequisites-Ubuntu.sh index b50f7da..a1e59f7 100755 --- a/scripts/InstallPrerequisites-Ubuntu.sh +++ b/scripts/InstallPrerequisites-Ubuntu.sh @@ -99,12 +99,11 @@ tar -xvf 4.0.8.tar.gz # To avoid these additional dependencies, we turn off the dispatcher feature for now. # We could turn it back on if we see significant performance degradation in this area. spoaBuildFlags="-Dspoa_generate_dispatch=ON" -if [[ "$isArm" == true ]]; then - spoaBuildFlags="-Dspoa_generate_dispatch=OFF -Dspoa_optimize_for_portability=OFF -Dspoa_optimize_for_native=OFF" -fi # Per the above comment, turn off the dispatcher feature for now. spoaBuildFlags="-DCMAKE_BUILD_TYPE=Release -Dspoa_optimize_for_portability=ON" - +if [[ "$isArm" == true ]]; then + spoaBuildFlags="-DCMAKE_BUILD_TYPE=Release -Dspoa_build_tests=OFF" +fi # Build the shared library. 
diff --git a/scripts/Mode3AssembleComponent.py b/scripts/Mode3AssembleComponent.py new file mode 100644 index 0000000..6ee8336 --- /dev/null +++ b/scripts/Mode3AssembleComponent.py @@ -0,0 +1,39 @@ +#!/usr/bin/python3 + +import shasta +import argparse + +parser = argparse.ArgumentParser(description= + 'Load a mode3::AssemblyGraph representing a connected component of the primary graph and assemble it.') + +parser.add_argument('component', type=int, + help='The connected component to assemble.') + +parser.add_argument( + "--no-assemble-sequence", + dest="dontAssembleSequence", + action="store_true", +) + +parser.add_argument( + "--debug", + dest="debug", + action="store_true", +) + +arguments = parser.parse_args() + + + +options = shasta.AssemblerOptions('shasta.conf') +a = shasta.Assembler() +a.accessMarkers() +a.accessMarkerGraphVertices() +a.accessMarkerGraphEdges() +a.accessMarkerGraphReverseComplementEdge() +a.accessMarkerGraphConsensus() +shasta.openPerformanceLog('Mode3AssembleComponent.log') +fileName = 'AssemblyGraph-' + str(arguments.component) + '.data' +a.mode3AssembleComponent(fileName, 0, + options.assemblyOptions.mode3Options, not arguments.dontAssembleSequence, arguments.debug) + diff --git a/scripts/Mode3Assembly.py b/scripts/Mode3Assembly.py index e126b69..8b07425 100755..100644 --- a/scripts/Mode3Assembly.py +++ b/scripts/Mode3Assembly.py @@ -1,31 +1,45 @@ #!/usr/bin/python3 -""" - -This run the final portion of Mode 3 assembly. -It assumes that the marker graph has already been created. - -""" - -import ast import shasta +import argparse import GetConfig +# Read the config file. config = GetConfig.getConfig() -shasta.openPerformanceLog('Mode3Assembly.log') +# Parse the command line arguments. 
+parser = argparse.ArgumentParser(description= + 'Run Mode 3 assembly starting from the marker graph.') + +parser.add_argument( + "--debug", + dest="debug", + action="store_true", +) + +arguments = parser.parse_args() + + +# Create the Assembler object and access what we need. +options = shasta.AssemblerOptions('shasta.conf') a = shasta.Assembler() -a.setupConsensusCaller(config['Assembly']['consensusCaller']) a.accessMarkers() a.accessMarkerGraphVertices() -a.accessMarkerGraphReverseComplementVertex() -a.accessMarkerGraphEdges() +a.accessMarkerGraphEdges(True) a.accessMarkerGraphReverseComplementEdge() a.accessMarkerGraphConsensus() +a.accessDisjointSetsHistogram() -a.mode3Assembly() - - +# Open a performance log. +shasta.openPerformanceLog('Mode3Assembly.log') +# Flag primary marker graph edges. +a.flagPrimaryMarkerGraphEdges( + int(config['Assembly']['mode3.minPrimaryCoverage']), + int(config['Assembly']['mode3.maxPrimaryCoverage']), + 0) +# Run Mode 3 assembly. +a.mode3Assembly(0, options.assemblyOptions.mode3Options, arguments.debug) + diff --git a/scripts/RandomlySelectKmers.py b/scripts/RandomlySelectKmers.py deleted file mode 100755 index c788cd3..0000000 --- a/scripts/RandomlySelectKmers.py +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/python3 - -import shasta -import GetConfig - -# Read the config file. -config = GetConfig.getConfig() - -# Initialize the assembler and access what we need. -a = shasta.Assembler() - -# Generate the k-mers and write them out. -a.randomlySelectKmers( - k = int(config['Kmers']['k']), - probability = float(config['Kmers']['probability'])) -a.writeKmers() diff --git a/scripts/ReverseTransitiveReduction.py b/scripts/ReverseTransitiveReduction.py deleted file mode 100755 index 4fad068..0000000 --- a/scripts/ReverseTransitiveReduction.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/python3 - -import shasta -import GetConfig - -# Read the config file. 
-config = GetConfig.getConfig() - - -# Initialize the assembler and access what we need. -a = shasta.Assembler() -a.accessMarkerGraphVertices() -a.accessMarkerGraphEdges(accessEdgesReadWrite=True) -a.accessMarkerGraphReverseComplementEdge() -a.reverseTransitiveReduction( - lowCoverageThreshold = int(config['MarkerGraph']['lowCoverageThreshold']), - highCoverageThreshold = int(config['MarkerGraph']['highCoverageThreshold']), - maxDistance = int(config['MarkerGraph']['maxDistance']) - ) - - diff --git a/scripts/SelectKmers2.py b/scripts/SelectKmers2.py deleted file mode 100755 index 08a43e3..0000000 --- a/scripts/SelectKmers2.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/python3 - -import shasta -import GetConfig - -# Read the config file. -config = GetConfig.getConfig() - -# Initialize the assembler and access what we need. -a = shasta.Assembler() - -# Generate the k-mers and write them out. -a.selectKmers2( - k = int(config['Kmers']['k']), - markerDensity = float(config['Kmers']['probability']), - enrichmentThreshold = float(config['Kmers']['enrichmentThreshold'])) - diff --git a/scripts/SelectKmers4.py b/scripts/SelectKmers4.py deleted file mode 100755 index 817095c..0000000 --- a/scripts/SelectKmers4.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/python3 - -import shasta - -# Read the config file. -import GetConfig -config = GetConfig.getConfig() - -# Create the assembler. -a = shasta.Assembler() - -# select k-mers. -a.selectKmers4( - k = int(config['Kmers']['k']), - markerDensity = float(config['Kmers']['probability']), - distanceThreshold = int(config['Kmers']['distanceThreshold'])) - diff --git a/scripts/SelectKmersBasedOnFrequency.py b/scripts/SelectKmersBasedOnFrequency.py deleted file mode 100755 index 70e152c..0000000 --- a/scripts/SelectKmersBasedOnFrequency.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/python3 - -import shasta -import GetConfig - -# Read the config file. 
-config = GetConfig.getConfig() - -# Initialize the assembler and access what we need. -a = shasta.Assembler() - -# Generate the k-mers and write them out. -a.selectKmersBasedOnFrequency( - k = int(config['Kmers']['k']), - markerDensity = float(config['Kmers']['probability']), - enrichmentThreshold = float(config['Kmers']['enrichmentThreshold'])) - diff --git a/scripts/VertexCoverageStatisticsByKmerId.py b/scripts/VertexCoverageStatisticsByKmerId.py deleted file mode 100755 index 70e04f2..0000000 --- a/scripts/VertexCoverageStatisticsByKmerId.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/python3 - -import shasta -import GetConfig - -config = GetConfig.getConfig() - -# To get meaningful results from this, use -# the following options when running the assembly, -# to make sure all vertices are generated: -# --MarkerGraph.allowDuplicateMarkers -# --MarkerGraph.minCoverage 1 -# --MarkerGraph.minCoverage 1000000000 - - -a = shasta.Assembler() -a.accessKmers() -a.accessMarkers() -a.accessMarkerGraphVertices() - -a.vertexCoverageStatisticsByKmerId() - - diff --git a/scripts/WriteAlignmentDetails.py b/scripts/WriteAlignmentDetails.py deleted file mode 100755 index 435a55b..0000000 --- a/scripts/WriteAlignmentDetails.py +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/python3 - -import shasta -import argparse - - -parser = argparse.ArgumentParser(description= - 'Write CSVs with details for each alignment') - -arguments = parser.parse_args() - -a = shasta.Assembler() -a.accessMarkers() -a.accessAlignmentCandidates() -a.accessCompressedAlignments() -a.accessAlignmentData() -a.writeAlignmentDetails() - - diff --git a/scripts/WriteMarkersFrequency.py b/scripts/WriteMarkersFrequency.py deleted file mode 100755 index 8a2ff26..0000000 --- a/scripts/WriteMarkersFrequency.py +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/python3 - -import shasta - -a = shasta.Assembler() -a.accessMarkers() -a.writeMarkerFrequency() diff --git a/scripts/testGlobalMsa.py b/scripts/testGlobalMsa.py new file mode 
100644 index 0000000..6504a39 --- /dev/null +++ b/scripts/testGlobalMsa.py @@ -0,0 +1,33 @@ +#!/usr/bin/python3 + +import shasta + +sequences = [ + ("AGGTCCGACAGCGCGCCCAGATCCAGCCACGCCACGGTCCGCCTCTCCCGCCGCCCTGGCCTGTCCTTAGCCCCAGGC", 1), + ("AGGTCCGACAGCGCGCCCATACTCCAGCCACGCCACCGGTCCGCCTCTCCCGCCGCCCTGGCCTGTCCTTAGCCCCAGGC", 1), + ("AGGTCCGACAGCGCGCCCAGATCCAGCCACGCCACCGGTCCGCCTCTCCCGCCCTGGCCTGTCCTTAGCCCCAGGC", 1), + ("AGGTCCGACAGCGCGCCCAGATCCAGCCACGCCACCGGTCCGCCTCTCCCGCCGCCCTGGCCTGTCCTTAGCCCCAGGC", 5), + ("AGGTCCGACAGCGCCCAGATCCAGCCACGCCACCGGTCCGCCTCTCCCGCCGCCCTGGCCTGTCCTTAGCCCCAGGC", 1), + ("AGGTCCGACAGCGCGCCAGATCCAGCCACGCCACCGGTCCGCCTCTCCCGCCGCCCTGGCCTGTCCTTAGCCCCAGGC", 1), + ("AGGTCCGACAGCGCGCCCAGATCCAGCCACGCCACCGGTCCGCCTCTTCCCGCCGCCCTGGCCTGTCCTTAGCCCCAGGC", 2), + ("AGGTCCGACAGCGCGCCCAGATCCAGCCACGCCACCGGTCCGCCGCTCGCCCGCCGCCCTGGCCTGTCCTTAGCCCCAGGC", 1), + ] + +expectedConsensus = "AGGTCCGACAGCGCGCCCAGATCCAGCCACGCCACCGGTCCGCCTCTCCCGCCGCCCTGGCCTGTCCTTAGCCCCAGGC" + + +consensus = shasta.globalMsaPython(sequences, 30, 14) +print(consensus) + +pureSpoaConsensus = shasta.globalMsaPython(sequences, 1000000000, 14) + +if consensus == expectedConsensus: + print("Consensus agrees with expected consensus.") +else: + print("Consensus DOES NOT AGREE with expected consensus.") + +if consensus == pureSpoaConsensus: + print("Consensus agrees with pure spoa consensus.") +else: + print("Consensus DOES NOT AGREE with pure spoa consensus.") + diff --git a/src/Align4.cpp b/src/Align4.cpp index 48339d6..b16b6c7 100644 --- a/src/Align4.cpp +++ b/src/Align4.cpp @@ -28,15 +28,15 @@ using namespace Align4; void shasta::Align4::align( - const array<CompressedMarkers, 2>& compressedMarkers, - const array<span< const pair<KmerId, uint32_t> >, 2> sortedMarkers, + const array< span<KmerId>, 2>& kmerIds, + const array<span< pair<KmerId, uint32_t> >, 2> sortedMarkers, const Options& options, MemoryMapped::ByteAllocator& byteAllocator, Alignment& alignment, AlignmentInfo& alignmentInfo, bool debug) { 
- Align4::Aligner graph(compressedMarkers, sortedMarkers, + Align4::Aligner graph(kmerIds, sortedMarkers, options, byteAllocator, alignment, alignmentInfo, debug); } @@ -44,15 +44,15 @@ void shasta::Align4::align( Aligner::Aligner( - const array<CompressedMarkers, 2>& compressedMarkers, - const array<span< const pair<KmerId, uint32_t> >, 2> sortedMarkers, + const array<span<KmerId>, 2>& kmerIds, + const array<span< pair<KmerId, uint32_t> >, 2> sortedMarkers, const Options& options, MemoryMapped::ByteAllocator& byteAllocator, Alignment& alignment, AlignmentInfo& alignmentInfo, bool debug) : - nx(uint32_t(compressedMarkers[0].size())), - ny(uint32_t(compressedMarkers[1].size())), + nx(uint32_t(kmerIds[0].size())), + ny(uint32_t(kmerIds[1].size())), deltaX(int32_t(options.deltaX)), deltaY(int32_t(options.deltaY)), byteAllocator(byteAllocator) @@ -109,7 +109,7 @@ Aligner::Aligner( } vector< pair<Alignment, AlignmentInfo> > alignments; computeBandedAlignments( - compressedMarkers, + kmerIds, options.minAlignedMarkerCount, options.minAlignedFraction, options.maxSkip, @@ -192,7 +192,7 @@ SignedCoordinates Aligner::getxy(Coordinates XY) const -void Aligner::createAlignmentMatrix(const array<span< const pair<KmerId, uint32_t> >, 2> sortedMarkers) +void Aligner::createAlignmentMatrix(const array<span< pair<KmerId, uint32_t> >, 2> sortedMarkers) { alignmentMatrix.clear(); @@ -873,7 +873,7 @@ void Aligner::findActiveCellsConnectedComponents() // active cells. Return the ones that match requirements on // minAlignedMarkerCount, minAlignedFraction, maxSkip, maxDrift, maxTrim. void Aligner::computeBandedAlignments( - const array<CompressedMarkers, 2>& compressedMarkers, + const array<span<KmerId>, 2>& kmerIds, uint64_t minAlignedMarkerCount, double minAlignedFraction, uint64_t maxSkip, @@ -936,7 +936,7 @@ void Aligner::computeBandedAlignments( // Compute an alignment with this band. 
Alignment alignment; AlignmentInfo alignmentInfo; - computeBandedAlignment(compressedMarkers, bandMin, bandMax, + computeBandedAlignment(kmerIds, bandMin, bandMax, alignment, alignmentInfo, debug); // Skip it, if it does not satisfy the requirements on @@ -991,7 +991,7 @@ void Aligner::computeBandedAlignments( // Compute a banded alignment with a given band. bool Aligner::computeBandedAlignment( - const array<CompressedMarkers, 2>& compressedMarkers, + const array<span<KmerId>, 2>& kmerIds, int32_t bandMin, int32_t bandMax, Alignment& alignment, @@ -1014,8 +1014,8 @@ bool Aligner::computeBandedAlignment( // Add 100 to kMerIds to prevent collision from the seqan gap value. array<TSequence, 2> sequences; for(uint64_t i=0; i<2; i++) { - for(const CompressedMarker& marker: compressedMarkers[i]) { - appendValue(sequences[i], marker.kmerId + 100); + for(const KmerId& kmerId: kmerIds[i]) { + appendValue(sequences[i], kmerId + 100); } } @@ -1056,7 +1056,7 @@ bool Aligner::computeBandedAlignment( i<alignmentLength and ordinal0<nx and ordinal1<ny; i++) { if( align[i] != seqanGapValue and align[i + alignmentLength] != seqanGapValue and - compressedMarkers[0][ordinal0].kmerId == compressedMarkers[1][ordinal1].kmerId) { + kmerIds[0][ordinal0] == kmerIds[1][ordinal1]) { alignment.ordinals.push_back(array<uint32_t, 2>{ordinal0, ordinal1}); } if(align[i] != seqanGapValue) { diff --git a/src/Align4.hpp b/src/Align4.hpp index 892385a..1ba94dc 100644 --- a/src/Align4.hpp +++ b/src/Align4.hpp @@ -80,14 +80,12 @@ namespace shasta { // we can end up with negative values. using SignedCoordinates = pair<uint32_t, uint32_t>; - // The markers of an oriented read. - using CompressedMarkers = span<const CompressedMarker>; - - // Compute the alginment. - // The sorted markers are pairs(KmerId, ordinal) sorted by KmnerId. + // Compute the alignment. + // The KmerIds are the KmerIds for the two reads, in position order. + // The sorted markers are pairs(KmerId, ordinal) sorted by KmerId. 
void align( - const array<CompressedMarkers, 2>&, - const array<span< const pair<KmerId, uint32_t> >, 2> sortedMarkers, + const array< span<KmerId>, 2>& kmerIds, + const array<span<pair<KmerId, uint32_t> >, 2> sortedMarkers, const Align4::Options&, MemoryMapped::ByteAllocator&, Alignment&, @@ -135,10 +133,11 @@ class shasta::Align4::Aligner { public: // The constructor does all the work. - // The sorted markers are pairs(KmerId, ordinal) sorted by KmnerId. + // The kmerIds are in position orders. + // The sorted markers are pairs(KmerId, ordinal) sorted by KmerId. Aligner( - const array<CompressedMarkers, 2>& compressedMarkers, - const array<span< const pair<KmerId, uint32_t> >, 2> sortedMarkers, + const array< span<KmerId>, 2>& kmerIds, + const array<span< pair<KmerId, uint32_t> >, 2> sortedMarkers, const Options&, MemoryMapped::ByteAllocator&, Alignment&, @@ -174,7 +173,7 @@ private: using AlignmentMatrixEntryVector = vector<AlignmentMatrixEntry, AlignmentMatrixAllocator>; // For one iY using AlignmentMatrix = vector<AlignmentMatrixEntryVector>; // Indexed by iY. AlignmentMatrix alignmentMatrix; - void createAlignmentMatrix(const array<span< const pair<KmerId, uint32_t> >, 2> sortedMarkers); + void createAlignmentMatrix(const array<span< pair<KmerId, uint32_t> >, 2> sortedMarkers); void writeAlignmentMatrixCsv(const string& fileName) const; void writeAlignmentMatrixPng( const string& fileName, @@ -280,7 +279,7 @@ private: // active cells. Return the ones that match requirements on // minAlignedMarkerCount, minAlignedFraction, maxSkip, maxDrift, maxTrim. void computeBandedAlignments( - const array<CompressedMarkers, 2>& compressedMarkers, + const array<span<KmerId>, 2>& kmerIds, uint64_t minAlignedMarkerCount, double minAlignedFraction, uint64_t maxSkip, @@ -292,7 +291,7 @@ private: // Compute a banded alignment with a given band. 
bool computeBandedAlignment( - const array<CompressedMarkers, 2>& compressedMarkers, + const array<span<KmerId>, 2>& kmerIds, int32_t bandMin, int32_t bandMax, Alignment&, diff --git a/src/Alignment.hpp b/src/Alignment.hpp index 6915ae6..13d72f5 100644 --- a/src/Alignment.hpp +++ b/src/Alignment.hpp @@ -193,6 +193,10 @@ public: // Flag that is set if this alignment is used in the read graph. uint8_t isInReadGraph : 1; + // Uniqueness metric (alignment method 5 only). + // See Assembler::alignOrientedReads5. + float uniquenessMetric = std::numeric_limits<float>::signaling_NaN(); + void clearFlags() { isInReadGraph = 0; diff --git a/src/Assembler.cpp b/src/Assembler.cpp index 56f2b27..073e03e 100644 --- a/src/Assembler.cpp +++ b/src/Assembler.cpp @@ -1,7 +1,10 @@ #include "Assembler.hpp" +#include "AssemblerOptions.hpp" #include "buildId.hpp" #include "Coverage.hpp" +#include "KmerCheckerFactory.hpp" #include "MedianConsensusCaller.hpp" +#include "MurmurHash2.hpp" #include "Reads.hpp" #include "SimpleConsensusCaller.hpp" #include "SimpleBayesianConsensusCaller.hpp" @@ -13,14 +16,13 @@ template class MultithreadedObject<Assembler>; // Constructor to be called one to create a new run. Assembler::Assembler( - const string& largeDataFileNamePrefix, + const string& largeDataFileNamePrefixArgument, bool createNew, uint64_t readRepresentation, // 0 = raw sequence, 1 = RLE sequence. Only used if createNew. 
size_t largeDataPageSizeArgument) : - - MultithreadedObject(*this), - largeDataFileNamePrefix(largeDataFileNamePrefix) + MultithreadedObject(*this) { + largeDataFileNamePrefix = largeDataFileNamePrefixArgument; if(createNew) { @@ -171,3 +173,62 @@ void Assembler::storePeakMemoryUsage(uint64_t peakMemoryUsage) { assemblerInfo->peakMemoryUsage = peakMemoryUsage; } + + +void Assembler::createKmerChecker( + const KmersOptions& kmersOptions, + uint64_t threadCount) +{ + if(threadCount == 0) { + threadCount = std::thread::hardware_concurrency(); + } + + assemblerInfo->k = kmersOptions.k; + assemblerInfo->kmerGenerationMethod = kmersOptions.generationMethod; + + kmerChecker = KmerCheckerFactory::createNew( + kmersOptions, + threadCount, + getReads(), + *this); +} + + + +void Assembler::accessKmerChecker() +{ + kmerChecker = KmerCheckerFactory::createFromBinaryData( + assemblerInfo->k, + assemblerInfo->kmerGenerationMethod, + getReads(), + *this); +} + + + +// Hash a KmerId in such a way that it has the same hash as its reverse +// complement. This is used by alignment method 3 to downsample markers. +uint32_t Assembler::hashKmerId(KmerId kmerId) const +{ + const uint64_t k = assemblerInfo->k; + + // Construct the k-mer and its reverse complement. + const Kmer kmer(kmerId, k); + const Kmer kmerRc = kmer.reverseComplement(k); + + // Compute the id of the reverse complement k-mer. + const KmerId kmerIdRc = KmerId(kmerRc.id(k)); + + // Hash the sum of the two KmerIds. + // This guarantees that we return the same hash + // for a k-mer and its reverse complement. 
+ const uint64_t sum = kmerId + kmerIdRc; + + return MurmurHash2(&sum, sizeof(sum), 13477); +} + + + + + + diff --git a/src/Assembler.hpp b/src/Assembler.hpp index d6e69e8..d160025 100644 --- a/src/Assembler.hpp +++ b/src/Assembler.hpp @@ -6,9 +6,12 @@ #include "AlignmentCandidates.hpp" #include "AssemblyGraph2Statistics.hpp" #include "HttpServer.hpp" +#include "invalid.hpp" #include "Kmer.hpp" +#include "MappedMemoryOwner.hpp" #include "Marker.hpp" #include "MarkerGraph.hpp" +#include "MarkerGraphEdgePairInfo.hpp" #include "MemoryMappedObject.hpp" #include "MultithreadedObject.hpp" #include "ReadGraph.hpp" @@ -25,7 +28,6 @@ namespace shasta { class Assembler; class AssemblerInfo; - class AssemblyGraph; class Alignment; class AlignmentData; class AlignmentGraph; @@ -38,21 +40,28 @@ namespace shasta { class ConsensusCaller; class Histogram2; class InducedAlignment; + class KmerChecker; + class KmersOptions; class LocalAssemblyGraph; class LocalAlignmentCandidateGraph; class LocalAlignmentGraph; - class LocalMarkerGraph; + class LocalMarkerGraph0; class LocalReadGraph; class LocalReadGraphTriangles; - class LocalMarkerGraphRequestParameters; + class LocalMarkerGraph0RequestParameters; class LongBaseSequences; class MarkerConnectivityGraph; class MarkerConnectivityGraphVertexMap; class Mode2AssemblyOptions; + class Mode3AssemblyOptions; + class Mode3Assembler; class OrientedReadPair; class Reads; class ReferenceOverlapMap; + namespace mode0 { + class AssemblyGraph; + } namespace MemoryMapped { class ByteAllocator; @@ -70,10 +79,6 @@ namespace shasta { class Options; } - namespace mode3 { - class AssemblyGraph; - } - extern template class MultithreadedObject<Assembler>; } @@ -102,6 +107,9 @@ public: // The length of k-mers used to define markers. size_t k; + // The method used to generate kmers (--Kmers.generationMethod). + uint64_t kmerGenerationMethod; + // The page size in use for this run. 
size_t largeDataPageSize; @@ -187,8 +195,9 @@ public: class shasta::Assembler : - public MultithreadedObject<Assembler> - , public HttpServer { + public MultithreadedObject<Assembler>, + public MappedMemoryOwner, + public HttpServer { public: @@ -230,8 +239,6 @@ public: void findMarkers(size_t threadCount); void accessMarkers(); void writeMarkers(ReadId, Strand, const string& fileName); - vector<KmerId> getMarkers(ReadId, Strand); - void writeMarkerFrequency(); // Write the reads that overlap a given read. void writeOverlappingReads(ReadId, Strand, const string& fileName); @@ -270,7 +277,6 @@ public: ); void accessAlignmentData(); void accessAlignmentDataReadWrite(); - void writeAlignmentDetails() const; // Loop over all alignments in the read graph @@ -322,9 +328,6 @@ public: vector< tuple<ReadId, Strand, uint32_t> > getGlobalMarkerGraphVertexMarkers(MarkerGraph::VertexId) const; - // Compute marker graph vertex coverage statistics by KmerId. - void vertexCoverageStatisticsByKmerId() const; - // Approximate transitive reduction of the marker graph. @@ -358,45 +361,6 @@ public: - // Approximate reverse transitive reduction of the marker graph. - // The goal is to remove local back-edges. - // This works similarly to transitive reduction, - // but in the opposite direction. - // This does the following: - // - Edges with coverage greater than lowCoverageThreshold - // and less then highCoverageThreshold are processed in - // ordered of increasing coverage: - // * For each such edge A->B, we look for a path of length - // at most maxDistance starting at B and ending at A - // that does not use edge A->B and also does not use any - // edges already marked wasRemovedByTransitiveReduction. - // * If such a path is found, the edge is marked - // wasRemovedByTransitiveReduction. - void reverseTransitiveReduction( - size_t lowCoverageThreshold, - size_t highCoverageThreshold, - size_t maxDistance); - - - -private: - - // Data filled in by the constructor. 
- string largeDataFileNamePrefix; - size_t largeDataPageSize; - - // Function to construct names for binary objects. - string largeDataName(const string& name) const - { - if(largeDataFileNamePrefix.empty()) { - return ""; // Anonymous; - } else { - return largeDataFileNamePrefix + name; - } - } - - - // Various pieces of assembler information stored in shared memory. // See class AssemblerInfo for more information. public: @@ -425,160 +389,44 @@ public: void computeReadIdsSortedByName(); + // Find duplicate reads, as determined by name (not sequence). + // This also sets the isDuplicate and discardDueToDuplicates read flags + // and summarizes what it found Duplicates.csv. + void findDuplicateReads(const string& handleDuplicates); -private: - - - - // Table of all k-mers of length k. - // Among all 4^k k-mers of length k, we choose a subset - // that we call "markers". - // The value of k used is stored in assemblerInfo. - // The k-mer table is a vector of 4^k pairs, - // indexed by k-mer id as computed using Kmer::id(k). - // The markers are selected at the beginning of an assembly - // and never changed, and selected in such a way that, - // if (and only if) a k-mer is a marker, its reverse complement - // is also a marker. That is, for all permitted values of i, 0 <= i < 4^k: - // kmerTable[i].isMarker == kmerTable[kmerTable[i].reverseComplementKmerId].isMarker - MemoryMapped::Vector<KmerInfo> kmerTable; - void checkKmersAreOpen() const; - -public: - void accessKmers(); - void writeKmers(const string& fileName) const; - - // Select marker k-mers randomly. - void randomlySelectKmers( - size_t k, // k-mer length. - double probability, // The probability that a k-mer is selected as a marker. - int seed // For random number generator. - ); - - - - // Select marker k-mers randomly, but excluding - // the ones that have high frequency in the reads. - void selectKmersBasedOnFrequency( - - // k-mer length. 
- size_t k, - - // The desired marker density - double markerDensity, - - // Seed for random number generator. - int seed, - - // Exclude k-mers enriched by more than this amount. - // Enrichment is the ratio of k-mer frequency in reads - // over what a random distribution would give. - double enrichmentThreshold, - - size_t threadCount - ); - - - - // In this version, marker k-mers are selected randomly, but excluding - // any k-mer that is over-enriched even in a single oriented read. - void selectKmers2( - - // k-mer length. - size_t k, - // The desired marker density - double markerDensity, - - // Seed for random number generator. - int seed, - - // Exclude k-mers enriched by more than this amount, - // even in a single oriented read. - // Enrichment is the ratio of k-mer frequency in reads - // over what a random distribution would give. - double enrichmentThreshold, - - size_t threadCount - ); private: - class SelectKmers2Data { - public: - - double enrichmentThreshold; - - // The number of times each k-mer appears in an oriented read. - // Indexed by KmerId. - MemoryMapped::Vector<uint64_t> globalFrequency; - - // The number of oriented reads that each k-mer is - // over-enriched in by more than a factor enrichmentThreshold. - // Indexed by KmerId. - MemoryMapped::Vector<ReadId> overenrichedReadCount; - - }; - SelectKmers2Data selectKmers2Data; - void selectKmers2ThreadFunction(size_t threadId); - - - - // In this version, marker k-mers are selected randomly, but excluding - // k-mers that appear repeated at short distances in any oriented read. - // More precisely, for each k-mer we compute the minimum distance - // (in RLE bases) at which any two copies of that k-mer appear in any oriented read. - // K-mers for which this minimum distance is less than distanceThreshold - // are not used as markers. Marker k-mers are selected randomly among the - // remaining k-mers, until the desired marker density is achieved. 
-public: - void selectKmers4( - - // k-mer length. - uint64_t k, - // The desired marker density - double markerDensity, - // Seed for random number generator. - uint64_t seed, - // Exclude k-mers that appear in any read in two copies, - // with the two copies closer than this distance (in RLE bases). - uint64_t distanceThreshold, - - size_t threadCount - ); -private: - void selectKmers4ThreadFunction(size_t threadId); - class SelectKmers4Data { + // The KmerChecker can find out if a given KmerId is a marker. + shared_ptr<KmerChecker> kmerChecker; public: + void createKmerChecker( + const KmersOptions& kmersOptions, + uint64_t threadCount); + void accessKmerChecker(); + + // This one should eventually go away, but there are several scripts + // that depend on it. + void accessKmers() + { + accessKmerChecker(); + } - // The number of times each k-mer appears in an oriented read. - // Indexed by KmerId. - MemoryMapped::Vector<uint64_t> globalFrequency; - - // The minimum distance at which two copies of each k-mer - // appear in any oriented read. - // Indexed by KmerId. - MemoryMapped::Vector< pair<std::mutex, uint32_t> > minimumDistance; - - }; - SelectKmers4Data selectKmers4Data; - - - - // Read the k-mers from file. -public: - void readKmersFromFile(uint64_t k, const string& fileName); private: - void computeKmerFrequency(size_t threadId); - void initializeKmerTable(); + // Hash a KmerId in such a way that it has the same hash as its reverse + // complement. This is used by alignment method 3 to downsample markers. + uint32_t hashKmerId(KmerId) const; // The markers on all oriented reads. Indexed by OrientedReadId::getValue(). +public: MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t> markers; +private: void checkMarkersAreOpen() const; // Get markers sorted by KmerId for a given OrientedReadId. @@ -588,7 +436,9 @@ private: // Given a marker by its OrientedReadId and ordinal, // return the corresponding global marker id. 
+public: MarkerId getMarkerId(OrientedReadId, uint32_t ordinal) const; +private: MarkerId getReverseComplementMarkerId(OrientedReadId, uint32_t ordinal) const; MarkerId getMarkerId(const MarkerDescriptor& m) const { @@ -606,20 +456,95 @@ private: // an additional 4 bytes per marker. public: pair<OrientedReadId, uint32_t> findMarkerId(MarkerId) const; -private: + // KmerIds for all markers. Indexed by OrientedReadId::getValue(). + // Only stored during alignment computation, and then freed. + MemoryMapped::VectorOfVectors<KmerId, uint64_t> markerKmerIds; + void computeMarkerKmerIds(uint64_t threadCount); + void cleanupMarkerKmerIds(); +private: + void computeMarkerKmerIdsThreadFunction(size_t threadId); + // Pairs (KmerId, ordinal), sorted by KmerId, for each oriented read. // Indexed by orientedReadId.getValue(). // Used by alignment method 4. - MemoryMapped::VectorOfVectors< pair<KmerId, uint32_t>, uint64_t> sortedMarkers; public: + MemoryMapped::VectorOfVectors< pair<KmerId, uint32_t>, uint64_t> sortedMarkers; void computeSortedMarkers(uint64_t threadCount); bool accessSortedMarkers(); private: - void computeSortedMarkersThreadFunction1(size_t threadId); - void computeSortedMarkersThreadFunction2(size_t threadId); + void computeSortedMarkersThreadFunction(size_t threadId); + // void computeSortedMarkersThreadFunction1(size_t threadId); + // void computeSortedMarkersThreadFunction2(size_t threadId); + + + + // Low frequency markers for each oriented read. + // This stores, for each oriented read, the ordinals corresponding + // to marker with low frequency (up to maxMarkerFrequency), sorted by KmerId. + // Used by alignment method 5. It is only stored durign alignment + // computation. 
+public: + MemoryMapped::VectorOfVectors<uint32_t, uint64_t> lowFrequencyMarkers; + void computeLowFrequencyMarkers(uint64_t maxMarkerFrequency, uint64_t threadCount); + void computeLowFrequencyMarkers( + const span<const KmerId>&, // The marker k-mers for the oriented reads (sorted by ordinal) + uint64_t maxMarkerFrequency, + vector<uint32_t>&); // The ordinals of the low frequency markers, sorted by KmerId. +private: + void computeLowFrequencyMarkersThreadFunctionPass1(uint64_t threadId); + void computeLowFrequencyMarkersThreadFunctionPass2(uint64_t threadId); + void computeLowFrequencyMarkersThreadFunctionPass12(uint64_t pass); + class ComputeLowFrequencyMarkersData { + public: + uint64_t maxMarkerFrequency; + }; + ComputeLowFrequencyMarkersData computeLowFrequencyMarkersData; + + + + // Low level functions to get marker Kmers/KmerIds of an oriented read. + // They are obtained from the reads and not from CompressedMarker::kmerId, + // which will soon go away. + + // Get the marker Kmer for an oriented read and ordinal. + Kmer getOrientedReadMarkerKmer(OrientedReadId, uint32_t ordinal) const; + Kmer getOrientedReadMarkerKmerStrand0(ReadId, uint32_t ordinal) const; + Kmer getOrientedReadMarkerKmerStrand1(ReadId, uint32_t ordinal) const; + + // Get the marker KmerId for an oriented read and ordinal. + KmerId getOrientedReadMarkerKmerId(OrientedReadId, uint32_t ordinal) const; + + // Get all marker Kmers for an oriented read. + void getOrientedReadMarkerKmers(OrientedReadId, const span<Kmer>&) const; + void getOrientedReadMarkerKmersStrand0(ReadId, const span<Kmer>&) const; + void getOrientedReadMarkerKmersStrand1(ReadId, const span<Kmer>&) const; + + // Get all marker KmerIds for an oriented read. 
+ void getOrientedReadMarkerKmerIds(OrientedReadId, const span<KmerId>&) const; + void getOrientedReadMarkerKmerIdsStrand0(ReadId, const span<KmerId>&) const; + void getOrientedReadMarkerKmerIdsStrand1(ReadId, const span<KmerId>&) const; + + // Get all MarkerWithOrdinals for an oriented read (includes position, KmerId, and ordinal). + void getOrientedReadMarkers(OrientedReadId, const span<MarkerWithOrdinal>&) const; + void getOrientedReadMarkersStrand0(ReadId, const span<MarkerWithOrdinal>&) const; + void getOrientedReadMarkersStrand1(ReadId, const span<MarkerWithOrdinal>&) const; + + // Get all marker Kmers/KmerIds for a read in both orientations. + void getReadMarkerKmers( + ReadId, + const span<Kmer>& Kmers0, + const span<Kmer>& Kmers1) const; + void getReadMarkerKmerIds( + ReadId, + const span<KmerId>& kmerIds0, + const span<KmerId>& kmerIds1) const; + + // Get the Kmer/KmerId for an oriented read at a given marker ordinal. + Kmer getOrientedReadMarkerKmer(OrientedReadId, uint64_t ordinal) const; + KmerId getOrientedReadMarkerKmerId(OrientedReadId, uint64_t ordinal) const; @@ -698,16 +623,6 @@ public: size_t minFrequency, // Minimum number of lowHash hits for a pair to become a candidate. size_t threadCount ); - void findAlignmentCandidatesLowHash1( - size_t m, // Number of consecutive k-mers that define a feature. - double hashFraction, // Low hash threshold. - size_t minHashIterationCount, - size_t log2MinHashBucketCount, // Base 2 log of number of buckets for lowHash. - size_t minBucketSize, // The minimum size for a bucket to be used. - size_t maxBucketSize, // The maximum size for a bucket to be used. - size_t minFrequency, // Minimum number of lowHash hits for a pair to become a candidate. 
- size_t threadCount - ); void markAlignmentCandidatesAllPairs(); void accessAlignmentCandidates(); void accessAlignmentCandidateTable(); @@ -850,7 +765,7 @@ public: uint64_t maxBand, int64_t matchScore, int64_t mismatchScore, - int64_t gapScore) const; + int64_t gapScore); // Align two reads using alignment method 4. // If debug is true, detailed output to html is produced. @@ -862,7 +777,7 @@ public: MemoryMapped::ByteAllocator&, Alignment&, AlignmentInfo&, - bool debug) const; + bool debug); // Intermediate level version used by the http server. void alignOrientedReads4( @@ -883,7 +798,20 @@ public: int64_t gapScore, Alignment&, AlignmentInfo& - ) const; + ); + + // Alignment method 5. + void alignOrientedReads5( + OrientedReadId, + OrientedReadId, + int matchScore, + int mismatchScore, + int gapScore, + double driftRateTolerance, + uint64_t minBandExtend, + Alignment&, + AlignmentInfo&, + ostream& html); private: @@ -939,7 +867,9 @@ private: // The good alignments we found. // They are stored with readId0<readId1 and with strand0==0. // The order in compressedAlignments matches that in alignmentData. +public: MemoryMapped::Vector<AlignmentData> alignmentData; +private: MemoryMapped::VectorOfVectors<char, uint64_t> compressedAlignments; void checkAlignmentDataAreOpen() const; @@ -1011,8 +941,8 @@ private: // Read graph and related functions and data. // For more information, see comments in ReadGraph.hpp. 
- ReadGraph readGraph; public: + ReadGraph readGraph; void createReadGraph( uint32_t maxAlignmentCount, uint32_t maxTrim); @@ -1291,6 +1221,7 @@ public: void accessMarkerGraphVertices(bool readWriteAccess = false); void accessMarkerGraphReverseComplementVertex(bool readWriteAccess = false); void removeMarkerGraphVertices(); + void accessDisjointSetsHistogram(); private: void findMarkerGraphReverseComplementVerticesThreadFunction1(size_t threadId); void findMarkerGraphReverseComplementVerticesThreadFunction2(size_t threadId); @@ -1309,7 +1240,8 @@ private: uint32_t maxSkip, vector<MarkerGraphVertexId>&) const; - + // Find the common KmerId for all the markers of a marker graph vertex. + KmerId getMarkerGraphVertexKmerId(MarkerGraphVertexId) const; // Clean up marker graph vertices that have duplicate markers // (more than one marker on the same oriented reads). @@ -1461,6 +1393,38 @@ public: + // Analyze and compare the read compositions of two marker graph edges. + // This can only be done if the two edges have no duplicate OrientedReadIds + // in the markers. In that case, each OrientedReadId of an edge + // corresponds to one and only one markerInterval for each edge. + bool analyzeMarkerGraphEdgePair( + MarkerGraphEdgeId, + MarkerGraphEdgeId, + MarkerGraphEdgePairInfo& + ) const; + void writeHtmlMarkerGraphEdgePairInfo( + ostream& html, + MarkerGraphEdgeId, + MarkerGraphEdgeId, + const MarkerGraphEdgePairInfo& + ) const; + + // Count the number of common oriented reads between two marker graph edges. + // This assumes, WITHOUT CHECKING, that each of the two edges has no duplicate + // oriented reads. This assumption is satisfied for primary marker graph edges + // in Mode 3 assembly. + uint64_t countCommonOrientedReadsUnsafe(MarkerGraphEdgeId, MarkerGraphEdgeId) const; + + // Estimate the offset, in bases, between two marker graph edges. + // This assumes, WITHOUT CHECKING, that each of the two edges has no duplicate + // oriented reads. 
This assumption is satisfied for primary marker graph edges + // in Mode 3 assembly. + // If there are common oriented reads between the two edges, this uses + // countCommonOrientedReadsUnsafe. + // This can fail, in which case it returns invalid<uint64_t>. + uint64_t estimateBaseOffsetUnsafe(MarkerGraphEdgeId, MarkerGraphEdgeId) const; + + // Function createMarkerGraphSecondaryEdges can be called after createMarkerGraphEdgesStrict // to create a minimal amount of additional non-strict edges (secondary edges) // sufficient to restore contiguity. @@ -1563,9 +1527,11 @@ private: // it belongs to, plus the ordinal of the marker in the oriented read. // If the marker is not contained in any vertex, return // MarkerGraph::invalidVertexId. +public: MarkerGraph::VertexId getGlobalMarkerGraphVertex( OrientedReadId, uint32_t ordinal) const; +private: // Get pairs (ordinal, marker graph vertex id) for all markers of an oriented read. // The pairs are returned sorted by ordinal. @@ -1711,7 +1677,7 @@ private: bool useSuperBubbleEdges, bool useLowCoverageCrossEdges, bool useRemovedSecondaryEdges, - LocalMarkerGraph& + LocalMarkerGraph0& ); bool extractLocalMarkerGraph( MarkerGraph::VertexId, @@ -1724,7 +1690,7 @@ private: bool useSuperBubbleEdges, bool useLowCoverageCrossEdges, bool useRemovedSecondaryEdges, - LocalMarkerGraph& + LocalMarkerGraph0& ); // Compute consensus sequence for a vertex of the marker graph. @@ -1777,20 +1743,6 @@ private: - // Get the RLE sequence implied by a MarkerInterval. - // If the markers overlap, returns the number of - // overlapping RLE bases in overlappingRleBaseCount - // and empty rleSequence. - // Otherwise, returns zero overlappingRleBaseCount - // and the intervening sequence in rleSequence - // (which can be empty if the two markers are exactly adjacent). 
- void getMarkerIntervalRleSequence( - const MarkerInterval&, - uint64_t& overlappingRleBaseCount, - vector<Base>& rleSequence) const; - - - // Use spoa to compute consensus sequence for an edge of the marker graph. // This does not include the bases corresponding to the flanking markers. void computeMarkerGraphEdgeConsensusSequenceUsingSpoa( @@ -1840,7 +1792,7 @@ public: // A directed vertex A->B is created if the last marker graph vertex // of the edge chain corresponding to A coincides with the // first marker graph vertex of the edge chain corresponding to B. - shared_ptr<AssemblyGraph> assemblyGraphPointer; + shared_ptr<mode0::AssemblyGraph> assemblyGraphPointer; void removeAssemblyGraph() { assemblyGraphPointer.reset(); @@ -2045,7 +1997,17 @@ public: private: + + // Assemble Mode 3 sequence for all marker graph edges. + // See the comments before MarkerGraph::edgeSequence for more information. + // For now this is done sequentially. +public: + void assembleMarkerGraphEdgesMode3(); + + + // Assemble sequence for an edge of the assembly graph. +private: void assembleAssemblyGraphEdge( AssemblyGraphEdgeId, bool storeCoverageData, @@ -2085,24 +2047,9 @@ private: - // Assemble the RLE sequence of a path of the marker graph, under the assumption - // that, for each edge, all oriented reads have exactly the same sequence. - // This will be the case if edges were created by Assembler::createMarkerGraphEdgesStrict. -public: - void assembleMarkerGraphPathRleStrict( - span<const MarkerGraphEdgeId> path, - vector<Base>& rleSequence - ) const; - // Same, but for an assembly graph edge. 
- void assembleAssemblyGraphEdgeRleStrict( - AssemblyGraphEdgeId, - vector<Base>& rleSequence - ) const; - - - // Write the assembly graph in GFA 1.0 format defined here: // https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md +public: void writeGfa1(const string& fileName); void writeGfa1BothStrands(const string& fileName); void writeGfa1BothStrandsNoSequence(const string& fileName); @@ -2151,11 +2098,6 @@ public: - // Analyze pseudo-paths of oriented reads. - void alignPseudoPaths(ReadId, Strand, ReadId, Strand); - - - // Data and functions used for the http server. // This function puts the server into an endless loop // of processing requests. @@ -2247,12 +2189,14 @@ public: // Functions and data used by the http server // for display of the local marker graph. - void exploreMarkerGraph(const vector<string>&, ostream&); - void getLocalMarkerGraphRequestParameters( + void exploreMarkerGraph0(const vector<string>&, ostream&); + void exploreMarkerGraph1(const vector<string>&, ostream&); + void getLocalMarkerGraph0RequestParameters( const vector<string>&, - LocalMarkerGraphRequestParameters&) const; + LocalMarkerGraph0RequestParameters&) const; void exploreMarkerGraphVertex(const vector<string>&, ostream&); void exploreMarkerGraphEdge(const vector<string>&, ostream&); + void exploreMarkerGraphEdgePair(const vector<string>&, ostream&); void exploreMarkerCoverage(const vector<string>&, ostream&); void exploreMarkerGraphInducedAlignment(const vector<string>&, ostream&); void followReadInMarkerGraph(const vector<string>&, ostream&); @@ -2275,6 +2219,8 @@ public: uint64_t align4DeltaY, uint64_t align4MinEntryCountPerCell, uint64_t align4MaxDistanceFromBoundary, + double align5DriftRateTolerance, + uint64_t align5MinBandExtend, ostream& html ); void writeColorPicker(ostream& html, string svgId); @@ -2305,7 +2251,7 @@ public: // Compute all alignments for a given read. // This can be slow for large assemblies, - // and therefore the computation in multithreaded. 
+ // and therefore the computation is multithreaded. void computeAllAlignments(const vector<string>&, ostream&); void computeAllAlignmentsThreadFunction(size_t threadId); class ComputeAllAlignmentsData { @@ -2329,6 +2275,8 @@ public: uint64_t align4DeltaY; uint64_t align4MinEntryCountPerCell; uint64_t align4MaxDistanceFromBoundary; + double align5DriftRateTolerance; + uint64_t align5MinBandExtend; // The alignments found by each thread. vector< vector< pair<OrientedReadId, AlignmentInfo> > > threadAlignments; }; @@ -2374,16 +2322,6 @@ private: void exploreAssemblyGraphEdgesSupport(const vector<string>&, ostream&); - // Http server functionality specific to mode 3 assembly. - void exploreMode3AssemblyGraph(const vector<string>&, ostream&); - void exploreMode3AssemblyGraphSegment(const vector<string>&, ostream&); - void exploreMode3AssemblyGraphSegmentPair(const vector<string>&, ostream&); - void exploreMode3AssemblyGraphLink(const vector<string>&, ostream&); - void exploreMode3MetaAlignment(const vector<string>&, ostream&); - void exploreMode3AssemblyPath(const vector<string>&, ostream&); - void exploreMode3LinkAssembly(const vector<string>&, ostream&); - - // Set up the ConsensusCaller used to compute the "best" // base and repeat count at each assembly position. @@ -2415,16 +2353,30 @@ public: bool debug); + // Mode 3 assembly. - void mode3Assembly( - size_t threadCount); - shared_ptr<mode3::AssemblyGraph> assemblyGraph3Pointer; - void accessMode3AssemblyGraph(); - void analyzeMode3Subgraph(const vector<uint64_t>& segmentIds); - void createMode3PathGraph(); - void createMode3Detangler(); + shared_ptr<Mode3Assembler> mode3Assembler; + void flagPrimaryMarkerGraphEdges( + uint64_t minPrimaryCoverage, + uint64_t maxPrimaryCoverage, + uint64_t threadCount); + // Assemble sequence between two primary edges. + void fillMode3AssemblyPathStep(const vector<string>&, ostream&); + // Top level function for Mode 3 assembly, starting from the MarkerGraph. 
+ void mode3Assembly( + uint64_t threadCount, + const Mode3AssemblyOptions&, + bool debug + ); + + void mode3AssembleComponent( + const string& fileName, + uint64_t threadCount, + const Mode3AssemblyOptions&, + bool assembleSequence, + bool debug) const; public: void test(); diff --git a/src/AssemblerAlign.cpp b/src/AssemblerAlign.cpp index 7f9a1bc..88990f7 100644 --- a/src/AssemblerAlign.cpp +++ b/src/AssemblerAlign.cpp @@ -214,13 +214,14 @@ void Assembler::computeAlignments( size_t threadCount ) { + const auto tBegin = steady_clock::now(); performanceLog << timestamp << "Begin computing alignments for "; performanceLog << alignmentCandidates.candidates.size() << " alignment candidates." << endl; // Check that we have what we need. reads->checkReadsAreOpen(); - checkKmersAreOpen(); + SHASTA_ASSERT(kmerChecker); checkMarkersAreOpen(); checkAlignmentCandidatesAreOpen(); @@ -239,6 +240,12 @@ void Assembler::computeAlignments( computeSortedMarkers(threadCount); } + // For alignment method 5, compute low frequency markers. + if(alignOptions.alignMethod == 5) { + cout << timestamp << "Computing unique markers." << endl; + computeLowFrequencyMarkers(1, threadCount); + } + // Pick the batch size for computing alignments. size_t batchSize = 10; if(batchSize > alignmentCandidates.candidates.size()/threadCount) { @@ -254,9 +261,11 @@ void Assembler::computeAlignments( data.threadCompressedAlignments.resize(threadCount); performanceLog << timestamp << "Alignment computation begins." << endl; + cout << timestamp << "Alignment computation begins." << endl; setupLoadBalancing(alignmentCandidates.candidates.size(), batchSize); runThreads(&Assembler::computeAlignmentsThreadFunction, threadCount); performanceLog << timestamp << "Alignment computation completed." << endl; + cout << timestamp << "Alignment computation completed." << endl; // Store the alignments found by each thread. performanceLog << timestamp << "Storing the alignment found by each thread." 
<< endl; @@ -286,10 +295,13 @@ void Assembler::computeAlignments( alignmentData.unreserve(); compressedAlignments.unreserve(); - // For alignment method 4, remove the sorted markers. + // Cleanup. if(alignOptions.alignMethod == 4) { sortedMarkers.remove(); } + if(alignOptions.alignMethod == 5) { + lowFrequencyMarkers.remove(); + } cout << "Found and stored " << alignmentData.size() << " good alignments." << endl; performanceLog << timestamp << "Creating alignment table." << endl; @@ -331,6 +343,8 @@ void Assembler::computeAlignmentsThreadFunction(size_t threadId) const int bandExtend = data.alignOptions->bandExtend; const int maxBand = data.alignOptions->maxBand; const bool suppressContainments = data.alignOptions->suppressContainments; + const double align5DriftRateTolerance = data.alignOptions->align5DriftRateTolerance; + const uint64_t align5MinBandExtend = data.alignOptions->align5MinBandExtend; // Align4-specific items. @@ -365,13 +379,10 @@ void Assembler::computeAlignmentsThreadFunction(size_t threadId) largeDataName("tmp-ThreadGlobalCompressedAlignments-" + to_string(threadId)), largeDataPageSize); + const uint64_t messageFrequency = min(1000000UL, alignmentCandidates.candidates.size()/20); + uint64_t begin, end; while(getNextBatch(begin, end)) { - if((begin % 1000000) == 0){ - std::lock_guard<std::mutex> lock(mutex); - performanceLog << timestamp << "Working on alignment " << begin; - performanceLog << " of " << alignmentCandidates.candidates.size() << endl; - } for(size_t i=begin; i!=end; i++) { const OrientedReadPair& candidate = alignmentCandidates.candidates[i]; @@ -381,6 +392,13 @@ void Assembler::computeAlignmentsThreadFunction(size_t threadId) orientedReadIds[0] = OrientedReadId(candidate.readIds[0], 0); orientedReadIds[1] = OrientedReadId(candidate.readIds[1], candidate.isSameStrand ? 
0 : 1); + if((i % messageFrequency) == 0){ + std::lock_guard<std::mutex> lock(mutex); + performanceLog << timestamp << "Working on alignment " << i; + performanceLog << " of " << alignmentCandidates.candidates.size(); + // performanceLog << ": " << orientedReadIds[0] << " " << orientedReadIds[1]; + performanceLog << endl; + } // Compute the alignment. @@ -413,6 +431,13 @@ void Assembler::computeAlignmentsThreadFunction(size_t threadId) alignment, alignmentInfo, false); SHASTA_ASSERT(byteAllocator.isEmpty()); + } else if(alignmentMethod == 5) { + ofstream nullStream; + alignOrientedReads5(orientedReadIds[0], orientedReadIds[1], + matchScore, mismatchScore, gapScore, + align5DriftRateTolerance, align5MinBandExtend, + alignment, alignmentInfo, + nullStream); } else { SHASTA_ASSERT(0); } @@ -1008,70 +1033,6 @@ uint32_t Assembler::countCommonMarkersWithOffsetIn( } -void Assembler::writeAlignmentDetails() const -{ - string directoryName = "Alignments/"; - string header = "kmerId,ordinal0,ordinal1,rlePosition0,rlePosition1,"; - - SHASTA_ASSERT(std::filesystem::create_directory(directoryName)); - - for (uint32_t alignmentIndex=0; alignmentIndex<alignmentData.size(); alignmentIndex++){ - // Access the stored information we have about this alignment. - AlignmentData alignmentDatum = alignmentData[alignmentIndex]; - span<const char> compressedAlignment = compressedAlignments[alignmentIndex]; - - Alignment alignment; - decompress(compressedAlignment, alignment); - OrientedReadId orientedReadId0 = OrientedReadId(alignmentDatum.readIds[0], 0); - OrientedReadId orientedReadId1 = OrientedReadId(alignmentDatum.readIds[1], alignmentDatum.isSameStrand ? 
0 : 1); - - string name0 = string(reads->getReadName(orientedReadId0.getReadId()).begin(), - reads->getReadName(orientedReadId0.getReadId()).end()); - - string name1 = string(reads->getReadName(orientedReadId1.getReadId()).begin(), - reads->getReadName(orientedReadId1.getReadId()).end()); - - string filename = name0 + "_" + name1 + "_" + (alignmentDatum.isSameStrand ? "1" : "0") + ".csv"; - - // Create a writeable a csv file - ofstream csv(directoryName + filename); - if (not (csv.is_open() and csv.good())){ - throw runtime_error("ERROR: file could not be written: " + directoryName + filename); - } - - csv << header << '\n'; - - // Access the markers for the two oriented reads. - const auto markers0 = markers[orientedReadId0.getValue()]; - const auto markers1 = markers[orientedReadId1.getValue()]; - - // Compute the raw position corresponding to each RLE position. - const vector<uint32_t> rawPositions0 = reads->getRawPositions(orientedReadId0); - const vector<uint32_t> rawPositions1 = reads->getRawPositions(orientedReadId1); - - // Loop over all markers. 
- for(const auto& ordinals: alignment.ordinals) { - const auto ordinal0 = ordinals[0]; - const auto ordinal1 = ordinals[1]; - - const auto& marker0 = markers0[ordinal0]; - const auto& marker1 = markers1[ordinal1]; - - const uint32_t rlePosition0 = marker0.position; - const uint32_t rlePosition1 = marker1.position; - - const auto kmerId = marker0.kmerId; - SHASTA_ASSERT(marker1.kmerId == kmerId); - - csv << kmerId << ',' - << ordinal0 << ',' - << ordinal1 << ',' - << rlePosition0 << ',' - << rlePosition1 << ',' << '\n'; - } - } -} - // Check if an alignment between two reads should be suppressed, // bases on the setting of command line option diff --git a/src/AssemblerAlign1.cpp b/src/AssemblerAlign1.cpp index a407d0f..adeea60 100644 --- a/src/AssemblerAlign1.cpp +++ b/src/AssemblerAlign1.cpp @@ -72,52 +72,41 @@ void Assembler::alignOrientedReads1( using TDepStringSet = StringSet<TSequence, Dependent<> >; using TAlignGraph = Graph<Alignment<TDepStringSet> >; +#if 0 // Access the markers of our oriented reads. const span<CompressedMarker> markers0 = markers[orientedReadId0.getValue()]; const span<CompressedMarker> markers1 = markers[orientedReadId1.getValue()]; +#endif - - - // Seqan uses the integer 45 to represent a gap - // and I did not find a good way to control that. - // So if KmerId 45 is a marker we replace it with the first KmerId - // that does not represent a marker. - // This is messy but I did not find a better solution. - bool replacementIsNeeded = false; - const KmerId seqanGapValue = 45; - KmerId replacementValue = seqanGapValue; - if(kmerTable[seqanGapValue].isMarker) { - replacementIsNeeded = true; - for(uint64_t i=0; i<kmerTable.size(); i++) { - if(!kmerTable[i].isMarker) { - replacementValue = KmerId(i); - break; - } - } - // cout << "Replacement value " << replacementValue << endl; - SHASTA_ASSERT(replacementValue != seqanGapValue); + // Get the marker KmerIds for the two oriented reads. 
+ array<span<KmerId>, 2> allMarkerKmerIds; + array<vector<KmerId>, 2> allMarkerKmerIdsVectors; + if(markerKmerIds.isOpen()) { + allMarkerKmerIds[0] = markerKmerIds[orientedReadId0.getValue()]; + allMarkerKmerIds[1] = markerKmerIds[orientedReadId1.getValue()]; + } else { + // This is slower and will happen if markerKmerIds is not available. + // Resize the vectors and make the spans point to the vectors. + // Then call getOrientedReadMarkerKmerIds to fill them in. + allMarkerKmerIdsVectors[0].resize(markers.size(orientedReadId0.getValue())); + allMarkerKmerIdsVectors[1].resize(markers.size(orientedReadId1.getValue())); + allMarkerKmerIds[0] = span<KmerId>(allMarkerKmerIdsVectors[0]); + allMarkerKmerIds[1] = span<KmerId>(allMarkerKmerIdsVectors[1]); + getOrientedReadMarkerKmerIds(orientedReadId0, allMarkerKmerIds[0]); + getOrientedReadMarkerKmerIds(orientedReadId1, allMarkerKmerIds[1]); } - - // Construct the sequences of KmerId's we want to align. + // SeqAn uses 45 to represent gaps, so we add 100 to the KmerIds passed to SeqAn. TSequence seq0; - for(const CompressedMarker marker: markers0) { - if(replacementIsNeeded && marker.kmerId == seqanGapValue) { - appendValue(seq0, replacementValue); - } else { - appendValue(seq0, marker.kmerId); - } + for(const KmerId kmerId: allMarkerKmerIds[0]) { + appendValue(seq0, kmerId + 100); } TSequence seq1; - for(const CompressedMarker marker: markers1) { - if(replacementIsNeeded && marker.kmerId == seqanGapValue) { - appendValue(seq1, replacementValue); - } else { - appendValue(seq1, marker.kmerId); - } + for(const KmerId kmerId: allMarkerKmerIds[1]) { + appendValue(seq1, kmerId + 100); } // Store them in a SeqAn string set. 
@@ -157,11 +146,15 @@ void Assembler::alignOrientedReads1( alignment.clear(); uint32_t ordinal0 = 0; uint32_t ordinal1 = 0; + const uint32_t seqanGapValue = 45; for(int i=0; - i<alignmentLength and ordinal0<markers0.size() and ordinal1<markers1.size(); i++) { + i<alignmentLength and + ordinal0<allMarkerKmerIds[0].size() and + ordinal1<allMarkerKmerIds[1].size(); + i++) { if( align[i] != seqanGapValue and align[i + alignmentLength] != seqanGapValue and - markers0[ordinal0].kmerId == markers1[ordinal1].kmerId) { + allMarkerKmerIds[0][ordinal0] == allMarkerKmerIds[1][ordinal1]) { alignment.ordinals.push_back(array<uint32_t, 2>{ordinal0, ordinal1}); } if(align[i] != seqanGapValue) { @@ -173,7 +166,7 @@ void Assembler::alignOrientedReads1( } // Store the alignment info. - alignmentInfo.create(alignment, uint32_t(markers0.size()), uint32_t(markers1.size())); + alignmentInfo.create(alignment, uint32_t(allMarkerKmerIds[0].size()), uint32_t(allMarkerKmerIds[1].size())); // Debugging. @@ -186,8 +179,8 @@ void Assembler::alignOrientedReads1( alignment[0].resize(alignmentLength); alignment[1].resize(alignmentLength); for(int i=0; i<alignmentLength; i++) { - alignment[0][i] = align[i]; - alignment[1][i] = align[i + alignmentLength]; + alignment[0][i] = align[i] - 100; + alignment[1][i] = align[i + alignmentLength] - 100; } diff --git a/src/AssemblerAlign3.cpp b/src/AssemblerAlign3.cpp index 9b54d27..435e25b 100644 --- a/src/AssemblerAlign3.cpp +++ b/src/AssemblerAlign3.cpp @@ -53,11 +53,25 @@ void Assembler::alignOrientedReads3( using TDepStringSet = StringSet<TSequence, Dependent<> >; using TAlignGraph = Graph<Alignment<TDepStringSet> >; + // Get the marker KmerIds for the two oriented reads. 
+ array<span<KmerId>, 2> allMarkerKmerIds; + array<vector<KmerId>, 2> allMarkerKmerIdsVectors; + if(markerKmerIds.isOpen()) { + allMarkerKmerIds[0] = markerKmerIds[orientedReadId0.getValue()]; + allMarkerKmerIds[1] = markerKmerIds[orientedReadId1.getValue()]; + } else { + // This is slower and will happen if markerKmerIds is not available. + // Resize the vectors and make the spans point to the vectors. + // Then call getOrientedReadMarkerKmerIds to fill them in. + allMarkerKmerIdsVectors[0].resize(markers.size(orientedReadId0.getValue())); + allMarkerKmerIdsVectors[1].resize(markers.size(orientedReadId1.getValue())); + allMarkerKmerIds[0] = span<KmerId>(allMarkerKmerIdsVectors[0]); + allMarkerKmerIds[1] = span<KmerId>(allMarkerKmerIdsVectors[1]); + getOrientedReadMarkerKmerIds(orientedReadId0, allMarkerKmerIds[0]); + getOrientedReadMarkerKmerIds(orientedReadId1, allMarkerKmerIds[1]); + } + - // Get the markers for the two oriented reads. - array<span<CompressedMarker>, 2> allMarkers; - allMarkers[0] = markers[orientedReadId0.getValue()]; - allMarkers[1] = markers[orientedReadId1.getValue()]; // Vectors to contain downsampled markers. // For each of the two reads we store vectors of @@ -66,14 +80,14 @@ void Assembler::alignOrientedReads3( array<TSequence, 2> downsampledSequences; // Fill in downsampled markers. - // SeqAn uses 45 to represent gaps, so we add 45 to the KmerIds passed to SeqAn. + // SeqAn uses 45 to represent gaps, so we add 100 to the KmerIds passed to SeqAn. // This means that we can't do k=16. 
const uint32_t hashThreshold = uint32_t(downsamplingFactor * double(std::numeric_limits<uint32_t>::max())); for(uint64_t i=0; i<2; i++) { - for(uint32_t ordinal=0; ordinal<uint32_t(allMarkers[i].size()); ordinal++) { - const KmerId kmerId = allMarkers[i][ordinal].kmerId; - if(kmerTable[kmerId].hash < hashThreshold) { + for(uint32_t ordinal=0; ordinal<uint32_t(allMarkerKmerIds[i].size()); ordinal++) { + const KmerId kmerId = allMarkerKmerIds[i][ordinal]; + if(hashKmerId(kmerId) < hashThreshold) { downsampledMarkers[i].push_back(make_pair(ordinal, kmerId)); appendValue(downsampledSequences[i], kmerId + 100); } @@ -82,7 +96,7 @@ void Assembler::alignOrientedReads3( if(debug) { cout << "Aligning two oriented reads with " << - allMarkers[0].size() << " and " << allMarkers[1].size() << " markers." << endl; + allMarkerKmerIds[0].size() << " and " << allMarkerKmerIds[1].size() << " markers." << endl; cout << "Downsampled markers for step 1 to " << downsampledMarkers[0].size() << " and " << downsampledMarkers[1].size() << " markers." << endl; @@ -101,7 +115,7 @@ void Assembler::alignOrientedReads3( // One of the downsampled sequences is empty. Return an empty alignment. 
alignment.clear(); alignmentInfo.create( - alignment, uint32_t(allMarkers[0].size()), uint32_t(allMarkers[1].size())); + alignment, uint32_t(allMarkerKmerIds[0].size()), uint32_t(allMarkerKmerIds[1].size())); return; } @@ -186,7 +200,7 @@ void Assembler::alignOrientedReads3( downsampledMarkers[0].size() + downsampledMarkers[1].size()) { alignment.clear(); alignmentInfo.create( - alignment, uint32_t(allMarkers[0].size()), uint32_t(allMarkers[1].size())); + alignment, uint32_t(allMarkerKmerIds[0].size()), uint32_t(allMarkerKmerIds[1].size())); return; } @@ -234,7 +248,7 @@ void Assembler::alignOrientedReads3( if((bandMax - bandMin) > maxBand) { alignment.clear(); alignmentInfo.create( - alignment, uint32_t(allMarkers[0].size()), uint32_t(allMarkers[1].size())); + alignment, uint32_t(allMarkerKmerIds[0].size()), uint32_t(allMarkerKmerIds[1].size())); return; } @@ -243,8 +257,8 @@ void Assembler::alignOrientedReads3( // Now, do a alignment using this band and all markers. array<TSequence, 2> sequences; for(uint64_t i=0; i<2; i++) { - for(uint32_t ordinal=0; ordinal<uint32_t(allMarkers[i].size()); ordinal++) { - const KmerId kmerId = allMarkers[i][ordinal].kmerId; + for(uint32_t ordinal=0; ordinal<uint32_t(allMarkerKmerIds[i].size()); ordinal++) { + const KmerId kmerId = allMarkerKmerIds[i][ordinal]; appendValue(sequences[i], kmerId + 100); } } @@ -280,10 +294,10 @@ void Assembler::alignOrientedReads3( uint32_t ordinal0 = 0; uint32_t ordinal1 = 0; for(int i=0; - i<alignmentLength and ordinal0<allMarkers[0].size() and ordinal1<allMarkers[1].size(); i++) { + i<alignmentLength and ordinal0<allMarkerKmerIds[0].size() and ordinal1<allMarkerKmerIds[1].size(); i++) { if( align[i] != seqanGapValue and align[i + alignmentLength] != seqanGapValue and - allMarkers[0][ordinal0].kmerId == allMarkers[1][ordinal1].kmerId) { + allMarkerKmerIds[0][ordinal0] == allMarkerKmerIds[1][ordinal1]) { alignment.ordinals.push_back(array<uint32_t, 2>{ordinal0, ordinal1}); } if(align[i] != 
seqanGapValue) { @@ -308,7 +322,7 @@ void Assembler::alignOrientedReads3( } // Store the alignment info. - alignmentInfo.create(alignment, uint32_t(allMarkers[0].size()), uint32_t(allMarkers[1].size())); + alignmentInfo.create(alignment, uint32_t(allMarkerKmerIds[0].size()), uint32_t(allMarkerKmerIds[1].size())); } diff --git a/src/AssemblerAlign4.cpp b/src/AssemblerAlign4.cpp index 31823ee..666eb81 100644 --- a/src/AssemblerAlign4.cpp +++ b/src/AssemblerAlign4.cpp @@ -25,7 +25,7 @@ void Assembler::alignOrientedReads4( uint64_t maxBand, int64_t matchScore, int64_t mismatchScore, - int64_t gapScore) const + int64_t gapScore) { // Fill in the options. Align4::Options options; @@ -81,7 +81,7 @@ void Assembler::alignOrientedReads4( int64_t gapScore, Alignment& alignment, AlignmentInfo& alignmentInfo - ) const + ) { // Fill in the options. Align4::Options options; @@ -122,18 +122,32 @@ void Assembler::alignOrientedReads4( MemoryMapped::ByteAllocator& byteAllocator, Alignment& alignment, AlignmentInfo& alignmentInfo, - bool debug) const + bool debug) { - // Access the markers for the two oriented reads. - array<span<const CompressedMarker>, 2> orientedReadMarkers; - orientedReadMarkers[0] = markers[orientedReadId0.getValue()]; - orientedReadMarkers[1] = markers[orientedReadId1.getValue()]; + + // Get the marker KmerIds for the two oriented reads. + array<span<KmerId>, 2> orientedReadKmerIds; + array<vector<KmerId>, 2> orientedReadKmerIdsVectors; + if(markerKmerIds.isOpen()) { + orientedReadKmerIds[0] = markerKmerIds[orientedReadId0.getValue()]; + orientedReadKmerIds[1] = markerKmerIds[orientedReadId1.getValue()]; + } else { + // This is slower and will happen if markerKmerIds is not available. + // Resize the vectors and make the spans point to the vectors. + // Then call getOrientedReadMarkerKmerIds to fill them in. 
+ orientedReadKmerIdsVectors[0].resize(markers.size(orientedReadId0.getValue())); + orientedReadKmerIdsVectors[1].resize(markers.size(orientedReadId1.getValue())); + orientedReadKmerIds[0] = span<KmerId>(orientedReadKmerIdsVectors[0]); + orientedReadKmerIds[1] = span<KmerId>(orientedReadKmerIdsVectors[1]); + getOrientedReadMarkerKmerIds(orientedReadId0, orientedReadKmerIds[0]); + getOrientedReadMarkerKmerIds(orientedReadId1, orientedReadKmerIds[1]); + } // Align4 needs markers sorted by KmerId. // Use the ones from sortedMarkers if available, or else compute them. - array<span< const pair<KmerId, uint32_t> >, 2> orientedReadSortedMarkersSpans; + array<span< pair<KmerId, uint32_t> >, 2> orientedReadSortedMarkersSpans; array<vector< pair<KmerId, uint32_t> >, 2> orientedReadSortedMarkers; if(sortedMarkers.isOpen()) { @@ -154,33 +168,30 @@ void Assembler::alignOrientedReads4( for(uint64_t i=0; i<2; i++) { // Unsorted markers for this oriented read. - const span<const CompressedMarker>& um = orientedReadMarkers[i]; + const span<const KmerId>& km = orientedReadKmerIds[i]; // Sorted markers for this oriented read. vector<pair<KmerId, uint32_t> >& sm = orientedReadSortedMarkers[i]; // Copy the unsorted markers. - const uint64_t n = um.size(); + const uint64_t n = km.size(); sm.resize(n); for(uint64_t ordinal=0; ordinal<n; ordinal++) { - const CompressedMarker& cm = um[ordinal]; - sm[ordinal] = make_pair(cm.kmerId, uint32_t(ordinal)); + sm[ordinal] = make_pair(km[ordinal], uint32_t(ordinal)); } // Sort them. sort(sm.begin(), sm.end(), OrderPairsByFirstOnly<KmerId, uint32_t>()); // Make the span point to the data in the vector. - const pair<KmerId, uint32_t> * const smBegin = &sm.front(); - orientedReadSortedMarkersSpans[i] = - span< const pair<KmerId, uint32_t> >(smBegin, smBegin + n); + orientedReadSortedMarkersSpans[i] = sm; } } // Compute the alignment. 
- Align4::align(orientedReadMarkers, orientedReadSortedMarkersSpans, + Align4::align(orientedReadKmerIds, orientedReadSortedMarkersSpans, options, byteAllocator, alignment, alignmentInfo, debug); } @@ -192,6 +203,8 @@ void Assembler::computeSortedMarkers(uint64_t threadCount) // Check that we have what we need. checkMarkersAreOpen(); const uint64_t orientedReadCount = markers.size(); + SHASTA_ASSERT(markerKmerIds.isOpen()); + SHASTA_ASSERT(markerKmerIds.size() == orientedReadCount); // Adjust the numbers of threads, if necessary. if(threadCount == 0) { @@ -200,18 +213,48 @@ void Assembler::computeSortedMarkers(uint64_t threadCount) // Do it. sortedMarkers.createNew(largeDataName("SortedMarkers"), largeDataPageSize); - sortedMarkers.beginPass1(orientedReadCount); - const uint64_t batchSize = 10000; - setupLoadBalancing(orientedReadCount, batchSize); - runThreads(&Assembler::computeSortedMarkersThreadFunction1, threadCount); - sortedMarkers.beginPass2(); - sortedMarkers.endPass2(false); + for(uint64_t i=0; i<orientedReadCount; i++) { + sortedMarkers.appendVector(markers[i].size()); + } + const uint64_t batchSize = 100; setupLoadBalancing(orientedReadCount, batchSize); - runThreads(&Assembler::computeSortedMarkersThreadFunction2, threadCount); + runThreads(&Assembler::computeSortedMarkersThreadFunction, threadCount); +} + + + +void Assembler::computeSortedMarkersThreadFunction(size_t threadId) +{ + // Loop over all batches assigned to this thread. + uint64_t begin, end; + while(getNextBatch(begin, end)) { + + // Loop over oriented reads in this batch. + for(uint64_t i=begin; i!=end; i++) { + + // Access the marker KmerIs and sorted markers for this oriented read. + const auto kmerIds = markerKmerIds[i]; + const uint64_t markerCount = kmerIds.size(); + const span< pair<KmerId, uint32_t> > sm = sortedMarkers[i]; + SHASTA_ASSERT(sm.size() == markerCount); + + // Copy the KmerId's and ordinals. 
+ for(uint32_t ordinal=0; ordinal<markerCount; ordinal++) { + auto& p = sm[ordinal]; + p.first = kmerIds[ordinal]; + p.second = ordinal; + } + + // Sort them by KmerId. + sort(sm.begin(), sm.end(), OrderPairsByFirstOnly<KmerId, uint32_t>()); + } + } + } +#if 0 void Assembler::computeSortedMarkersThreadFunction1(size_t threadId) { // Loop over all batches assigned to this thread. @@ -259,6 +302,8 @@ void Assembler::computeSortedMarkersThreadFunction2(size_t threadId) } } +#endif + bool Assembler::accessSortedMarkers() diff --git a/src/AssemblerAlign5.cpp b/src/AssemblerAlign5.cpp new file mode 100644 index 0000000..c9d267f --- /dev/null +++ b/src/AssemblerAlign5.cpp @@ -0,0 +1,737 @@ +#include "Assembler.hpp" +#include "deduplicate.hpp" +#include "Reads.hpp" +#include "seqan.hpp" +using namespace shasta; + + + +// Version that uses banded alignments. +void Assembler::alignOrientedReads5( + OrientedReadId orientedReadId0, + OrientedReadId orientedReadId1, + int matchScore, + int mismatchScore, + int gapScore, + double driftRateTolerance, + uint64_t minBandExtend, + Alignment& alignment, + AlignmentInfo& alignmentInfo, + ostream& html) +{ + + // Get the marker KmerIds for the two oriented reads. + array<span<KmerId>, 2> allMarkerKmerIds; + array<vector<KmerId>, 2> allMarkerKmerIdsVectors; + if(markerKmerIds.isOpen()) { + allMarkerKmerIds[0] = markerKmerIds[orientedReadId0.getValue()]; + allMarkerKmerIds[1] = markerKmerIds[orientedReadId1.getValue()]; + } else { + // This is slower and will happen if markerKmerIds is not available. + // Resize the vectors and make the spans point to the vectors. + // Then call getOrientedReadMarkerKmerIds to fill them in. 
+ allMarkerKmerIdsVectors[0].resize(markers.size(orientedReadId0.getValue())); + allMarkerKmerIdsVectors[1].resize(markers.size(orientedReadId1.getValue())); + allMarkerKmerIds[0] = span<KmerId>(allMarkerKmerIdsVectors[0]); + allMarkerKmerIds[1] = span<KmerId>(allMarkerKmerIdsVectors[1]); + getOrientedReadMarkerKmerIds(orientedReadId0, allMarkerKmerIds[0]); + getOrientedReadMarkerKmerIds(orientedReadId1, allMarkerKmerIds[1]); + } + + + // Get the low frequency markers in the two oriented reads, sorted by KmerId. + array< span<uint32_t>, 2> lowFrequencyOrdinals; + array< vector<uint32_t>, 2> lowFrequencyOrdinalsVectors; + if(lowFrequencyMarkers.isOpen()) { + // Use the stored copy. + lowFrequencyOrdinals[0] = lowFrequencyMarkers[orientedReadId0.getValue()]; + lowFrequencyOrdinals[1] = lowFrequencyMarkers[orientedReadId1.getValue()]; + } + else { + // Compute them and store in the local vectors, then have the spans point to them. + for(uint64_t i=0; i<2; i++) { + computeLowFrequencyMarkers(allMarkerKmerIds[i], 1, lowFrequencyOrdinalsVectors[i]); + lowFrequencyOrdinals[i] = span<uint32_t>(lowFrequencyOrdinalsVectors[i]); + } + } + + + + if(html) { + for(uint64_t i=0; i<2; i++) { + html << "<br>" << (i==0 ? orientedReadId0 : orientedReadId1) << " has " << allMarkerKmerIds[i].size() << + " markers of which " << lowFrequencyOrdinals[i].size() << " are unique." << endl; + } + } + + + + // Find pairs of ordinals in the two oriented reads that correspond to + // the same low frequency k-mers. + class CommonKmerInfo { + public: + uint32_t ordinal0; + uint32_t ordinal1; + KmerId kmerId; + uint64_t rank0 = invalid<uint64_t>; + uint64_t rank1 = invalid<uint64_t>; + uint64_t ordinalSum() const + { + return ordinal0 + ordinal1; + } + int64_t ordinalOffset() const + { + return int64_t(ordinal0) - int64_t(ordinal1); + } + }; + vector<CommonKmerInfo> commonKmerInfos; + + // Joint loop over the ordinals corresponding to low frequency markers. + // They are both sorted by KmerId. 
+ const auto begin0 = lowFrequencyOrdinals[0].begin(); + const auto begin1 = lowFrequencyOrdinals[1].begin(); + const auto end0 = lowFrequencyOrdinals[0].end(); + const auto end1 = lowFrequencyOrdinals[1].end(); + auto it0 = begin0; + auto it1 = begin1; + while((it0 != end0) and (it1 != end1)) { + const uint32_t ordinal0 = *it0; + const uint32_t ordinal1 = *it1; + const KmerId kmerId0 = allMarkerKmerIds[0][ordinal0]; + const KmerId kmerId1 = allMarkerKmerIds[1][ordinal1]; + + if(kmerId0 < kmerId1) { + + // Go past the streak with this KmerId in lowFrequencyOrdinals[0]. + while(it0 != end0 and allMarkerKmerIds[0][*it0] == kmerId0) { + ++it0; + } + + + } else if(kmerId1 < kmerId0) { + + // Go past the streak with this KmerId in lowFrequencyOrdinals[1]. + while(it1 != end1 and allMarkerKmerIds[1][*it1] == kmerId1) { + ++it1; + } + + } else { + + // We found a common low frequency marker k-mer. + SHASTA_ASSERT(kmerId0 == kmerId1); + const KmerId kmerId = kmerId0; + + // Look for the streak with this KmerId in lowFrequencyOrdinals[0]. + auto streakBegin0 = it0; + auto streakEnd0 = it0 + 1; + while(streakEnd0 != end0 and allMarkerKmerIds[0][*streakEnd0] == kmerId) { + ++streakEnd0; + } + + // Look for the streak with this KmerId in lowFrequencyOrdinals[1]. + auto streakBegin1 = it1; + auto streakEnd1 = it1 + 1; + while(streakEnd1 != end1 and allMarkerKmerIds[1][*streakEnd1] == kmerId) { + ++streakEnd1; + } + + // Look over pairs of markers in these streaks. + for(auto jt0=streakBegin0; jt0!=streakEnd0; jt0++) { + for(auto jt1=streakBegin1; jt1!=streakEnd1; jt1++) { + commonKmerInfos.push_back({*jt0, *jt1, kmerId}); + } + } + + // Point to the next marker in lowFrequencyOrdinals[0] and lowFrequencyOrdinals[1]. + it0 = streakEnd0; + it1 = streakEnd1; + } + } + + + + // Write the common unique markers. + if(html) { + html << "<h3>Common unique markers</h3>"; + html << "There are " << commonKmerInfos.size() << " common unique markers." 
<< endl; + html << "<p><table>" + "<tr><th>Ordinal0<th>Ordinal1<th>Ordinal<br>offset<th>Ordinal<br>sum<th>KmerId<th>Kmer"; + const uint64_t k = assemblerInfo->k; + for(const CommonKmerInfo& commonKmerInfo: commonKmerInfos) { + const Kmer kmer(commonKmerInfo.kmerId, k); + html << "<tr>" + "<td class=centered>" << commonKmerInfo.ordinal0 << + "<td class=centered>" << commonKmerInfo.ordinal1 << + "<td class=centered>" << commonKmerInfo.ordinalOffset() << + "<td class=centered>" << commonKmerInfo.ordinalSum(); + + // Write the KmerId in hex with the appropriate number of digits. + const char oldFill = html.fill('0'); + html << "<td class=centered style='font-family:monospace'>" << + std::hex << std::setw(int(k/2)) << commonKmerInfo.kmerId << std::dec; + html.fill(oldFill); + + // Write the Kmer. + html << "<td class=centered style='font-family:monospace'>"; + kmer.write(html, k); + } + html << "</table>"; + } + + + + // Create a histogram of ordinal offsets for the common unique markers. + std::map<int64_t, uint64_t> histogramMap; + for(const CommonKmerInfo& commonKmerInfo: commonKmerInfos) { + const int64_t offset = commonKmerInfo.ordinalOffset(); + auto it = histogramMap.find(offset); + if(it == histogramMap.end()) { + histogramMap.insert({offset, 1}); + } else { + ++it->second; + } + } + vector< pair<int64_t, uint64_t> > histogram; + copy(histogramMap.begin(), histogramMap.end(), back_inserter(histogram)); + if(html) { + html << "<h3>Histogram of ordinal offsets for the common unique markers</h3>" + "<table>" + "<tr><th>Ordinal<br>offset<th>Frequency"; + for(const auto& p: histogram) { + html << "<tr>" + "<td class=centered>" << p.first << + "<td class=centered>" << p.second; + } + html << "</table>"; + } + + + + // Find clusters of ordinal offsets. 
+ class Cluster { + public: + int64_t firstOffset; + int64_t lastOffset; + uint64_t uniqueMarkerCount; + }; + vector<Cluster> clusters; + const uint64_t minMarkerCount = min(allMarkerKmerIds[0].size(), allMarkerKmerIds[1].size()); + const int64_t offsetDeltaTolerance = int64_t(std::round(driftRateTolerance * double(minMarkerCount))); + for(uint64_t i=0; i<histogram.size(); /* Increment later */) { + Cluster cluster; + const uint64_t firstOffsetIndexInHistogram = i; + cluster.firstOffset = histogram[firstOffsetIndexInHistogram].first; + for(++i; i < histogram.size(); ++i) { + if(histogram[i].first > histogram[i-1].first + offsetDeltaTolerance) { + break; + } + } + const uint64_t lastOffsetIndexInHistogram = i-1; + cluster.lastOffset = histogram[lastOffsetIndexInHistogram].first; + cluster.uniqueMarkerCount = 0; + for(uint64_t j=firstOffsetIndexInHistogram; j<=lastOffsetIndexInHistogram; j++) { + cluster.uniqueMarkerCount += histogram[j].second; + } + clusters.push_back(cluster); + } + + // Find the largest cluster. + uint64_t largestClusterIndex = invalid<uint64_t>; + uint64_t largestClusterSize = 0; + for(uint64_t i=0; i<clusters.size(); i++) { + const uint64_t clusterSize = clusters[i].uniqueMarkerCount; + if(clusterSize > largestClusterSize) { + largestClusterSize = clusterSize; + largestClusterIndex = i; + } + } + const Cluster& largestCluster = clusters[largestClusterIndex]; + + // Write the clusters. 
+ if(html) { + html << "<h3>Ordinal offset clusters</h3>"; + html << "<p>Ordinal offset clusters were computed using offset tolerance " << offsetDeltaTolerance; + html << "<table><tr><th>First<br>offset<th>Last<br>offset<th>Size"; + for(uint64_t i=0; i<clusters.size(); i++) { + const Cluster& cluster = clusters[i]; + html << "<tr"; + if(i == largestClusterIndex) { + html << " style='background-color:pink'"; + } + html << ">" + "<td class=centered>" << cluster.firstOffset << + "<td class=centered>" << cluster.lastOffset << + "<td class=centered>" << cluster.uniqueMarkerCount; + } + html << "</table>"; + } + + + + // The active markers are the common unique markers on the largest cluster. + // These are the ones that will be used to compute the alignment. + vector<CommonKmerInfo> activeKmerInfos; + for(const CommonKmerInfo& commonKmerInfo: commonKmerInfos) { + const int64_t offset = commonKmerInfo.ordinalOffset(); + if(offset >= largestCluster.firstOffset and offset <= largestCluster.lastOffset) { + activeKmerInfos.push_back(commonKmerInfo); + } + } + + + + // Fill in the ordinal ranks. + std::ranges::sort(activeKmerInfos, std::ranges::less(), &CommonKmerInfo::ordinal0); + for(uint64_t rank=0; rank<activeKmerInfos.size(); rank++) { + activeKmerInfos[rank].rank0 = rank; + } + std::ranges::sort(activeKmerInfos, std::ranges::less(), &CommonKmerInfo::ordinal1); + for(uint64_t rank=0; rank<activeKmerInfos.size(); rank++) { + activeKmerInfos[rank].rank1 = rank; + } + + + // If there are any markers that don't have the same rank, remove them. + { + vector<CommonKmerInfo> newActiveKmerInfos; + for(const CommonKmerInfo& commonKmerInfo: activeKmerInfos) { + if(commonKmerInfo.rank0 == commonKmerInfo.rank1) { + newActiveKmerInfos.push_back(commonKmerInfo); + } + } + activeKmerInfos.swap(newActiveKmerInfos); + + } + + + + // Sort them by ordinalSum. 
+ class OrderByOrdinalSum { + public: + bool operator()(const CommonKmerInfo& x, const CommonKmerInfo& y) const + { + return x.ordinalSum() < y.ordinalSum(); + } + }; + sort(activeKmerInfos.begin(), activeKmerInfos.end(), OrderByOrdinalSum()); + + + + // Write the active markers we kept. + if(html) { + html << "<h3>Active common unique markers</h3>"; + html << "There are " << activeKmerInfos.size() << " active common unique markers, " + "shown in the table sorted by ordinal sum." + "<p><table>" + "<tr><th>Ordinal0<th>Ordinal1<th>Ordinal<br>offset<th>Ordinal<br>sum<th>Rank0<th>Rank1<th>KmerId<th>Kmer"; + const uint64_t k = assemblerInfo->k; + for(const CommonKmerInfo& commonKmerInfo: activeKmerInfos) { + const Kmer kmer(commonKmerInfo.kmerId, k); + html << "<tr>" + "<td class=centered>" << commonKmerInfo.ordinal0 << + "<td class=centered>" << commonKmerInfo.ordinal1 << + "<td class=centered>" << commonKmerInfo.ordinalOffset() << + "<td class=centered>" << commonKmerInfo.ordinalSum() << + "<td class=centered>" << commonKmerInfo.rank0 << + "<td class=centered>" << commonKmerInfo.rank1; + + // Write the KmerId in hex with the appropriate number of digits. + const char oldFill = html.fill('0'); + html << "<td class=centered style='font-family:monospace'>" << + std::hex << std::setw(int(k/2)) << commonKmerInfo.kmerId << std::dec; + html.fill(oldFill); + + // Write the Kmer. + html << "<td class=centered style='font-family:monospace'>"; + kmer.write(html, k); + } + html << "</table>"; + } + + + + // We should remove common unique markers that have a different rank + // in the two oriented reads. This does not happen frequently and + // for now just check for them. 
+ for(const CommonKmerInfo& commonKmerInfo: activeKmerInfos) { + SHASTA_ASSERT(commonKmerInfo.rank0 == commonKmerInfo.rank1); + } + + + if(activeKmerInfos.size() < 2) { + alignment.clear(); + alignmentInfo.create(alignment, uint32_t(allMarkerKmerIds[0].size()), uint32_t(allMarkerKmerIds[1].size())); + alignmentInfo.uniquenessMetric = 0.; + return; + } + + + + // Create the alignment by stitching together alignments computed + // between each pair of consecutive unique k-mers that survived + // the above process (the "active" markers). + alignment.clear(); + SHASTA_ASSERT(activeKmerInfos.size() > 1); + + // First, do an alignment between the beginning and the + // first active unique marker. + // This alignment is constrained on the right only. + { + const CommonKmerInfo& firstCommonKmerInfo = activeKmerInfos.front(); + const uint32_t ordinalB0 = firstCommonKmerInfo.ordinal0; + const uint32_t ordinalB1 = firstCommonKmerInfo.ordinal1; + if(ordinalB0 > 0 and ordinalB1 > 0) { + const span<const KmerId> kmerIds0(&allMarkerKmerIds[0][0], &allMarkerKmerIds[0][ordinalB0]); + const span<const KmerId> kmerIds1(&allMarkerKmerIds[1][0], &allMarkerKmerIds[1][ordinalB1]); + + // Compute the band. 
+ int64_t bandMin = int64_t(ordinalB0) - int64_t(ordinalB1); + int64_t bandMax = bandMin; + const uint64_t totalBandExtend = minBandExtend + + uint64_t(std::round(0.5 * driftRateTolerance * double(min(ordinalB0, ordinalB1)))); + bandMin -= int64_t(totalBandExtend); + bandMax += int64_t(totalBandExtend); + + if(html) { + html << "<br>Initial step: alignment lengths " << kmerIds0.size() << " " << kmerIds1.size() << + ", band " << bandMin << " " << bandMax; + } + + vector< pair<bool, bool> > seqanAlignment; + seqanAlign( + kmerIds0.begin(), kmerIds0.end(), + kmerIds1.begin(), kmerIds1.end(), + matchScore, mismatchScore, gapScore, + bandMin, bandMax, + true, false, // Free on left + seqanAlignment); + uint32_t ordinal0 = 0; + uint32_t ordinal1 = 0; + for(const auto& p: seqanAlignment) { + if(p.first and p.second and allMarkerKmerIds[0][ordinal0] == allMarkerKmerIds[1][ordinal1]) { + alignment.ordinals.push_back({ordinal0, ordinal1}); + } + if(p.first) { + ++ordinal0; + } + if(p.second) { + ++ordinal1; + } + } + SHASTA_ASSERT(ordinal0 == ordinalB0); + SHASTA_ASSERT(ordinal1 == ordinalB1); + } + } + + + for(uint64_t step=1; step<activeKmerInfos.size(); step++) { + const CommonKmerInfo& commonKmerInfoA = activeKmerInfos[step-1]; + const CommonKmerInfo& commonKmerInfoB = activeKmerInfos[step]; + SHASTA_ASSERT(commonKmerInfoB.rank0 > commonKmerInfoA.rank0); + SHASTA_ASSERT(commonKmerInfoB.rank1 > commonKmerInfoA.rank1); + + const uint32_t ordinalA0 = commonKmerInfoA.ordinal0; + const uint32_t ordinalA1 = commonKmerInfoA.ordinal1; + const uint32_t ordinalB0 = commonKmerInfoB.ordinal0; + const uint32_t ordinalB1 = commonKmerInfoB.ordinal1; + + // Get the KmerIds between A and B for the two reads. + // These are the Kmers that we will align in this step. 
+ const span<const KmerId> kmerIds0(&allMarkerKmerIds[0][ordinalA0 + 1], &allMarkerKmerIds[0][ordinalB0]); + const span<const KmerId> kmerIds1(&allMarkerKmerIds[1][ordinalA1 +1 ], &allMarkerKmerIds[1][ordinalB1]); + + // Add to the alignment the first marker of this step. + alignment.ordinals.push_back({commonKmerInfoA.ordinal0, commonKmerInfoA.ordinal1}); + + // If there is nothing to align, we are done for this step, + if(kmerIds0.empty() or kmerIds1.empty()) { + continue; + } + + + // Use seqan to compute the alignment for this step. + // This alignment is constrained on both sides and banded. + + // Compute the band. + int64_t bandMin, bandMax; + if(kmerIds0.size() <= kmerIds1.size()) { + bandMin = -int64_t(kmerIds1.size() - kmerIds0.size()); + bandMax = 0; + } else { + bandMin = 0; + bandMax = int64_t(kmerIds0.size() - kmerIds1.size()); + } + const uint64_t totalBandExtend = minBandExtend + + uint64_t(std::round(0.5 * driftRateTolerance * double(min(kmerIds0.size(), kmerIds1.size())))); + bandMin -= int64_t(totalBandExtend); + bandMax += int64_t(totalBandExtend); + + if(html) { + html << "<br>Step " << step << " alignment lengths " << kmerIds0.size() << " " << kmerIds1.size() << + ", band " << bandMin << " " << bandMax; + } + + // Do the banded alignment. + vector< pair<bool, bool> > seqanAlignment; + const int64_t alignmentScore = seqanAlign( + kmerIds0.begin(), kmerIds0.end(), + kmerIds1.begin(), kmerIds1.end(), + matchScore, mismatchScore, gapScore, + bandMin, bandMax, + false, false, + seqanAlignment); + if(html) { + html << "<br>Alignment score " << alignmentScore; + } + + // Add to the alignment the ordinals of matching alignment positions. 
+ uint32_t ordinal0 = ordinalA0 + 1; + uint32_t ordinal1 = ordinalA1 + 1; + for(const auto& p: seqanAlignment) { + if(p.first and p.second and allMarkerKmerIds[0][ordinal0] == allMarkerKmerIds[1][ordinal1]) { + alignment.ordinals.push_back({ordinal0, ordinal1}); + } + if(p.first) { + ++ordinal0; + } + if(p.second) { + ++ordinal1; + } + } + SHASTA_ASSERT(ordinal0 == ordinalB0); + SHASTA_ASSERT(ordinal1 == ordinalB1); + } + + // Add the last active marker. + const CommonKmerInfo& lastCommonKmerInfo = activeKmerInfos.back(); + alignment.ordinals.push_back({lastCommonKmerInfo.ordinal0, lastCommonKmerInfo.ordinal1}); + + + + // Do an alignment between the last active unique marker and the end. + // This alignment is constrained on the left only. + { + const CommonKmerInfo& lastCommonKmerInfo = activeKmerInfos.back(); + const uint32_t ordinalA0 = lastCommonKmerInfo.ordinal0 + 1; + const uint32_t ordinalA1 = lastCommonKmerInfo.ordinal1 + 1; + const uint32_t ordinalB0 = uint32_t(allMarkerKmerIds[0].size()); + const uint32_t ordinalB1 = uint32_t(allMarkerKmerIds[1].size()); + if( ordinalA0 < ordinalB0 and ordinalA1 < ordinalB1) { + const span<const KmerId> kmerIds0(&allMarkerKmerIds[0][ordinalA0], &allMarkerKmerIds[0][ordinalB0]); + const span<const KmerId> kmerIds1(&allMarkerKmerIds[1][ordinalA1], &allMarkerKmerIds[1][ordinalB1]); + + // Compute the band. 
+ int64_t bandMin = 0; + int64_t bandMax = 0; + const uint64_t totalBandExtend = minBandExtend + + uint64_t(std::round(0.5 * driftRateTolerance * double(min(kmerIds0.size(), kmerIds1.size())))); + bandMin -= int64_t(totalBandExtend); + bandMax += int64_t(totalBandExtend); + + if(html) { + html << "<br>Final step: alignment lengths " << kmerIds0.size() << " " << kmerIds1.size() << + ", band " << bandMin << " " << bandMax; + } + + vector< pair<bool, bool> > seqanAlignment; + seqanAlign( + kmerIds0.begin(), kmerIds0.end(), + kmerIds1.begin(), kmerIds1.end(), + matchScore, mismatchScore, gapScore, + bandMin, bandMax, + false, true, // Free on right + seqanAlignment); + uint32_t ordinal0 = ordinalA0; + uint32_t ordinal1 = ordinalA1; + for(const auto& p: seqanAlignment) { + if(p.first and p.second and allMarkerKmerIds[0][ordinal0] == allMarkerKmerIds[1][ordinal1]) { + alignment.ordinals.push_back({ordinal0, ordinal1}); + } + if(p.first) { + ++ordinal0; + } + if(p.second) { + ++ordinal1; + } + } + SHASTA_ASSERT(ordinal0 == ordinalB0); + SHASTA_ASSERT(ordinal1 == ordinalB1); + } + } + + + // Compute the uniqueness metric defined as k/(2*sqrt(n)) + // where k is the number of active markers and + // n is the number of common unique markers + // IN THE OVERLAP REGION ONLY. 
+ float uniquenessMetric = 0; + { + const uint64_t k = activeKmerInfos.size(); + + const array<uint32_t, 2>& alignmentOrdinalsFirst = alignment.ordinals.front(); + const array<uint32_t, 2>& alignmentOrdinalsLast = alignment.ordinals.back(); + const uint32_t alignmentOrdinalFirst0 = alignmentOrdinalsFirst[0]; + const uint32_t alignmentOrdinalFirst1 = alignmentOrdinalsFirst[1]; + const uint32_t alignmentOrdinalLast0 = alignmentOrdinalsLast[0]; + const uint32_t alignmentOrdinalLast1 = alignmentOrdinalsLast[1]; + uint64_t n = 0; + for(const CommonKmerInfo& commonKmerInfo: commonKmerInfos) { + if( + commonKmerInfo.ordinal0 >= alignmentOrdinalFirst0 and + commonKmerInfo.ordinal0 <= alignmentOrdinalLast0 and + commonKmerInfo.ordinal1 >= alignmentOrdinalFirst1 and + commonKmerInfo.ordinal1 <= alignmentOrdinalLast1 + ) { + ++n; + } + } + + uniquenessMetric = float(double(k) / (2. * sqrt(double(n)))); + } + + + // Store the alignment info. + alignmentInfo.create(alignment, uint32_t(allMarkerKmerIds[0].size()), uint32_t(allMarkerKmerIds[1].size())); + alignmentInfo.uniquenessMetric = uniquenessMetric; + +} + + + +void Assembler::computeLowFrequencyMarkers( + uint64_t maxMarkerFrequency, + uint64_t threadCount) +{ + // Check that we have what we need. + SHASTA_ASSERT(markerKmerIds.isOpen()); + + // Get the number of reads. + const uint64_t readCount = getReads().readCount(); + + // Adjust the number of threads, if necessary. + if(threadCount == 0) { + threadCount = std::thread::hardware_concurrency(); + } + + // Store the maxMarkerFrequency so all threads can see it. + computeLowFrequencyMarkersData.maxMarkerFrequency = maxMarkerFrequency; + + // Initialize the low frequency markers. + lowFrequencyMarkers.createNew(largeDataName("LowFrequencyMarkers"), largeDataPageSize); + + // Pass 1 just counts the number of low frequency markers for each oriented read. 
+ const uint64_t batchSize = 1; + lowFrequencyMarkers.beginPass1(2 * readCount); + setupLoadBalancing(readCount, batchSize); + runThreads(&Assembler::computeLowFrequencyMarkersThreadFunctionPass1, threadCount); + + // Pass 2 stores the low frequency markers for each oriented read. + setupLoadBalancing(getReads().readCount(), batchSize); + lowFrequencyMarkers.beginPass2(); + runThreads(&Assembler::computeLowFrequencyMarkersThreadFunctionPass2, threadCount); + lowFrequencyMarkers.endPass2(false, true); +} + + + +void Assembler::computeLowFrequencyMarkersThreadFunctionPass1(uint64_t threadId) +{ + computeLowFrequencyMarkersThreadFunctionPass12(1); +} +void Assembler::computeLowFrequencyMarkersThreadFunctionPass2(uint64_t threadId) +{ + computeLowFrequencyMarkersThreadFunctionPass12(2); +} +void Assembler::computeLowFrequencyMarkersThreadFunctionPass12(uint64_t pass) +{ + const uint64_t maxMarkerFrequency = computeLowFrequencyMarkersData.maxMarkerFrequency; + vector<uint32_t> lowFrequencyOrdinals; + + // Loop over all batches assigned to this thread. + uint64_t begin, end; + while(getNextBatch(begin, end)) { + + // Loop over oriented reads in this batch. + for(uint32_t readId=ReadId(begin); readId!=ReadId(end); ++readId) { + for(uint32_t strand=0; strand<2; strand++) { + const OrientedReadId orientedReadId(readId, strand); + + // Compute the low frequency markers. + computeLowFrequencyMarkers( + markerKmerIds[orientedReadId.getValue()], + maxMarkerFrequency, + lowFrequencyOrdinals); + + if(pass == 1) { + // Just make space for them. + lowFrequencyMarkers.incrementCountMultithreaded( + orientedReadId.getValue(), + lowFrequencyOrdinals.size()); + } else { + // Store them. + copy(lowFrequencyOrdinals.begin(), lowFrequencyOrdinals.end(), + lowFrequencyMarkers.begin(orientedReadId.getValue())); + } + } + } + } +} + + + +// Compute low frequency markers for a single oriented read. 
+// On return, the lowFrequencyOrdinals vector contains the ordinals corresponding +// to low frequency markers, sorted by KmerId. +// Low frequency markers are the ones that occur up to maxMarkerFrequency +// times on the oriented read. +void Assembler::computeLowFrequencyMarkers( + const span<const KmerId>& kmerIds, // The marker KmerIds for the oriented read, sorted by ordinal + uint64_t maxMarkerFrequency, + vector<uint32_t>& lowFrequencyOrdinals) // The ordinals of the low frequency markers, sorted by KmerId +{ + + // Create a vector of ordinals, sorted by ordinal. + const uint64_t markerCount = kmerIds.size(); + vector<uint32_t> allOrdinals(markerCount); + std::iota(allOrdinals.begin(), allOrdinals.end(), uint32_t(0)); + + // Now sort them by KmerId. + class SortHelper { + public: + SortHelper(const span<const KmerId>& kmerIds) : kmerIds(kmerIds) {} + bool operator()(uint32_t ordinal0, uint32_t ordinal1) const + { + return kmerIds[ordinal0] < kmerIds[ordinal1]; + } + private: + const span<const KmerId>& kmerIds; + }; + sort(allOrdinals.begin(), allOrdinals.end(), SortHelper(kmerIds)); + + + + // Loop over streaks with the same KmerId. + lowFrequencyOrdinals.clear(); + for(uint64_t streakBegin=0; streakBegin<markerCount; /* Increment later */) { + const KmerId kmerId = kmerIds[allOrdinals[streakBegin]]; + + // Find the streak with this KmerId. + uint64_t streakEnd = streakBegin + 1; + while(true) { + if(streakEnd == markerCount or kmerIds[allOrdinals[streakEnd]] != kmerId) { + break; + } + ++streakEnd; + } + const uint64_t streakLength = streakEnd - streakBegin; + + // If short enough, copy to the low frequency ordinals. + if(streakLength <= maxMarkerFrequency) { + copy(allOrdinals.begin() + streakBegin, allOrdinals.begin() + streakEnd, + back_inserter(lowFrequencyOrdinals)); + } + + // Prepare to process the next streak. 
+ streakBegin = streakEnd; + } +} diff --git a/src/AssemblerAnalyzePaths.cpp b/src/AssemblerAnalyzePaths.cpp index 4989eab..79d164e 100644 --- a/src/AssemblerAnalyzePaths.cpp +++ b/src/AssemblerAnalyzePaths.cpp @@ -6,6 +6,7 @@ // Shasta. #include "seqan.hpp" using namespace shasta; +using namespace mode0; // Standard library. #include "array.hpp" @@ -139,131 +140,3 @@ void Assembler::getPseudoPathSegments( segmentIds.push_back(pseudoPathEntry.segmentId); } } - - - -void Assembler::alignPseudoPaths( - ReadId readId0, Strand strand0, - ReadId readId1, Strand strand1) -{ - using SegmentId = AssemblyGraph::EdgeId; - const AssemblyGraph& assemblyGraph = *assemblyGraphPointer; - - // Parameters that control the process below. EXPOSE WHEN CODE STABILIZES. ********* - const int matchScore = 1; - const int mismatchScore = -1; - const int gapScore = -1; - - // Gather the oriented read ids. - const array<OrientedReadId, 2> orientedReadIds = - {OrientedReadId(readId0, strand0), OrientedReadId(readId1, strand1)}; - cout << "Aligning pseudo-paths of " << orientedReadIds[0] << - " and " << orientedReadIds[1] << endl; - - - // Compute the two pseudo-paths. - vector<MarkerGraph::EdgeId> path; - vector< pair<uint32_t, uint32_t> > pathOrdinals; - PseudoPath pseudoPath; - array<vector<SegmentId>, 2> pseudoPathSegments; - for(uint64_t i=0; i<2; i++) { - computePseudoPath(orientedReadIds[i], path, pathOrdinals, - pseudoPath); - getPseudoPathSegments(pseudoPath, pseudoPathSegments[i]); - cout << "The pseudo-path of " << orientedReadIds[i] << - " has " << pseudoPathSegments[i].size() << " segments." << endl; - } - - // Align them. 
- vector< pair<bool, bool> > alignment; - const uint64_t alignmentScore = shasta::seqanAlign( - pseudoPathSegments[0].begin(), pseudoPathSegments[0].end(), - pseudoPathSegments[1].begin(), pseudoPathSegments[1].end(), - matchScore, - mismatchScore, - gapScore, - true, true, - alignment); - cout << "Alignment score " << alignmentScore << endl; - cout << "Alignment length " << alignment.size() << endl; - - - - // Write out the alignment. - uint64_t position0 = 0; - uint64_t position1 = 0; - uint64_t weakMatchCount =0; - uint64_t strongMatchCount =0; - uint64_t mismatchCount =0; - uint64_t gapCount =0; - uint64_t leftUnalignedCount =0; - uint64_t rightUnalignedCount =0; - ofstream csv("PseudoPathsAlignment.csv"); - for(const auto& p: alignment) { - if(p.first) { - const SegmentId segment0 = pseudoPathSegments[0][position0]; - csv << segment0; - } - csv << ","; - if(p.second) { - const SegmentId segment1 = pseudoPathSegments[1][position1]; - csv << segment1; - } - csv << ","; - - // Write an annotation column. - if(p.first and p.second) { - if(pseudoPathSegments[0][position0] != pseudoPathSegments[1][position1]) { - csv << "Mismatch"; - ++mismatchCount; - } else { - // Match. - // Decide if it is a strong or weak match. 
- const SegmentId segmentId = pseudoPathSegments[0][position0]; - const AssemblyGraph::Edge& edge = assemblyGraph.edges[segmentId]; - const AssemblyGraph::VertexId v0 = edge.source; - const AssemblyGraph::VertexId v1 = edge.target; - const auto out0 = assemblyGraph.outDegree(v0); - const auto in1 = assemblyGraph.inDegree(v1); - if(out0==1 and in1==1) { - csv << "Weak match"; - ++weakMatchCount; - } else { - csv << "Strong match"; - ++strongMatchCount; - } - } - } else if(position0 == 0 or position1==0) { - csv << "Left unaligned portion"; - ++leftUnalignedCount; - } else if( - position0 == pseudoPathSegments[0].size() or - position1 == pseudoPathSegments[1].size()) { - csv << "Right unaligned portion"; - ++rightUnalignedCount; - } else if(not (p.first and p.second)) { - csv << "Gap"; - ++gapCount; - } - csv << "\n"; - - if(p.first) { - ++position0; - } - if(p.second) { - ++position1; - } - } - SHASTA_ASSERT(position0 == pseudoPathSegments[0].size()); - SHASTA_ASSERT(position1 == pseudoPathSegments[1].size()); - - const uint64_t matchCount = weakMatchCount + strongMatchCount; - cout << "Total match "<< matchCount << endl; - cout << "Strong match "<< strongMatchCount << endl; - cout << "Weak match "<< weakMatchCount << endl; - cout << "Mismatch "<< mismatchCount << endl; - cout << "Gap "<< gapCount << endl; - cout << "Left unaligned "<< leftUnalignedCount << endl; - cout << "Right unaligned "<< rightUnalignedCount << endl; - cout << "Mismatch/match ratio " << double(mismatchCount)/double(matchCount) << endl; -} diff --git a/src/AssemblerAssemblyGraph.cpp b/src/AssemblerAssemblyGraph.cpp index b21f16f..091aeee 100644 --- a/src/AssemblerAssemblyGraph.cpp +++ b/src/AssemblerAssemblyGraph.cpp @@ -9,6 +9,7 @@ #include "Reads.hpp" #include "timestamp.hpp" using namespace shasta; +using namespace mode0; // Boost libraries. 
#include <boost/graph/iteration_macros.hpp> @@ -37,6 +38,7 @@ using namespace shasta; // - assemblyGraph.markerToAssemblyTable void Assembler::createAssemblyGraphEdges() { + // Some shorthands. // using VertexId = AssemblyGraph::VertexId; using EdgeId = AssemblyGraph::EdgeId; @@ -87,6 +89,7 @@ void Assembler::createAssemblyGraphEdges() if(debug) { cout << "Working on start edge " << startEdgeId; cout << " " << startEdge.source << "->" << startEdge.target << endl; + startEdge.writeFlags(cout); } // If this edge is not part of cleaned up marker graph, skip it. @@ -153,6 +156,19 @@ void Assembler::createAssemblyGraphEdges() } std::reverse(reverseComplementedChain.begin(), reverseComplementedChain.end()); + if(debug) { + cout << "Chain:"; + for(const auto edgeId: chain) { + cout << " " << edgeId; + } + cout << endl; + cout << "Reverse complemented chain:"; + for(const auto edgeId: reverseComplementedChain) { + cout << " " << edgeId; + } + cout << endl; + } + // Figure out if the reverse complemented chain is the same @@ -262,6 +278,7 @@ void Assembler::createAssemblyGraphEdges() } } #endif + } @@ -402,12 +419,12 @@ void Assembler::createAssemblyGraphVertices() for(uint64_t i=0; i<chain.size(); i++) { const MarkerGraph::EdgeId markerGraphEdgeId = chain[i]; const MarkerGraph::Edge& markerGraphEdge = markerGraph.edges[markerGraphEdgeId]; - const uint32_t edgeCoverage = markerGraphEdge.coverage; + const uint64_t edgeCoverage = markerGraph.edgeCoverage(markerGraphEdgeId); edgeCoverageSum += edgeCoverage; assemblyGraphEdge.minEdgeCoverage = - min(assemblyGraphEdge.minEdgeCoverage, edgeCoverage); + min(assemblyGraphEdge.minEdgeCoverage, uint32_t(edgeCoverage)); assemblyGraphEdge.maxEdgeCoverage = - max(assemblyGraphEdge.maxEdgeCoverage, edgeCoverage); + max(assemblyGraphEdge.maxEdgeCoverage, uint32_t(edgeCoverage)); if(i != 0) { const MarkerGraph::EdgeId markerGraphVertexId = markerGraphEdge.source; @@ -460,6 +477,24 @@ void Assembler::removeLowCoverageCrossEdges(uint32_t 
crossEdgeCoverageThreshold) SHASTA_ASSERT(assemblyGraphPointer); AssemblyGraph& assemblyGraph = *assemblyGraphPointer; +#if 0 + // Sanity check on assembly graph edges. + for(AssemblyGraph::EdgeId edgeId=0; edgeId!=assemblyGraph.edges.size(); edgeId++) { + const AssemblyGraph::EdgeId edgeIdRc = assemblyGraph.reverseComplementEdge[edgeId]; + SHASTA_ASSERT(assemblyGraph.reverseComplementEdge[edgeIdRc] == edgeId); + const auto markerGraphEdges = assemblyGraph.edgeLists[edgeId]; + const auto markerGraphEdgesRc = assemblyGraph.edgeLists[edgeIdRc]; + const uint64_t n = markerGraphEdges.size(); + SHASTA_ASSERT(markerGraphEdgesRc.size() == n); + for(uint64_t i=0; i<n; i++) { + const MarkerGraphEdgeId markerGraphEdgeId = markerGraphEdges[i]; + const MarkerGraphEdgeId markerGraphEdgeIdRc = markerGraphEdgesRc[n - 1 - i]; + SHASTA_ASSERT(markerGraph.reverseComplementEdge[markerGraphEdgeId] == markerGraphEdgeIdRc); + SHASTA_ASSERT(markerGraph.reverseComplementEdge[markerGraphEdgeIdRc] == markerGraphEdgeId); + } + } +#endif + // We want to process edges in order of increasing coverage. // Gather edges by coverage. vector< vector<AssemblyGraph::EdgeId> > edgesByCoverage(crossEdgeCoverageThreshold+1); @@ -473,7 +508,7 @@ void Assembler::removeLowCoverageCrossEdges(uint32_t crossEdgeCoverageThreshold) const bool debug = false; ofstream out; if(debug) { - out.open("LowCoverageCrossEdges.csv"); + out.open("LowCoverageCrossEdges.txt"); } // Process assembly graph edges in order of increasing coverage. @@ -482,6 +517,10 @@ void Assembler::removeLowCoverageCrossEdges(uint32_t crossEdgeCoverageThreshold) for(const vector<AssemblyGraph::EdgeId>& edges: edgesByCoverage) { for(const AssemblyGraph::EdgeId edgeId: edges) { AssemblyGraph::Edge& edge = assemblyGraph.edges[edgeId]; + if(edge.removalReason == AssemblyGraph::Edge::RemovalReason::LowCoverageCrossEdge) { + // Was already marked because it is the reverse complement of another marked edge. 
+ continue; + } const AssemblyGraph::VertexId v0 = edge.source; const AssemblyGraph::VertexId v1 = edge.target; @@ -514,14 +553,46 @@ void Assembler::removeLowCoverageCrossEdges(uint32_t crossEdgeCoverageThreshold) ++removedAssemblyGraphEdgeCount; // Mark the corresponding marker graph edges. + if(debug) { + out << "Assembly graph edge A" << edgeId << " marked as low coverage edge " + "together with its marker graph edges:"; + } for(const MarkerGraph::EdgeId markerGraphEdgeId: assemblyGraph.edgeLists[edgeId]) { markerGraph.edges[markerGraphEdgeId].isLowCoverageCrossEdge = 1; if(debug) { - out << markerGraphEdgeId << "\n"; + out << " M" << markerGraphEdgeId << " "; } ++removedMarkerGraphEdgeCount; } + if(debug) { + out << endl; + } + + // Also mark the reverse complement edge. + // This is necessary to keep the assembly graph and marker graph + // invariant under reverse complementing. + const AssemblyGraph::EdgeId reverseComplementEdgeId = assemblyGraph.reverseComplementEdge[edgeId]; + if(reverseComplementEdgeId != edgeId) { + AssemblyGraph::Edge& reverseComplementEdge = assemblyGraph.edges[reverseComplementEdgeId]; + reverseComplementEdge.removalReason = AssemblyGraph::Edge::RemovalReason::LowCoverageCrossEdge; + ++removedAssemblyGraphEdgeCount; + if(debug) { + out << "Reverse complement assembly graph edge A" << reverseComplementEdgeId << " marked as low coverage edge " + "together with its marker graph edges:"; + } + // Mark the corresponding marker graph edges. + for(const MarkerGraph::EdgeId markerGraphEdgeId: assemblyGraph.edgeLists[reverseComplementEdgeId]) { + markerGraph.edges[markerGraphEdgeId].isLowCoverageCrossEdge = 1; + if(debug) { + out << " M" << markerGraphEdgeId << " "; + } + ++removedMarkerGraphEdgeCount; + } + if(debug) { + out << endl; + } + } } } @@ -651,7 +722,7 @@ void Assembler::assemble( AssemblyGraph& assemblyGraph = *assemblyGraphPointer; // Check that we have what we need. 
- checkKmersAreOpen(); + SHASTA_ASSERT(kmerChecker); reads->checkReadsAreOpen(); checkMarkersAreOpen(); checkMarkerGraphVerticesAreAvailable(); @@ -1785,7 +1856,7 @@ void Assembler::assembleAssemblyGraphEdge( AssembledSegment& assembledSegment) { assembleMarkerGraphPath( - assemblerInfo->readRepresentation, assemblerInfo->k, markers, markerGraph, markerGraphPath, + assemblerInfo->readRepresentation, assemblerInfo->k, *reads, markers, markerGraph, markerGraphPath, storeCoverageData, assembledSegment); } diff --git a/src/AssemblerAssemblyGraph2.cpp b/src/AssemblerAssemblyGraph2.cpp index a6cf6a1..ae94e4a 100644 --- a/src/AssemblerAssemblyGraph2.cpp +++ b/src/AssemblerAssemblyGraph2.cpp @@ -34,6 +34,7 @@ void Assembler::createAssemblyGraph2( assemblerInfo->readRepresentation, assemblerInfo->k, getReads().getFlags(), + getReads(), markers, markerGraph, pruneLength, diff --git a/src/AssemblerCreateReadGraphUsingPseudoPaths.cpp b/src/AssemblerCreateReadGraphUsingPseudoPaths.cpp index 5feb896..27c8174 100644 --- a/src/AssemblerCreateReadGraphUsingPseudoPaths.cpp +++ b/src/AssemblerCreateReadGraphUsingPseudoPaths.cpp @@ -6,15 +6,15 @@ #include "seqan.hpp" #include "timestamp.hpp" using namespace shasta; +using namespace mode0; // Standard library. #include "fstream.hpp" -// This use PseudoPaths to decide which alignments +// This uses PseudoPaths to decide which alignments // should be included in the read graph. -// See Assembler::alignPseudoPaths in AssemblerAnalyzePaths.cpp. void Assembler::createReadGraphUsingPseudoPaths( int64_t matchScore, int64_t mismatchScore, diff --git a/src/AssemblerDetangle.cpp b/src/AssemblerDetangle.cpp index 1759444..d66335c 100644 --- a/src/AssemblerDetangle.cpp +++ b/src/AssemblerDetangle.cpp @@ -5,6 +5,7 @@ #include "performanceLog.hpp" #include "timestamp.hpp" using namespace shasta; +using namespace mode0; // Boost libraries. 
#include <boost/graph/iteration_macros.hpp> diff --git a/src/AssemblerHttpServer-Alignments.cpp b/src/AssemblerHttpServer-Alignments.cpp index 793e409..c15d6c1 100644 --- a/src/AssemblerHttpServer-Alignments.cpp +++ b/src/AssemblerHttpServer-Alignments.cpp @@ -13,6 +13,7 @@ #include "Reads.hpp" #include "ReferenceOverlapMap.hpp" using namespace shasta; +using namespace mode0; // Boost libraries. #include <boost/icl/interval_map.hpp> @@ -1105,6 +1106,12 @@ void Assembler::exploreAlignment( uint64_t align4MaxDistanceFromBoundary = httpServerData.assemblerOptions->alignOptions.align4MaxDistanceFromBoundary; getParameterValue(request, "align4MaxDistanceFromBoundary", align4MaxDistanceFromBoundary); + // Parameters for alignment method 5. + double align5DriftRateTolerance = httpServerData.assemblerOptions->alignOptions.align5DriftRateTolerance; + getParameterValue(request, "align5DriftRateTolerance", align5DriftRateTolerance); + uint64_t align5MinBandExtend = httpServerData.assemblerOptions->alignOptions.align5MinBandExtend; + getParameterValue(request, "align5MinBandExtend", align5MinBandExtend); + string displayMatrixString; bool displayMatrix = getParameterValue(request, "displayMatrix", displayMatrixString); @@ -1114,6 +1121,8 @@ void Assembler::exploreAlignment( getParameterValue(request, "magnifyFactor", magnifyFactor); string displayDetailsString; bool displayDetails = getParameterValue(request, "displayDetails", displayDetailsString); + string displayDebugInfoString; + bool displayDebugInfo = getParameterValue(request, "displayDebugInfo", displayDebugInfoString); // Write the form. @@ -1151,6 +1160,8 @@ void Assembler::exploreAlignment( align4DeltaY, align4MinEntryCountPerCell, align4MaxDistanceFromBoundary, + align5DriftRateTolerance, + align5MinBandExtend, html ); @@ -1163,6 +1174,8 @@ void Assembler::exploreAlignment( "> times." "<br><input type=checkbox name=displayDetails" << (displayDetails ? 
" checked=checked" : "") << "> Display alignment details" + "<br><input type=checkbox name=displayDebugInfo" << (displayDebugInfo ? " checked=checked" : "") << + "> Display debug information" "</form>"; @@ -1229,6 +1242,13 @@ void Assembler::exploreAlignment( mismatchScore, gapScore, alignment, alignmentInfo); + } else if(method == 5) { + ofstream nullStream; + alignOrientedReads5( + orientedReadId0, orientedReadId1, + matchScore, mismatchScore, gapScore, + align5DriftRateTolerance, align5MinBandExtend, + alignment, alignmentInfo, displayDebugInfo ? html : nullStream); } else { SHASTA_ASSERT(0); } @@ -1338,7 +1358,7 @@ void Assembler::exploreAlignment( const auto markers0 = markers[orientedReadId0.getValue()]; const auto markers1 = markers[orientedReadId1.getValue()]; - // Compute the raw position corresponding to each RLE position. + // Compute the positions of each marker in the two oriented reads. const vector<uint32_t> rawPositions0 = reads->getRawPositions(orientedReadId0); const vector<uint32_t> rawPositions1 = reads->getRawPositions(orientedReadId1); @@ -1350,8 +1370,10 @@ void Assembler::exploreAlignment( const auto& marker0 = markers0[ordinal0]; const auto& marker1 = markers1[ordinal1]; - const auto kmerId = marker0.kmerId; - SHASTA_ASSERT(marker1.kmerId == kmerId); + const KmerId kmerId0 = getOrientedReadMarkerKmerId(orientedReadId0, ordinal0); + const KmerId kmerId1 = getOrientedReadMarkerKmerId(orientedReadId1, ordinal1); + SHASTA_ASSERT(kmerId0 == kmerId1); + const KmerId kmerId = kmerId0; const Kmer kmer(kmerId, assemblerInfo->k); const uint32_t rlePosition0 = marker0.position; @@ -1759,6 +1781,8 @@ void Assembler::renderEditableAlignmentConfig( uint64_t align4DeltaY, uint64_t align4MinEntryCountPerCell, uint64_t align4MaxDistanceFromBoundary, + double align5DriftRateTolerance, + uint64_t align5MinBandExtend, ostream& html ) { const auto& descriptions = httpServerData.assemblerOptions->allOptionsDescription; @@ -1775,7 +1799,9 @@ void 
Assembler::renderEditableAlignmentConfig( "<input type=radio name=method value=3" << (method==3 ? " checked=checked" : "") << "> 3 (SeqAn, banded)<br>" "<input type=radio name=method value=4" << - (method==4 ? " checked=checked" : "") << "> 4 (Experimental)" + (method==4 ? " checked=checked" : "") << "> 4 (Experimental)<br>" + "<input type=radio name=method value=5" << + (method==5 ? " checked=checked" : "") << "> 5 (Experimental)" "<td class=smaller>" << descriptions.find("Align.alignMethod", false).description(); html << "<tr><th class=left>maxSkip" @@ -1873,6 +1899,18 @@ void Assembler::renderEditableAlignmentConfig( "<input type=text style='text-align:center;border:none' name=align4MaxDistanceFromBoundary size=16 value=" << align4MaxDistanceFromBoundary << ">" "<td class=smaller>" << descriptions.find("Align.align4.maxDistanceFromBoundary", false).description(); + html << "<tr>" + "<th class=left>align5.driftRateTolerance" + "<td class=centered>" + "<input type=text style='text-align:center;border:none' name=align5DriftRateTolerance size=16 value=" << align5DriftRateTolerance << ">" + "<td class=smaller>" << descriptions.find("Align.align5.driftRateTolerance", false).description(); + + html << "<tr>" + "<th class=left>align5.minBandExtend" + "<td class=centered>" + "<input type=text style='text-align:center;border:none' name=align5MinBandExtend size=16 value=" << align5MinBandExtend << ">" + "<td class=smaller>" << descriptions.find("Align.align5.minBandExtend", false).description(); + html << "</table>"; } @@ -1928,6 +1966,12 @@ void Assembler::computeAllAlignments( computeAllAlignmentsData.align4MaxDistanceFromBoundary = httpServerData.assemblerOptions->alignOptions.align4MaxDistanceFromBoundary; getParameterValue(request, "align4MaxDistanceFromBoundary", computeAllAlignmentsData.align4MaxDistanceFromBoundary); + // Parameters for alignment method 5. 
+ computeAllAlignmentsData.align5DriftRateTolerance = httpServerData.assemblerOptions->alignOptions.align5DriftRateTolerance; + getParameterValue(request, "align5DriftRateTolerance", computeAllAlignmentsData.align5DriftRateTolerance); + computeAllAlignmentsData.align5MinBandExtend = httpServerData.assemblerOptions->alignOptions.align5MinBandExtend; + getParameterValue(request, "align5MinBandExtend", computeAllAlignmentsData.align5MinBandExtend); + // Write the form. html << @@ -1958,6 +2002,8 @@ void Assembler::computeAllAlignments( computeAllAlignmentsData.align4DeltaY, computeAllAlignmentsData.align4MinEntryCountPerCell, computeAllAlignmentsData.align4MaxDistanceFromBoundary, + computeAllAlignmentsData.align5DriftRateTolerance, + computeAllAlignmentsData.align5MinBandExtend, html ); @@ -2386,6 +2432,11 @@ void Assembler::assessAlignments( computeAllAlignmentsData. align4MaxDistanceFromBoundary = httpServerData.assemblerOptions->alignOptions.align4MaxDistanceFromBoundary; getParameterValue(request, "align4MaxDistanceFromBoundary", computeAllAlignmentsData.align4MaxDistanceFromBoundary); + // Parameters for alignment method 5. 
+ computeAllAlignmentsData.align5DriftRateTolerance = httpServerData.assemblerOptions->alignOptions.align5DriftRateTolerance; + getParameterValue(request, "align5DriftRateTolerance", computeAllAlignmentsData.align5DriftRateTolerance); + computeAllAlignmentsData.align5MinBandExtend = httpServerData.assemblerOptions->alignOptions.align5MinBandExtend; + getParameterValue(request, "align5MinBandExtend", computeAllAlignmentsData.align5MinBandExtend); html << "<h1>Alignment statistics</h1>"; @@ -2445,6 +2496,8 @@ void Assembler::assessAlignments( computeAllAlignmentsData.align4DeltaY, computeAllAlignmentsData.align4MinEntryCountPerCell, computeAllAlignmentsData.align4MaxDistanceFromBoundary, + computeAllAlignmentsData.align5DriftRateTolerance, + computeAllAlignmentsData.align5MinBandExtend, html ); @@ -2737,6 +2790,8 @@ void Assembler::computeAllAlignmentsThreadFunction(size_t threadId) const uint64_t align4DeltaY = computeAllAlignmentsData.align4DeltaY; const uint64_t align4MinEntryCountPerCell = computeAllAlignmentsData.align4MinEntryCountPerCell; const uint64_t align4MaxDistanceFromBoundary = computeAllAlignmentsData.align4MaxDistanceFromBoundary; + const double align5DriftRateTolerance = computeAllAlignmentsData.align5DriftRateTolerance; + const uint64_t align5MinBandExtend = computeAllAlignmentsData.align5MinBandExtend; // Vector where this thread will store the alignments it finds. 
vector< pair<OrientedReadId, AlignmentInfo> >& alignments = @@ -2822,6 +2877,13 @@ void Assembler::computeAllAlignmentsThreadFunction(size_t threadId) alignment, alignmentInfo, false); SHASTA_ASSERT(byteAllocator.isEmpty()); + } else if(method == 5) { + ofstream nullStream; + alignOrientedReads5(orientedReadId0, orientedReadId1, + matchScore, mismatchScore, gapScore, + align5DriftRateTolerance, align5MinBandExtend, + alignment, alignmentInfo, + nullStream); } else { SHASTA_ASSERT(0); } diff --git a/src/AssemblerHttpServer-AssemblyGraph.cpp b/src/AssemblerHttpServer-AssemblyGraph.cpp index 14db868..c17de7a 100644 --- a/src/AssemblerHttpServer-AssemblyGraph.cpp +++ b/src/AssemblerHttpServer-AssemblyGraph.cpp @@ -4,6 +4,7 @@ #include "LocalAssemblyGraph.hpp" #include "platformDependent.hpp" using namespace shasta; +using namespace mode0; // Boost libraries. #include <boost/algorithm/string.hpp> diff --git a/src/AssemblerHttpServer-CompressedAssemblyGraph.cpp b/src/AssemblerHttpServer-CompressedAssemblyGraph.cpp index c5abd04..16b8b3c 100644 --- a/src/AssemblerHttpServer-CompressedAssemblyGraph.cpp +++ b/src/AssemblerHttpServer-CompressedAssemblyGraph.cpp @@ -5,6 +5,7 @@ #include "runCommandWithTimeout.hpp" #include "timestamp.hpp" using namespace shasta; +using namespace mode0; // Boost libraries. #include <boost/graph/iteration_macros.hpp> diff --git a/src/AssemblerHttpServer-MarkerGraph.cpp b/src/AssemblerHttpServer-MarkerGraph0.cpp index 1cd654e..d7ede14 100644 --- a/src/AssemblerHttpServer-MarkerGraph.cpp +++ b/src/AssemblerHttpServer-MarkerGraph0.cpp @@ -6,11 +6,12 @@ #include "Coverage.hpp" #include "hsv.hpp" #include "InducedAlignment.hpp" -#include "LocalMarkerGraph.hpp" +#include "LocalMarkerGraph0.hpp" #include "MarkerConnectivityGraph.hpp" #include "MurmurHash2.hpp" #include "platformDependent.hpp" using namespace shasta; +using namespace mode0; // Boost libraries. 
#include <boost/algorithm/string.hpp> @@ -31,13 +32,13 @@ using namespace shasta; -void Assembler::exploreMarkerGraph( +void Assembler::exploreMarkerGraph0( const vector<string>& request, ostream& html) { // Get the request parameters. - LocalMarkerGraphRequestParameters requestParameters; - getLocalMarkerGraphRequestParameters(request, requestParameters); + LocalMarkerGraph0RequestParameters requestParameters; + getLocalMarkerGraph0RequestParameters(request, requestParameters); // Write the form. html << "<h1>Display a local subgraph of the global marker graph</h3>"; @@ -72,12 +73,13 @@ void Assembler::exploreMarkerGraph( // Create the local marker graph. - LocalMarkerGraph graph( + LocalMarkerGraph0 graph( assemblerInfo->readRepresentation, uint32_t(assemblerInfo->k), assemblerInfo->assemblyMode, getReads(), markers, + markerGraph, markerGraph.vertexTable, *consensusCaller); const auto createStartTime = steady_clock::now(); @@ -162,7 +164,7 @@ void Assembler::exploreMarkerGraph( // Color legend for vertices when colored by distance. if(requestParameters.vertexColoring == "byDistance") { html << "<h3>Color legend for vertices</h3>"; - LocalMarkerGraph::writeColorLegendVerticesByDistance(html); + LocalMarkerGraph0::writeColorLegendVerticesByDistance(html); } @@ -308,8 +310,8 @@ void Assembler::exploreMarkerGraph( // Make the vertices clickable: Ctrl-click recenters // the graph at that vertex, right click shows vertex details. html << "<script>\n"; - BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph) { - const LocalMarkerGraphVertex& vertex = graph[v]; + BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph0) { + const LocalMarkerGraph0Vertex& vertex = graph[v]; SHASTA_ASSERT(!vertex.markerInfos.empty()); const string url = requestParameters.urlForVertex(vertex.vertexId); html << @@ -332,10 +334,10 @@ void Assembler::exploreMarkerGraph( // Make the edges clickable: Ctrl-click recenters // the graph at the source vertex of that edge, right click shows edge details. 
html << "<script>\n"; - BGL_FORALL_EDGES(e, graph, LocalMarkerGraph) { - const LocalMarkerGraphEdge& edge = graph[e]; - const LocalMarkerGraph::vertex_descriptor v0 = source(e, graph); - const LocalMarkerGraphVertex& vertex0 = graph[v0]; + BGL_FORALL_EDGES(e, graph, LocalMarkerGraph0) { + const LocalMarkerGraph0Edge& edge = graph[e]; + const LocalMarkerGraph0::vertex_descriptor v0 = source(e, graph); + const LocalMarkerGraph0Vertex& vertex0 = graph[v0]; const string url = requestParameters.urlForVertex(vertex0.vertexId); html << "element = document.getElementById('edge" << edge.edgeId << "');\n" @@ -365,9 +367,9 @@ void Assembler::exploreMarkerGraph( // Extract from the request the parameters for the display // of the local marker graph. -void Assembler::getLocalMarkerGraphRequestParameters( +void Assembler::getLocalMarkerGraph0RequestParameters( const vector<string>& request, - LocalMarkerGraphRequestParameters& parameters) const + LocalMarkerGraph0RequestParameters& parameters) const { parameters.vertexId = 0; parameters.vertexIdIsPresent = getParameterValue( @@ -477,7 +479,7 @@ void Assembler::getLocalMarkerGraphRequestParameters( // highlightedOrientedReads. Each oriented read is assigned a hue // via hashing of the OrientedReadId. This way, an oriented read // is always highlighted in the same color. 
-void LocalMarkerGraphRequestParameters::parseHighlightedOrientedReads() +void LocalMarkerGraph0RequestParameters::parseHighlightedOrientedReads() { highlightedOrientedReads.clear(); if(highlightedOrientedReadsString.empty()) { @@ -501,7 +503,7 @@ void LocalMarkerGraphRequestParameters::parseHighlightedOrientedReads() -void LocalMarkerGraphRequestParameters::writeForm( +void LocalMarkerGraph0RequestParameters::writeForm( ostream& html, MarkerGraph::VertexId vertexCount) const { @@ -719,7 +721,7 @@ void LocalMarkerGraphRequestParameters::writeForm( -bool LocalMarkerGraphRequestParameters::hasMissingRequiredParameters() const +bool LocalMarkerGraph0RequestParameters::hasMissingRequiredParameters() const { return !vertexIdIsPresent || @@ -729,7 +731,7 @@ bool LocalMarkerGraphRequestParameters::hasMissingRequiredParameters() const -string LocalMarkerGraphRequestParameters::vertexScalingFactorString() const +string LocalMarkerGraph0RequestParameters::vertexScalingFactorString() const { if(vertexScalingFactorIsPresent) { std::ostringstream s; @@ -742,7 +744,7 @@ string LocalMarkerGraphRequestParameters::vertexScalingFactorString() const -string LocalMarkerGraphRequestParameters::arrowScalingFactorString() const +string LocalMarkerGraph0RequestParameters::arrowScalingFactorString() const { if(arrowScalingFactorIsPresent) { std::ostringstream s; @@ -755,7 +757,7 @@ string LocalMarkerGraphRequestParameters::arrowScalingFactorString() const -string LocalMarkerGraphRequestParameters::edgeThicknessScalingFactorString() const +string LocalMarkerGraph0RequestParameters::edgeThicknessScalingFactorString() const { if(edgeThicknessScalingFactorIsPresent) { std::ostringstream s; @@ -768,10 +770,10 @@ string LocalMarkerGraphRequestParameters::edgeThicknessScalingFactorString() con -string LocalMarkerGraphRequestParameters::url() const +string LocalMarkerGraph0RequestParameters::url() const { return - string("exploreMarkerGraph") + + string("exploreMarkerGraph0") + "?vertexId=" + 
to_string(vertexId) + "&maxDistance=" + to_string(maxDistance) + "&minVertexCoverage=" + to_string(minVertexCoverage) + @@ -799,16 +801,16 @@ string LocalMarkerGraphRequestParameters::url() const -string LocalMarkerGraphRequestParameters::urlForVertex(uint64_t newVertexId) const +string LocalMarkerGraph0RequestParameters::urlForVertex(uint64_t newVertexId) const { - LocalMarkerGraphRequestParameters newParameters = *this; + LocalMarkerGraph0RequestParameters newParameters = *this; newParameters.vertexId = newVertexId; return newParameters.url(); } -string LocalMarkerGraphRequestParameters::vertexLabelsString() const +string LocalMarkerGraph0RequestParameters::vertexLabelsString() const { switch(vertexLabels) { case 0: return "none"; @@ -820,7 +822,7 @@ string LocalMarkerGraphRequestParameters::vertexLabelsString() const -string LocalMarkerGraphRequestParameters::edgeLabelsString() const +string LocalMarkerGraph0RequestParameters::edgeLabelsString() const { switch(edgeLabels) { case 0: return "none"; @@ -862,7 +864,7 @@ void Assembler::exploreMarkerGraphVertex(const vector<string>& request, ostream& SHASTA_ASSERT(markerCount > 0); // Get the marker sequence. - const KmerId kmerId = markers.begin()[markerIds[0]].kmerId; + const KmerId kmerId = getMarkerGraphVertexKmerId(vertexId); const size_t k = assemblerInfo->k; const Kmer kmer(kmerId, k); @@ -953,7 +955,7 @@ void Assembler::exploreMarkerGraphVertex(const vector<string>& request, ostream& // Page title. const string titleUrl = - "exploreMarkerGraph?vertexId=" + to_string(vertexId) + + "exploreMarkerGraph0?vertexId=" + to_string(vertexId) + "&maxDistance=3" "&useWeakEdges=on" "&usePrunedEdges=on" @@ -1212,7 +1214,7 @@ void Assembler::exploreMarkerGraphEdge(const vector<string>& request, ostream& h // Access the edge. 
const MarkerGraph::Edge& edge = markerGraph.edges[edgeId]; array<MarkerGraph::VertexId, 2> vertexIds = {edge.source, edge.target}; - const size_t markerCount = edge.coverage; + const size_t markerCount = markerGraph.edgeCoverage(edgeId); // The marker intervals of this edge. const span<MarkerInterval> markerIntervals = markerGraph.edgeMarkerIntervals[edgeId]; @@ -1304,7 +1306,7 @@ void Assembler::exploreMarkerGraphEdge(const vector<string>& request, ostream& h // Page title. const string titleUrl = - "exploreMarkerGraph?vertexId=" + to_string(vertexIds[0]) + + "exploreMarkerGraph0?vertexId=" + to_string(vertexIds[0]) + "&maxDistance=3" "&useWeakEdges=on" "&usePrunedEdges=on" @@ -1766,6 +1768,10 @@ void Assembler::exploreMarkerCoverage( const bool readIdIsPresent = getParameterValue(request, "readId", readId); Strand strand = 0; const bool strandIsPresent = getParameterValue(request, "strand", strand); + uint32_t firstOrdinal = 0; + getParameterValue(request, "firstOrdinal", firstOrdinal); + uint32_t lastOrdinal = 0; + getParameterValue(request, "lastOrdinal", lastOrdinal); int width = 600; getParameterValue(request, "width", width); int height = 400; @@ -1781,6 +1787,10 @@ void Assembler::exploreMarkerCoverage( "<tr><td>Strand<td class=centered>"; writeStrandSelection(html, "strand", strandIsPresent && strand==0, strandIsPresent && strand==1); html << + "<tr><td>First ordinal<td class=centered>" + "<input type=text name=firstOrdinal style='text-align:center' size=8 value='" << firstOrdinal << "'>" + "<tr><td>Last ordinal<br>(0 for unlimited)<td class=centered>" + "<input type=text name=lastOrdinal style='text-align:center' size=8 value='" << lastOrdinal << "'>" "<tr><td>Plot width<td class=centered>" "<input type=text name=width style='text-align:center' size=8 value='" << width << "'>" "<tr><td>Plot height<td class=centered>" @@ -1805,7 +1815,11 @@ void Assembler::exploreMarkerCoverage( "plot '-' with points pointtype 7 pointsize 0.5 linecolor rgb '#0000ff' 
notitle\n"; const uint32_t markerCount = uint32_t(markers.size(orientedReadId.getValue())); - for(uint32_t ordinal=0; ordinal<markerCount; ordinal++) { + if(lastOrdinal == 0) { + lastOrdinal = markerCount - 1; + } + SHASTA_ASSERT(lastOrdinal >= firstOrdinal); + for(uint32_t ordinal=firstOrdinal; ordinal<=lastOrdinal; ordinal++) { const MarkerGraph::VertexId vertexId = getGlobalMarkerGraphVertex(orientedReadId, ordinal); if(vertexId == MarkerGraph::invalidCompressedVertexId) { @@ -2085,6 +2099,10 @@ void Assembler::exploreMarkerConnectivity( const bool ordinalIsPresent = getParameterValue(request, "ordinal", ordinal); string whichAlignments = "ReadGraphAlignments"; getParameterValue(request, "whichAlignments", whichAlignments); + string labelsString; + const bool labels = getParameterValue(request, "labels", labelsString); + double timeout = 30; + getParameterValue(request, "timeout", timeout); // Write the form. html << @@ -2107,7 +2125,14 @@ void Assembler::exploreMarkerConnectivity( html << "<br><input type=radio name=whichAlignments value=ReadGraphAlignments" << (whichAlignments=="ReadGraphAlignments" ? " checked=checked" : "") << "> Only use alignments in the read graph."; - html << "</form>"; + html << "<br><input type=checkbox name=labels" << + (labels ? " checked" : "") << + "> Labels" + "<br>Timeout (seconds) for graph layout" + " <input type=text required name=timeout size=8 style='text-align:center'" << + " value='" << timeout << + "'>" + "</form>"; const bool useReadGraphAlignmentsOnly = (whichAlignments == "ReadGraphAlignments"); // If the required parameters are missing, stop here. @@ -2139,6 +2164,10 @@ void Assembler::exploreMarkerConnectivity( ++frequencyMap[orientedReadId]; } + html << "<br>The marker connectivity graph has " << + num_vertices(graph) << " vertices and " << + num_edges(graph) << " edges."; + // Write the graph out in graphviz format. 
const string uuid = to_string(boost::uuids::random_generator()()); @@ -2149,15 +2178,19 @@ void Assembler::exploreMarkerConnectivity( const MarkerDescriptor markerDescriptor = graph[v]; const OrientedReadId orientedReadId1 = markerDescriptor.first; const uint32_t ordinal1 = markerDescriptor.second; - dotFile << "\"" << orientedReadId1 << "-" << ordinal1 << "\"" - " [label=\"" << orientedReadId1 << "\\n" << ordinal1 << - "\""; - if(frequencyMap[orientedReadId1] != 1) { - dotFile << " style=filled fillcolor=pink"; - } else { - dotFile << " style=filled fillcolor=cornsilk"; + dotFile << "\"" << orientedReadId1 << "-" << ordinal1 << "\""; + if(labels) { + dotFile << + " [label=\"" << orientedReadId1 << "\\n" << ordinal1 << + "\""; + if(frequencyMap[orientedReadId1] != 1) { + dotFile << " style=filled fillcolor=pink"; + } else { + dotFile << " style=filled fillcolor=cornsilk"; + } + dotFile << "]"; } - dotFile << "];\n"; + dotFile << ";\n"; } BGL_FORALL_EDGES(e, graph, MarkerConnectivityGraph) { const auto v0 = source(e, graph); @@ -2174,8 +2207,9 @@ void Assembler::exploreMarkerConnectivity( // Use graphviz to render it to svg. - const string command = timeoutCommand() + " 30 sfdp -O -T svg " + dotFileName + - " -Goverlap=false -Gsplines=true -Gsmoothing=triangle"; + const string command = timeoutCommand() + " " + to_string(int(timeout)) + " sfdp -O -T svg " + dotFileName + + ( labels ? " -Goverlap=false -Gsplines=true -Gsmoothing=triangle" : + " -Nshape=point -Gsize=10 -Gratio=expand -Epenwidth=0.4"); const int commandStatus = ::system(command.c_str()); if(WIFEXITED(commandStatus)) { const int exitStatus = WEXITSTATUS(commandStatus); diff --git a/src/AssemblerHttpServer-MarkerGraph1.cpp b/src/AssemblerHttpServer-MarkerGraph1.cpp new file mode 100644 index 0000000..f2e536c --- /dev/null +++ b/src/AssemblerHttpServer-MarkerGraph1.cpp @@ -0,0 +1,667 @@ +// Shasta. 
+#include "Assembler.hpp" +#include "html.hpp" +#include "invalid.hpp" +#include "LocalMarkerGraph1.hpp" +#include "platformDependent.hpp" +#include "Reads.hpp" +using namespace shasta; + +// Boost libraries. +#include <boost/uuid/uuid.hpp> +#include <boost/uuid/uuid_generators.hpp> +#include <boost/uuid/uuid_io.hpp> + +// Standard library. +#include "fstream.hpp" + + + +void Assembler::exploreMarkerGraph1( + const vector<string>& request, + ostream& html) +{ + if(assemblerInfo->assemblyMode != 3) { + throw runtime_error("This is only available for assembly mode 3."); + } + + // This makes the following assumptions. + SHASTA_ASSERT(getReads().representation == 0); // No RLE. + SHASTA_ASSERT((assemblerInfo->k % 2) == 0); // Marker length is even. + + + // Get the request parameters. + uint64_t vertexId = invalid<uint64_t>; + getParameterValue(request, "vertexId", vertexId); + + uint64_t maxDistance = 2; + getParameterValue( request, "maxDistance", maxDistance); + + uint64_t minVertexCoverage = 0; + getParameterValue(request, "minVertexCoverage", minVertexCoverage); + + uint64_t minEdgeCoverage = 0; + getParameterValue(request, "minEdgeCoverage", minEdgeCoverage); + + uint64_t maxPruneCoverage = 0; + getParameterValue(request, "maxPruneCoverage", maxPruneCoverage); + + uint64_t maxLongChainCoverage = 0; + getParameterValue(request, "maxLongChainCoverage", maxLongChainCoverage); + + uint64_t minLongChainLength = 100; + getParameterValue(request, "minLongChainLength", minLongChainLength); + + uint64_t sizePixels = 600; + getParameterValue(request, "sizePixels", sizePixels); + + double thicknessScaling = 1.; + getParameterValue(request, "thicknessScaling", thicknessScaling); + + uint64_t layoutQuality = 2; + getParameterValue(request, "layoutQuality", layoutQuality); + + double edgeResolution = 1.; + getParameterValue(request, "edgeResolution", edgeResolution); + + uint64_t redCoverage = 1; + getParameterValue(request, "redCoverage", redCoverage); + + uint64_t 
greenCoverage = 5; + getParameterValue(request, "greenCoverage", greenCoverage); + + string coloring; + getParameterValue(request, "coloring", coloring); + + uint64_t readFollowingStartEdgeId = 0; + getParameterValue(request, "readFollowingStartEdgeId", readFollowingStartEdgeId); + + int64_t firstMarkerOffset = 0; + getParameterValue(request, "firstMarkerOffset", firstMarkerOffset); + + int64_t lastMarkerOffset = 0; + getParameterValue(request, "lastMarkerOffset", lastMarkerOffset); + + string showLabelsString; + const bool showLabels = getParameterValue(request, "showLabels", showLabelsString); + + double timeout = 30; + getParameterValue(request, "timeout", timeout); + + string outputType = "svg"; + getParameterValue(request, "outputType", outputType); + + + // Write the form. + html << + "<form>" + + "<h2>Local marker graph</h2>" + "<table>" + + "<tr>" + "<td>Start vertex id" + "<td class=centered><input type=text required name=vertexId size=8 style='text-align:center'" + << ((vertexId == invalid<uint64_t>) ? 
"" : ("value='" + to_string(vertexId) + "'")) << + ">" + + "<tr title='Maximum distance from start vertex (number of edges)'>" + "<td>Maximum distance" + "<td class=centered><input type=text required name=maxDistance size=8 style='text-align:center'" + "value='" << maxDistance << "'>" + + "<tr>" + "<td>Minimum vertex coverage" + "<td class=centered><input type=text required name=minVertexCoverage size=8 style='text-align:center'" + "value='" << minVertexCoverage << "'>" + + "<tr>" + "<td>Minimum edge coverage" + "<td class=centered><input type=text required name=minEdgeCoverage size=8 style='text-align:center'" + "value='" << minEdgeCoverage << "'>" + + "<tr>" + "<td>Prune leaves with coverage up to" + "<td class=centered><input type=text required name=maxPruneCoverage size=8 style='text-align:center'" + "value='" << maxPruneCoverage << "'>" + + "<tr>" + "<td>Prune long linear sections<br>with low coverage" + "<td>" + "<input type=text required name=maxLongChainCoverage size=8 style='text-align:center'" + "value='" << maxLongChainCoverage << "'> Maximum coverage" + "<br><input type=text required name=minLongChainLength size=8 style='text-align:center'" + "value='" << minLongChainLength << "'> Minimum length (markers)" + + "<tr>" + "<td>Graphics size in pixels" + "<td class=centered><input type=text required name=sizePixels size=8 style='text-align:center'" + " value='" << sizePixels << "'>" + + "<tr>" + "<td>Thickness scaling factor" + "<td class=centered><input type=text required name=thicknessScaling size=8 style='text-align:center'" + " value='" << thicknessScaling << "'>" + + "<tr>" + "<td>Layout quality" + "<td class=centered>" + "<select name=layoutQuality style='text-align:center'>" + "<option value=0" << (layoutQuality==0 ? " selected" : "") << + ">Best speed</option>" + "<option value=1" << (layoutQuality==1 ? " selected" : "") << + ">Intermediate quality and speed</option>" + "<option value=2" << (layoutQuality==2 ? 
" selected" : "") << + ">Best quality</option>" + "</select>" + + "<tr>" + "<td>Edge resolution "; + writeInformationIcon(html, "Affects edge smoothness and speed of layout computation."); + + html << + "<td class=centered><input type=text required name=edgeResolution size=8 style='text-align:center'" + " value='" << edgeResolution << "'>" + + "<tr>" + "<td>Coloring" + "<td>" + "<select name=coloring style='text-align:center'>" + "<option value=random" << (coloring == "random" ? " selected" : "") << + ">Random</option>" + "<option value=byCoverage" << (coloring == "byCoverage" ? " selected" : "") << + ">By coverage</option>" + "<option value=readFollowing" << (coloring == "readFollowing" ? " selected" : "") << + ">Read following</option>" + "</select>" + "<br><input type=text required name=redCoverage size=8 style='text-align:center'" + " value='" << redCoverage << "'> Red coverage" + "<br><input type=text required name=greenCoverage size=8 style='text-align:center'" + " value='" << greenCoverage << "'> Green coverage" + "<hr><span style='text-align:center'>Read following</span>" + "<br><input type=text required name=readFollowingStartEdgeId size=8 style='text-align:center'" + " value='" << readFollowingStartEdgeId << "'> Start edge for read following" + "<br><input type=text required name=firstMarkerOffset size=8 style='text-align:center'" + " value='" << firstMarkerOffset << "'> First marker offset" + "<br><input type=text required name=lastMarkerOffset size=8 style='text-align:center'" + " value='" << lastMarkerOffset << "'> Last marker offset" + + "<tr>" + "<td>Show labels" + "<td class=centered><input type=checkbox name=showLabels" << + (showLabels ? " checked" : "") << + ">" + + "<tr>" + "<td>Timeout in seconds" + "<td class=centered><input type=text required name=timeout size=8 style='text-align:center'" + " value='" << timeout << "'>" + + "<tr>" + "<td>Output" + "<td>" + "<input type=radio name=outputType value='noOutput'" << + (outputType == "noOutput" ? 
" checked=on" : "") << + ">Show the number of vertices and edges" + "<br><input type=radio name=outputType value='createGfa'" << + (outputType == "createGfa" ? " checked=on" : "") << + ">Create a GFA file" + "<br><input type=radio name=outputType value='createAndOpenGfa'" << + (outputType == "createAndOpenGfa" ? " checked=on" : "") << + ">Create a GFA file and open it in Bandage"; + + html << + "<br><input type=radio name=outputType value='fastCanvas'" << + (outputType == "fastCanvas" ? " checked=on" : "") << + ">Display vertices only, not interactive "; + writeInformationIcon(html, "The fastest choice. " + "Fast display with one pixel per vertex and no edges, done using canvas. " + "Best for large subgraphs."); + + html << + "<br><input type=radio name=outputType value='fastSvg'" << + (outputType == "fastSvg" ? " checked=on" : "") << + ">Display vertices only, interactive "; + writeInformationIcon(html, "Fast display with one pixel per vertex and no edges, done using svg."); + + html << + "<br><input type=radio name=outputType value='svg'" << + (outputType == "svg" ? " checked=on" : "") << + ">Display vertices and edges, interactive "; + + + html << + "</table>" + + "<br><input type=submit value='Do it'>" + "</form>"; + + + // If the vertex id was not specified, stop here. + if(vertexId == invalid<uint64_t>) { + return; + } + + // If the vertex id is invalid, stop here. + if(vertexId > markerGraph.vertexCount()) { + html << "<p>Invalid vertex id " << vertexId; + html << ". Must be between 0 and " << markerGraph.vertexCount()-1 << " inclusive."; + return; + } + + + + // Create the local marker graph. + LocalMarkerGraph1 graph( + markers, + markerGraph, + vertexId, + maxDistance, + minVertexCoverage, + minEdgeCoverage); + + // Do the requested graph cleanup. 
+ if(maxPruneCoverage > 0) { + graph.pruneLowCoverageLeaves(maxPruneCoverage); + } + if(maxLongChainCoverage > 0) { + graph.removeLongLowCoverageChains(maxLongChainCoverage, minLongChainLength); + } + + html << "<p>The local marker graph has " << num_vertices(graph) << + " vertices and " << num_edges(graph) << " edges."; + + + if(outputType == "noOutput") { + return; + } + + if(outputType == "fastCanvas") { + graph.writeHtml0(html, sizePixels, layoutQuality, timeout, false); + } + + else if(outputType == "fastSvg") { + graph.writeHtml0(html, sizePixels, layoutQuality, timeout, true); + } + + else if(outputType == "svg") { + graph.writeHtml1(html, sizePixels, thicknessScaling, layoutQuality, edgeResolution, + coloring, redCoverage, greenCoverage, + readFollowingStartEdgeId, firstMarkerOffset, lastMarkerOffset, + showLabels, + timeout); + } + + else { + + // Create a gfa file to represent the local marker graph. + const string gfaFileName = tmpDirectory() + to_string(boost::uuids::random_generator()()) + ".gfa"; + graph.writeGfa(gfaFileName); + html << "<p>The local marker graph is in " + "<span id='SpanToBeCopied' style='color:Blue'>" << gfaFileName << "</span>" + ". Remove it when done with it." + "<br><button onClick='copySpanToClipboard()'>Copy GFA file name to clipboard</button>"; + html << R"###( + <script> + function copySpanToClipboard() + { + + // Remove any previous selection. + var selection = window.getSelection(); + selection.removeAllRanges(); + + // Select the span. + var element = document.getElementById("SpanToBeCopied"); + var range = document.createRange(); + range.selectNodeContents(element); + selection.addRange(range); + + // Copy it to the clipboard. + document.execCommand("copy"); + + // Unselect it. + selection.removeAllRanges(); + + + } + </script> + )###"; + + + // If requested, open it in Bandage. + // This is done on the server side, of course. This can have unexpected + // consequences if running remotely. 
+ // Also, because of this the connection with the http client is not closed + // until Bandage terminates, so the browser thinks ore data are coming. + if(outputType == "createAndOpenGfa") { + ::system(("Bandage load " + gfaFileName + "&").c_str()); + } + } +} + + +void Assembler::exploreMarkerGraphEdgePair( + const vector<string>& request, + ostream& html) +{ + // Check that our assumptions are satisfied. + if(assemblerInfo->assemblyMode != 3) { + throw runtime_error("This is only available for assembly mode 3."); + } + SHASTA_ASSERT(getReads().representation == 0); // No RLE. + SHASTA_ASSERT((assemblerInfo->k % 2) == 0); // Marker length is even. + + // Get the parameters for the request + uint64_t edgeIdA = invalid<uint64_t>; + getParameterValue(request, "edgeIdA", edgeIdA); + + uint64_t edgeIdB = invalid<uint64_t>; + getParameterValue(request, "edgeIdB", edgeIdB); + + // Write the form. + html << + "<form>" + "<table>" + "<tr><td class=centered>Edge A<td class=centered>" + "<input type=text required name=edgeIdA size=8 style='text-align:center' " << + ((edgeIdA == invalid<uint64_t>) ? "" : ("value='" + to_string(edgeIdA) + "'")) << ">" + "<tr><td class=centered>Edge B<td class=centered>" + "<input type=text required name=edgeIdB size=8 style='text-align:center' " << + ((edgeIdB == invalid<uint64_t>) ? "" : ("value='" + to_string(edgeIdB) + "'")) << ">" + "</table>" + "<br><input type=submit value='Do it'>" + "</form>"; + + // If the edge id are missing, do nothing. + if(edgeIdA == invalid<uint64_t> or edgeIdB == invalid<uint64_t>) { + return; + } + + // Sanity checks on the edge ids. + if(edgeIdA >= markerGraph.edges.size()) { + throw runtime_error("Marker graph edge " + to_string(edgeIdA) + + " is not valid. Maximum valid edge id is " + to_string(markerGraph.edges.size())); + } + if(edgeIdB >= markerGraph.edges.size()) { + throw runtime_error("Marker graph edge " + to_string(edgeIdB) + + " is not valid. 
Maximum valid edge id is " + to_string(markerGraph.edges.size())); + } + + // Sanity check that the two edges are distinct. + if(edgeIdA == edgeIdB) { + html << "Specify two distinct edges."; + return; + } + + // This analysis can only be done if both edges have no duplicate OrientedReadIds + // in their MarkerIntervals. + if(markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdA)) { + html << "Marker graph edge " << edgeIdA << " has duplicate oriented reads."; + return; + } + if(markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdB)) { + html << "Marker graph edge " << edgeIdB << " has duplicate oriented reads."; + return; + } + + // Write a header. + html << "<h1>Read composition analysis for marker graph edges " << edgeIdA << + " and " << edgeIdB << "</h1>"; + + // Analyze read composition. + MarkerGraphEdgePairInfo info; + SHASTA_ASSERT(analyzeMarkerGraphEdgePair(edgeIdA, edgeIdB, info)); + writeHtmlMarkerGraphEdgePairInfo(html, edgeIdA, edgeIdB, info); + + if( markerGraph.edges[edgeIdA].isPrimary==1 and + markerGraph.edges[edgeIdB].isPrimary==1 and + info.common == 0) { + const uint64_t estimatedOffset = estimateBaseOffsetUnsafe(edgeIdA, edgeIdB); + if(estimatedOffset != invalid<uint64_t>) { + html << "<p>Estimated offset is " << estimatedOffset << " bases."; + } + } +} + + + +void Assembler::writeHtmlMarkerGraphEdgePairInfo( + ostream& html, + MarkerGraphEdgeId edgeIdA, + MarkerGraphEdgeId edgeIdB, + const MarkerGraphEdgePairInfo& info + ) const +{ + // Begin the summary table. + html << + "<table>" + "<tr><th><th>On<br>edge A<th>On<br>edge B"; + + // Total. + html << + "<tr><th class=left>Total "; + writeInformationIcon(html, "The total number of oriented reads on each of the two edges."); + html << "<td class=centered>" << info.totalA << "<td class=centered>" << info.totalB; + + // Common. 
+ html << "<tr><th class=left>Common "; + writeInformationIcon(html, "The number of common oriented reads between the two edges."); + html << + "<td class=centered colspan = 2>" << info.common; + + // Only. + html << + "<tr><th class=left>Only "; + writeInformationIcon(html, "The number of oriented reads that appear in one edge but not the other."); + html << + "<td class=centered>" << info.onlyA << "<td class=centered>" << info.onlyB; + + // The rest can only be written if there are common reads. + if(info.common > 0) { + + // Only, short. + html << + "<tr><th class=left>Only, short "; + writeInformationIcon(html, "The number of oriented reads that appear in one edge only " + " and are too short to appear on the other edge, based on the estimated base offset."); + html << + "<td class=centered>" << info.onlyAShort << "<td class=centered>" << info.onlyBShort; + + // Only, missing. + html << + "<tr><th class=left>Only, missing "; + writeInformationIcon(html, "The number of oriented reads that appear in one edge only " + " and are not too short to appear on the other edge, based on the estimated base offset."); + html << + "<td class=centered>" << info.onlyA - info.onlyAShort << "<td class=centered>" << info.onlyB - info.onlyBShort; + } + + // End the summary table. + html << "</table>"; + + // Only write out the rest if there are common reads. + if(info.common == 0) { + return; + } + + // Write the table with Jaccard similarities and estimated offsets. 
+ using std::fixed; + using std::setprecision; + html << + "<br><table>" + "<tr><th class=left>Jaccard similarity<td class=centered>" << + fixed << setprecision(2) << info.jaccard() << + "<tr><th class=left>Corrected Jaccard similarity<td class=centered>" << + fixed << setprecision(2) << info.correctedJaccard() << + "<tr><th class=left>Estimated offset in markers<td class=centered>" << info.offsetInMarkers << + "<tr><th class=left>Estimated offset in bases<td class=centered>" << info.offsetInBases << + "</table>"; + + + // Write the details table. + html << + "<br>In the following table, positions in red are hypothetical, based on the above " + "estimated base offset." + "<p><table>"; + + // Header row. + html << + "<tr>" + "<th class=centered rowspan=2>Oriented<br>read id" + "<th class=centered colspan=2>Length" + "<th colspan=4>Edge A" + "<th colspan=4>Edge B" + "<th rowspan=2>Ordinal offset" + "<th rowspan=2>Base offset" + "<th rowspan=2>Classification" + "<tr>" + "<th>Markers" + "<th>Bases" + "<th>Ordinal0" + "<th>Ordinal1" + "<th>Position0" + "<th>Position1" + "<th>Ordinal0" + "<th>Ordinal1" + "<th>Position0" + "<th>Position1"; + + // Prepare for the joint loop over OrientedReadIds of the two edges. + const auto markerIntervalsA = markerGraph.edgeMarkerIntervals[edgeIdA]; + const auto markerIntervalsB = markerGraph.edgeMarkerIntervals[edgeIdB]; + const auto beginA = markerIntervalsA.begin(); + const auto beginB = markerIntervalsB.begin(); + const auto endA = markerIntervalsA.end(); + const auto endB = markerIntervalsB.end(); + + // Joint loop over the MarkerIntervals of the two edges. + auto itA = beginA; + auto itB = beginB; + while(true) { + if(itA == endA and itB == endB) { + break; + } + + else if(itB == endB or ((itA!=endA) and (itA->orientedReadId < itB->orientedReadId))) { + // This oriented read only appears in edge A. 
+ const OrientedReadId orientedReadId = itA->orientedReadId; + const auto orientedReadMarkers = markers[orientedReadId.getValue()]; + const int64_t lengthInBases = int64_t(getReads().getReadRawSequenceLength(orientedReadId.getReadId())); + + // Get the positions of edge A in this oriented read. + const uint32_t ordinalA0 = itA->ordinals[0]; + const uint32_t ordinalA1 = itA->ordinals[1]; + const int64_t positionA0 = int64_t(orientedReadMarkers[ordinalA0].position); + const int64_t positionA1 = int64_t(orientedReadMarkers[ordinalA1].position); + + // Find the hypothetical positions of edge B, assuming the estimated base offset. + const int64_t positionB0 = positionA0 + info.offsetInBases; + const int64_t positionB1 = positionA1 + info.offsetInBases; + const bool isShort = positionB0<0 or positionB1 >= lengthInBases; + + html << + "<tr><td class=centered>" + "<a href='exploreRead?readId=" << orientedReadId.getReadId() << + "&strand=" << orientedReadId.getStrand() << "'>" << orientedReadId << "</a>" + "<td class=centered>" << orientedReadMarkers.size() << + "<td class=centered>" << lengthInBases << + "<td class=centered>" << ordinalA0 << + "<td class=centered>" << ordinalA1 << + "<td class=centered>" << positionA0 << + "<td class=centered>" << positionA1 << + "<td><td>" + "<td class=centered style='color:Red'>" << positionB0 << + "<td class=centered style='color:Red'>" << positionB1 << "<td><td>" + "<td class=centered>OnlyA, " << (isShort ? "short" : "missing"); + + ++itA; + continue; + } + + else if(itA == endA or ((itB!=endB) and (itB->orientedReadId < itA->orientedReadId))) { + // This oriented read only appears in edge B. + const OrientedReadId orientedReadId = itB->orientedReadId; + const auto orientedReadMarkers = markers[orientedReadId.getValue()]; + const int64_t lengthInBases = int64_t(getReads().getReadRawSequenceLength(orientedReadId.getReadId())); + + // Get the positions of edge B in this oriented read. 
+ const uint32_t ordinalB0 = itB->ordinals[0]; + const uint32_t ordinalB1 = itB->ordinals[1]; + const int64_t positionB0 = int64_t(orientedReadMarkers[ordinalB0].position); + const int64_t positionB1 = int64_t(orientedReadMarkers[ordinalB1].position); + + // Find the hypothetical positions of edge A, assuming the estimated base offset. + const int64_t positionA0 = positionB0 - info.offsetInBases; + const int64_t positionA1 = positionB1 - info.offsetInBases; + const bool isShort = positionA0<0 or positionA1 >= lengthInBases; + + html << + "<tr><td class=centered>" + "<a href='exploreRead?readId=" << orientedReadId.getReadId() << + "&strand=" << orientedReadId.getStrand() << "'>" << orientedReadId << "</a>" + "<td class=centered>" << orientedReadMarkers.size() << + "<td class=centered>" << lengthInBases << + "<td><td>" + "<td class=centered style='color:Red'>" << positionA0 << + "<td class=centered style='color:Red'>" << positionA1 << + "<td class=centered>" << ordinalB0 << + "<td class=centered>" << ordinalB1 << + "<td class=centered>" << positionB0 << + "<td class=centered>" << positionB1 << "<td><td>" + "<td class=centered>OnlyB, " << (isShort ? "short" : "missing"); + + ++itB; + continue; + } + + else { + // This oriented read appears in both edges. + const OrientedReadId orientedReadId = itA->orientedReadId; + const auto orientedReadMarkers = markers[orientedReadId.getValue()]; + const int64_t lengthInBases = int64_t(getReads().getReadRawSequenceLength(orientedReadId.getReadId())); + + // Get the positions of edge A in this oriented read. + const uint32_t ordinalA0 = itA->ordinals[0]; + const uint32_t ordinalA1 = itA->ordinals[1]; + const int64_t positionA0 = int64_t(orientedReadMarkers[ordinalA0].position); + const int64_t positionA1 = int64_t(orientedReadMarkers[ordinalA1].position); + + // Get the positions of edge B in this oriented read. 
+ const uint32_t ordinalB0 = itB->ordinals[0]; + const uint32_t ordinalB1 = itB->ordinals[1]; + const int64_t positionB0 = int64_t(orientedReadMarkers[ordinalB0].position); + const int64_t positionB1 = int64_t(orientedReadMarkers[ordinalB1].position); + + // Compute estimated offsets. + const int64_t ordinalOffset = uint64_t(ordinalB1) - uint64_t(ordinalA0); + const int64_t baseOffset = positionB1 - positionA0; + + html << + "<tr><td class=centered>" + "<a href='exploreRead?readId=" << orientedReadId.getReadId() << + "&strand=" << orientedReadId.getStrand() << "'>" << orientedReadId << "</a>" + "<td class=centered>" << orientedReadMarkers.size() << + "<td class=centered>" << lengthInBases << + "<td class=centered>" << ordinalA0 << + "<td class=centered>" << ordinalA1 << + "<td class=centered>" << positionA0 << + "<td class=centered>" << positionA1 << + "<td class=centered>" << ordinalB0 << + "<td class=centered>" << ordinalB1 << + "<td class=centered>" << positionB0 << + "<td class=centered>" << positionB1 << + "<td class=centered>" << ordinalOffset << + "<td class=centered>" << baseOffset << + "<td class=centered>Common"; + + ++itA; + ++itB; + } + } + + // Finish the details table. + html << "</table>"; + + +} + diff --git a/src/AssemblerHttpServer-Mode3.cpp b/src/AssemblerHttpServer-Mode3.cpp deleted file mode 100644 index 753d5dc..0000000 --- a/src/AssemblerHttpServer-Mode3.cpp +++ /dev/null @@ -1,996 +0,0 @@ -// Shasta. -#include "Assembler.hpp" -#include "assembleMarkerGraphPath.hpp" -#include "mode3.hpp" -#include "mode3-AssemblyPath.hpp" -#include "mode3-LocalAssemblyGraph.hpp" -#include "mode3-SegmentPairInformation.hpp" -#include "PngImage.hpp" -using namespace shasta; -using namespace mode3; - -// Boost library. -#include <boost/icl/discrete_interval.hpp> -#include <boost/icl/right_open_interval.hpp> - -// Standard library. 
-#include "fstream.hpp" - - -void Assembler::exploreMode3AssemblyGraph( - const vector<string>& request, - ostream& html) -{ - SHASTA_ASSERT(assemblyGraph3Pointer); - - // Get the parameters for the request. - mode3::LocalAssemblyGraph::SvgOptions options(request); - - uint64_t maxDistance = 2; - getParameterValue(request, "maxDistance", maxDistance); - - uint64_t startSegmentId; - const bool startSegmentIdIsPresent = getParameterValue(request, "startSegmentId", startSegmentId); - - double timeout = 30.; - getParameterValue(request, "timeout", timeout); - - - - // Write the form. - html << - "<h2>Display the local assembly graph near a given segment</h2>" - "<form>" - "<table>" - - "<tr>" - "<td>Start segment" - "<td class=centered><input type=text required name=startSegmentId size=8 style='text-align:center'" - " value='" << (startSegmentIdIsPresent ? to_string(startSegmentId) : "") << - "'>" - - "<tr>" - "<td>Maximum distance in the assembly graph (edges)" - "<td class=centered><input type=text name=maxDistance size=8 style='text-align:center'" - " value='" << maxDistance << - "'>" - - "<tr>" - "<td>Timeout for graph layout (seconds)" - "<td class=centered><input type=text name=timeout size=8 style='text-align:center'" - " value='" << timeout << - "'>"; - - options.addFormRows(html); - - html << - "</table>" - "<br><input type=submit value='Display'>" - "</form>"; - - - - if(not startSegmentIdIsPresent) { - return; - } - - if(startSegmentId >= assemblyGraph3Pointer->markerGraphPaths.size()) { - html << "<p>Invalid start segment id. Maximum valid value is " << - assemblyGraph3Pointer->markerGraphPaths.size() - 1; - return; - } - if(options.referenceSegmentId >= assemblyGraph3Pointer->markerGraphPaths.size()) { - html << "<p>Invalid reference segment id. 
Maximum valid value is " << - assemblyGraph3Pointer->markerGraphPaths.size() - 1; - return; - } - - - html << "<h1>Local assembly graph near segment " << startSegmentId << "</h1></p>"; - - - - // Create the local assembly graph, or reuse the last one, if possible. - static shared_ptr<mode3::LocalAssemblyGraph> lastLocalAssemblyGraphPointer; - static shared_ptr<mode3::LocalAssemblyGraph::SvgOptions> lastOptions; - static uint64_t lastStartSegmentId = invalid<uint64_t>; - static uint64_t lastMaxDistance = invalid<uint64_t>; - const bool canReuse = - lastLocalAssemblyGraphPointer and - (startSegmentId == lastStartSegmentId) and - (maxDistance == lastMaxDistance) and - options.hasSameLayoutOptions(*lastOptions); - if(canReuse) { - cout << "Reusing the previous mode3::LocalAssemblyGraph." << endl; - } else { - lastLocalAssemblyGraphPointer = make_shared<mode3::LocalAssemblyGraph>( - markerGraph, - *assemblyGraph3Pointer, - startSegmentId, maxDistance); - lastOptions = make_shared<mode3::LocalAssemblyGraph::SvgOptions>(options); - lastStartSegmentId = startSegmentId; - lastMaxDistance = maxDistance; - lastLocalAssemblyGraphPointer->computeLayout(options, timeout); - lastLocalAssemblyGraphPointer->computeSegmentTangents(); - } - mode3::LocalAssemblyGraph& localAssemblyGraph = *lastLocalAssemblyGraphPointer; - - html << "<p>The local assembly graph has " << - num_vertices(localAssemblyGraph) << " segments and " << - num_edges(localAssemblyGraph) << " links." - "<p>"; - - - - // Display the local assembly graph. - localAssemblyGraph.writeHtml(html, options); - - // To facilitate debugging and testing, also write a gfa file - // that represents the LocalAssemblyGraph. 
- localAssemblyGraph.writeGfa("LocalAssemblyGraph.gfa"); - -} - - - -void Assembler::exploreMode3AssemblyGraphSegment( - const vector<string>& request, - ostream& html) -{ - SHASTA_ASSERT(assemblyGraph3Pointer); - const mode3::AssemblyGraph& assemblyGraph3 = *assemblyGraph3Pointer; - - - - // Get request parameters. - uint64_t segmentId; - const bool segmentIdIsPresent = getParameterValue(request, "segmentId", segmentId); - - string showOrientedReadsString; - const bool showOrientedReads = HttpServer::getParameterValue(request, - "showOrientedReads", showOrientedReadsString); - - string showMarkerGraphPathString; - const bool showMarkerGraphPath = HttpServer::getParameterValue(request, - "showMarkerGraphPath", showMarkerGraphPathString); - - string showSequenceString; - const bool showSequence = HttpServer::getParameterValue(request, - "showSequence", showSequenceString); - - string showSequenceDetailsString; - const bool showSequenceDetails = HttpServer::getParameterValue(request, - "showSequenceDetails", showSequenceDetailsString); - - - - // Write the form. - html << - "<h3>Display details of an assembly graph segment</h3>" - "<form>" - "<table>" - - "<tr>" - "<td>Segment id" - "<td><input type=text required name=segmentId size=8 style='text-align:center'" - " value='" << (segmentIdIsPresent ? to_string(segmentId) : "") << - "'>" - - "<tr>" - "<td>Show oriented reads" - "<td class=centered> <input type=checkbox name=showOrientedReads" << - (showOrientedReads ? " checked=checked" : "") << - ">" - - "<tr>" - "<td>Show marker graph path" - "<td class=centered> <input type=checkbox name=showMarkerGraphPath" << - (showMarkerGraphPath ? " checked=checked" : "") << - ">" - - "<tr>" - "<td>Show sequence" - "<td class=centered> <input type=checkbox name=showSequence" << - (showSequence ? " checked=checked" : "") << - ">" - - "<tr>" - "<td>Show sequence assembly details" - "<td class=centered> <input type=checkbox name=showSequenceDetails" << - (showSequenceDetails ? 
" checked=checked" : "") << - ">" - - "</table>" - "<br><input type=submit value='Display'>" - "</form>"; - - // If the segmentId was not specified, stop here. - if(not segmentIdIsPresent) { - return; - } - - // Check that we have a valid segmentId. - if(segmentId >= assemblyGraph3.markerGraphPaths.size()) { - html << "Invalid segment id. Maximum valid value is " << - assemblyGraph3.markerGraphPaths.size() - 1 << "."; - return; - } - - // Access the marker graph path for this segment. - const auto path = assemblyGraph3.markerGraphPaths[segmentId]; - - // Get information about the oriented reads of this segment. - mode3::AssemblyGraph::SegmentOrientedReadInformation orientedReads; - assemblyGraph3.getOrientedReadsOnSegment(segmentId, orientedReads); - - const auto oldPrecision = html.precision(1); - const auto oldFlags = html.setf(std::ios_base::fixed, std::ios_base::floatfield); - html << - "<h1>Assembly graph segment " << segmentId << "</h1>" - "<p><table>" - "<tr><th class=left>Length of marker graph path<td class=centered>" << path.size() << - "<tr><th class=left>Average marker graph edge coverage on path<td class=centered>" << - assemblyGraph3.segmentCoverage[segmentId] << - "<tr><th class=left>Number of distinct oriented reads on path<td class=centered>" << orientedReads.infos.size(); - - // Write the incoming and outgoing links. - html << "<tr><th class=left>Incoming links<td class=centered>"; - for(const uint64_t linkId: assemblyGraph3.linksByTarget[segmentId]) { - html << "<a href='exploreMode3AssemblyGraphLink?linkId=" << linkId << "'>" << linkId << "</a> "; - } - html << "<tr><th class=left>Outgoing links<td class=centered>"; - for(const uint64_t linkId: assemblyGraph3.linksBySource[segmentId]) { - html << "<a href='exploreMode3AssemblyGraphLink?linkId=" << linkId << "'>" << linkId << "</a> "; - } - - - html << "</table>"; - html.precision(oldPrecision); - html.flags(oldFlags); - - - - // Write the oriented reads in a table. 
- if(showOrientedReads) { - html << - "<h2>Oriented reads on this segment</h2>" - "<table>" - "<tr>" - "<th>Oriented<br>read" - "<th>Average<br>offset"; - for(const auto& info: orientedReads.infos) { - html<< - "<tr>" - "<td class=centered>" << info.orientedReadId << - "<td class=centered>" << info.averageOffset; - } - html << "</table>"; - } - - - - // Write the marker graph path. - if(showMarkerGraphPath) { - html << - "<h2>Marker graph path for this segment</h2>" - "<table>" - "<tr>" - "<th>Position" - "<th>Edge" - "<th>Coverage" - "<th>Source<br>vertex" - "<th>Target<br>vertex"; - - for(uint64_t position=0; position<path.size(); position++) { - const MarkerGraphEdgeId& edgeId = path[position]; - const MarkerGraph::Edge& edge = markerGraph.edges[edgeId]; - const MarkerGraph::VertexId vertexId0 = edge.source; - const MarkerGraph::VertexId vertexId1 = edge.target; - - html << "<tr>" - "<td class=centered>" << position << - "<td class=centered>" << - "<a href='exploreMarkerGraphEdge?edgeId=" << edgeId << - "'>" << edgeId << "</a>" - "<td class=centered>" << markerGraph.edgeMarkerIntervals.size(edgeId) << - "<td class=centered>" << - "<a href='exploreMarkerGraphVertex?vertexId=" << vertexId0 << - "'>" << vertexId0 << "</a>" - "<td class=centered>" << - "<a href='exploreMarkerGraphVertex?vertexId=" << vertexId1 << - "'>" << vertexId1 << "</a>" - "\n"; - - - - } - html << "</table>"; - } - - - - // Assembled sequence, optionally with details. - if(showSequence or showSequenceDetails) { - - // Assemble the sequence for this segment. - AssembledSegment assembledSegment; - assembleMarkerGraphPath( - assemblyGraph3.readRepresentation, - assemblyGraph3.k, - assemblyGraph3.markers, - assemblyGraph3.markerGraph, - assemblyGraph3.markerGraphPaths[segmentId], - false, - assembledSegment); - - // Check that the sequence we have is the same as the stored sequence - // for this segment. 
- SHASTA_ASSERT(std::equal( - assembledSegment.rawSequence.begin(), assembledSegment.rawSequence.end(), - assemblyGraph3.segmentSequences.begin(segmentId), assemblyGraph3.segmentSequences.end(segmentId) - )); - - // Write the sequence. - assembledSegment.writeHtml(html, showSequence, showSequenceDetails, - 0, uint32_t(assembledSegment.rawSequence.size())); - } - - -} - - - -void Assembler::exploreMode3AssemblyGraphLink( - const vector<string>& request, - ostream& html) -{ - SHASTA_ASSERT(assemblyGraph3Pointer); - const mode3::AssemblyGraph& assemblyGraph3 = *assemblyGraph3Pointer; - - // Get the link id from the request. - uint64_t linkId; - const bool linkIdIsPresent = getParameterValue(request, "linkId", linkId); - - - - // Write the form. - html << - "<h3>Display details of an assembly graph link</h3>" - "<form>" - "<table>" - - "<tr>" - "<td>Link id" - "<td><input type=text required name=linkId size=8 style='text-align:center'" - " value='" << (linkIdIsPresent ? to_string(linkId) : "") << - "'>" - - "</table>" - "<br><input type=submit value='Display'>" - "</form>"; - - // If the segmentId was not specified, stop here. 
- if(not linkIdIsPresent) { - return; - } - - const mode3::AssemblyGraph::Link& link = assemblyGraph3.links[linkId]; - const auto transitions = assemblyGraph3.transitions[linkId]; - const uint64_t segmentId0 = link.segmentId0; - const uint64_t segmentId1 = link.segmentId1; - const auto path0 = assemblyGraph3.markerGraphPaths[segmentId0]; - const auto path1 = assemblyGraph3.markerGraphPaths[segmentId1]; - const uint64_t pathLength0 = path0.size(); - const uint64_t pathLength1 = path1.size(); - - html << - "<h1>Assembly graph link " << linkId << "</h1>" - "<p><table>" - "<tr><th>Segment<th>Id<th>Path<br>length" - "<tr><th class = left>Source segment<td class=centered>" << segmentId0 << "<td class=centered>" << pathLength0 << - "<tr><th class = left>Target segment<td class=centered>" << segmentId1 << "<td class=centered>" << pathLength1 << - "</table>"; - - if(link.segmentsAreAdjacent) { - html << "<p>The paths of these segments are adjacent."; - } else { - html << "<p>The paths of these segments are not adjacent."; - } - - - const auto oldPrecision = html.precision(1); - const auto oldFlags = html.setf(std::ios_base::fixed, std::ios_base::floatfield); - html << - "<p><table>" - "<tr><th class = left tooltip='Number of supporting transitions'>Coverage<td class=centered>" << - transitions.size() << - "<tr><th class = left>Average link separation<td class=centered>" << - link.separation << - "</table>"; - html.precision(oldPrecision); - html.flags(oldFlags); - - - html << - "<h2>Transitions</h2>" - "<p><table><tr>" - "<th class=centered>Oriented<br>read<br>id" - "<th class=centered>Last<br>position<br>on segment<br>" << link.segmentId0 << - "<th class=centered>Last<br>ordinal<br>on segment<br>" << link.segmentId0 << - "<th class=centered>First<br>position<br>on segment<br>" << link.segmentId1 << - "<th class=centered>First<br>ordinal<br>on segment<br>" << link.segmentId1 << - "<th class=centered>Link<br>separation"; - - - for(const auto& p: transitions) { - const 
OrientedReadId orientedReadId = p.first; - const Transition& transition = p.second; - const auto& pseudoPathEntry0 = transition[0]; - const auto& pseudoPathEntry1 = transition[1]; - - SHASTA_ASSERT(pseudoPathEntry1.ordinals[0] >= pseudoPathEntry0.ordinals[1]); - - const int64_t linkSeparation = - int64_t(pseudoPathEntry1.ordinals[0] - pseudoPathEntry0.ordinals[1]) - - int64_t(pathLength0 - 1 - pseudoPathEntry0.position) - - int64_t(pseudoPathEntry1.position); - - html << - "<tr><td class=centered>" << orientedReadId << - - "<td class=centered>" << pseudoPathEntry0.position << - "<td class=centered>" << pseudoPathEntry0.ordinals[1] << - - "<td class=centered>" << pseudoPathEntry1.position << - "<td class=centered>" << pseudoPathEntry1.ordinals[0] << - - "<td class=centered>" << linkSeparation; - } - html << "</table>"; - - - - -} - - - -void Assembler::exploreMode3AssemblyGraphSegmentPair( - const vector<string>& request, - ostream& html) -{ - using boost::icl::discrete_interval; - using boost::icl::intersects; - using boost::icl::length; - - SHASTA_ASSERT(assemblyGraph3Pointer); - const mode3::AssemblyGraph& assemblyGraph3 = *assemblyGraph3Pointer; - - // Get the segment ids from the request. - uint64_t segmentId0; - const bool segmentId0IsPresent = getParameterValue(request, "segmentId0", segmentId0); - uint64_t segmentId1; - const bool segmentId1IsPresent = getParameterValue(request, "segmentId1", segmentId1); - - - - // Write the form. - html << - "<h3>Display details for a pair assembly graph segment</h3>" - "<form>" - "<table>" - - "<tr>" - "<td>Segment id 0" - "<td><input type=text required name=segmentId0 size=8 style='text-align:center'" - " value='" << (segmentId0IsPresent ? to_string(segmentId0) : "") << - "'>" - - "<tr>" - "<td>Segment id 1" - "<td><input type=text required name=segmentId1 size=8 style='text-align:center'" - " value='" << (segmentId1IsPresent ? 
to_string(segmentId1) : "") << - "'>" - - "</table>" - "<br><input type=submit value='Display'>" - "</form>"; - - // If the segmentId's were not specified, stop here. - if(not segmentId0IsPresent) { - return; - } - if(not segmentId1IsPresent) { - return; - } - - // Check that we have valid segmentId's. - if(segmentId0 >= assemblyGraph3.markerGraphPaths.size()) { - html << "Invalid segment id. Maximum valid value is " << - assemblyGraph3.markerGraphPaths.size() - 1 << "."; - return; - } - if(segmentId1 >= assemblyGraph3.markerGraphPaths.size()) { - html << "Invalid segment id. Maximum valid value is " << - assemblyGraph3.markerGraphPaths.size() - 1 << "."; - return; - } - - - // Get information about the oriented reads of these segments. - mode3::AssemblyGraph::SegmentOrientedReadInformation orientedReads0; - mode3::AssemblyGraph::SegmentOrientedReadInformation orientedReads1; - assemblyGraph3.getOrientedReadsOnSegment(segmentId0, orientedReads0); - assemblyGraph3.getOrientedReadsOnSegment(segmentId1, orientedReads1); - const uint64_t length0 = assemblyGraph3.markerGraphPaths.size(segmentId0); - const uint64_t length1 = assemblyGraph3.markerGraphPaths.size(segmentId1); - - // Estimate the offset between the segments and count missing - // oriented reads. - SegmentPairInformation segmentPairInformation; - assemblyGraph3.analyzeSegmentPair( - segmentId0, segmentId1, - orientedReads0, orientedReads1, - markers, segmentPairInformation); - const uint64_t commonCount = segmentPairInformation.commonCount; - - - - /// Write a table with information about this pair of segments. 
- html << - "<p>" - "<table>" - - "<tr>" - "<th class=left>Segment id" - "<td class=centered>" << segmentId0 << - "<td class=centered>" << segmentId1 << - - "<tr title='Segment length in marker graph edges'>" - "<th class=left>Length" - "<td class=centered>" << length0 << - "<td class=centered>" << length1 << - - "<tr title='Total number of oriented reads in this segment'>" - "<th class=left>Total" - "<td class=centered>" << segmentPairInformation.totalCount[0] << - "<td class=centered>" << segmentPairInformation.totalCount[1] << - - "<tr title='Number of oriented reads present in both segments'>" - "<th class=left>Common" - "<td class=centered>" << segmentPairInformation.commonCount << - "<td class=centered>" << segmentPairInformation.commonCount; - - if(segmentPairInformation.commonCount > 0) { - const auto oldPrecision = html.precision(2); - const auto oldFlags = html.setf(std::ios_base::fixed, std::ios_base::floatfield); - html << - "<tr title='Number of oriented reads in this segment that are too short to appear in the other segment'>" - "<th class=left>Short" - "<td class=centered>" << segmentPairInformation.shortCount[0] << - "<td class=centered>" << segmentPairInformation.shortCount[1] << - - "<tr title='Number of oriented reads in this segment that are " - "unexpectedly missing in the other segment'>" - "<th class=left>Unexplained" - "<td class=centered>" << segmentPairInformation.unexplainedCount[0] << - "<td class=centered>" << segmentPairInformation.unexplainedCount[1] << - - "<tr title='Jaccard similarity without counting short reads'>" - "<th class=left>Jaccard" - "<td class=centered>" << segmentPairInformation.jaccard() << - "<td class=centered>" << segmentPairInformation.jaccard() << - - "<tr title='Jaccard similarity without special treatment of short reads'>" - "<th class=left>Raw Jaccard" - "<td class=centered>" << segmentPairInformation.rawJaccard() << - "<td class=centered>" << segmentPairInformation.rawJaccard() << - - "<tr title='Fraction of 
oriented reads in this segment that are " - "unexpectedly missing in the other segment'>" - "<th class=left>Unexplained fraction" - "<td class=centered>" << segmentPairInformation.unexplainedFraction(0) << - "<td class=centered>" << segmentPairInformation.unexplainedFraction(1); - html.precision(oldPrecision); - html.flags(oldFlags); - } - - html << "</table>"; - if(segmentPairInformation.commonCount > 0) { - html << "<p>Estimated offset " << segmentPairInformation.offset; - html << "<br>Estimated gap " << segmentPairInformation.offset - int64_t(length0); - } - - - - // Write a table with a row for each oriented read. - html << - "<p>" - "<table>" - "<tr>" - "<th>Oriented<br>read" - "<th>Length" - "<th>Average<br>offset of<br>oriented read<br>relative to<br>segment " << segmentId0 << - "<th>Average<br>offset of<br>oriented read<br>relative to<br>segment " << segmentId1 << - "<th>Estimated<br>offset of<br>segment " << segmentId1 << - "<br>relative to<br>segment " << segmentId0 << - "<th>Hypothetical<br>offset of<br>oriented read<br>relative to<br>segment " << segmentId0 << - "<th>Hypothetical<br>offset of<br>oriented read<br>relative to<br>segment " << segmentId1 << - "<th>Hypothetical<br>overlap of<br>oriented read<br>with<br>segment " << segmentId0 << - "<th>Hypothetical<br>overlap of<br>oriented read<br>with<br>segment " << segmentId1 << - "<th>On both<br>segments" << - "<th>Too<br>short" << - "<th>On segment<br>" << segmentId0 << "<br>only,<br>missing from<br>segment<br>" << segmentId1 << - "<th>On segment<br>" << segmentId1 << "<br>only,<br>missing from<br>segment<br>" << segmentId0; - - - // Set up a joint loop over oriented reads in the two segments. - const auto begin0 = orientedReads0.infos.begin(); - const auto begin1 = orientedReads1.infos.begin(); - const auto end0 = orientedReads0.infos.end(); - const auto end1 = orientedReads1.infos.end(); - auto it0 = begin0; - auto it1 = begin1; - - - - while(true) { - - // At end of both segments. 
- if(it0 == end0 and it1 == end1) { - break; - } - - - - // Only on segment 0. - if((it1 == end1) or ((it0!=end0) and (it0->orientedReadId < it1->orientedReadId))) { - const int64_t orientedReadLength = markers.size(it0->orientedReadId.getValue()); - html << - "<tr>" - "<td class=centered>" << - "<a href='exploreRead?readId=" << it0->orientedReadId.getReadId() << - "&strand=" << it0->orientedReadId.getStrand() << "'>" << it0->orientedReadId << "</a>" - "<td class=centered>" << orientedReadLength << - "<td class=centered>" << it0->averageOffset << - "<td>" - "<td><td>"; - - if(commonCount) { - // Compute the hypothetical range of the oriented read relative - // to the beginning of segment 1. - const discrete_interval<int64_t> orientedReadRange1( - it0->averageOffset - segmentPairInformation.offset, - it0->averageOffset - segmentPairInformation.offset + orientedReadLength); - const discrete_interval<int64_t> segment1Range(0, length1); - const bool wouldOverlap = intersects(orientedReadRange1, segment1Range); - html << - "<td class=centered>" << orientedReadRange1.lower() << - "<td><td class=centered>" << length(orientedReadRange1 & segment1Range); - if(wouldOverlap) { - html << "<td><td><td class=centered>✓<td>"; - } else { - html << "<td><td class=centered>✓<td><td>"; - } - } else { - html << "<td><td><td><td><td><td><td>"; - } - ++it0; - } - - - - // Only on segment 1 - else if((it0 == end0) or ((it1!=end1) and (it1->orientedReadId < it0->orientedReadId))) { - const int64_t orientedReadLength = markers.size(it1->orientedReadId.getValue()); - html << - "<tr>" - "<td class=centered>" << - "<a href='exploreRead?readId=" << it1->orientedReadId.getReadId() << - "&strand=" << it1->orientedReadId.getStrand() << "'>" << it1->orientedReadId << "</a>" - "<td class=centered>" << orientedReadLength << - "<td>" - "<td class=centered>" << it1->averageOffset << - "<td>"; - - if(commonCount) { - // Compute the hypothetical range of the oriented read relative - // to the beginning 
of segment 0. - const discrete_interval<int64_t> orientedReadRange0( - it1->averageOffset + segmentPairInformation.offset, - it1->averageOffset + segmentPairInformation.offset + orientedReadLength); - const discrete_interval<int64_t> segment0Range(0, length0); - const bool wouldOverlap = intersects(orientedReadRange0, segment0Range); - html << - "<td class=centered>" << orientedReadRange0.lower() << - "<td><td class=centered>" << length(orientedReadRange0 & segment0Range) << "<td>"; - if(wouldOverlap) { - html << "<td><td><td><td class=centered>✓"; - } else { - html << "<td><td class=centered>✓<td><td>"; - } - - } else { - html << "<td><td><td><td><td><td><td><td>"; - } - - ++it1; - } - - // On both segments. - else { - html << - "<tr>" - "<td class=centered>" << - "<a href='exploreRead?readId=" << it0->orientedReadId.getReadId() << - "&strand=" << it0->orientedReadId.getStrand() << "'>" << it0->orientedReadId << "</a>" - "<td class=centered>" << markers.size(it0->orientedReadId.getValue()) << - "<td class=centered>" << it0->averageOffset << - "<td class=centered>" << it1->averageOffset << - "<td class=centered>" << it0->averageOffset - it1->averageOffset << - "<td><td><td><td>" - "<td class=centered>✓<td><td><td>"; - - ++it0; - ++it1; - } - } - html << "</table>"; - -} - - - -void Assembler::exploreMode3MetaAlignment( - const vector<string>& request, - ostream& html) -{ - // Access the mode 3 assembly graph. - SHASTA_ASSERT(assemblyGraph3Pointer); - const mode3::AssemblyGraph& assemblyGraph3 = *assemblyGraph3Pointer; - - // Get the oriented read ids from the request. - string orientedReadId0String; - const bool orientedReadId0IsPresent = getParameterValue(request, "orientedReadId0", orientedReadId0String); - string orientedReadId1String; - const bool orientedReadId1IsPresent = getParameterValue(request, "orientedReadId1", orientedReadId1String); - - // Write the form. 
- html << - "Enter the two oriented read ids:" - "<form>" - "<p><input type=text size=8 name=orientedReadId0 value='" << - (orientedReadId0IsPresent ? orientedReadId0String : "") << "'>" - "<p><input type=text size=8 name=orientedReadId1 value='" << - (orientedReadId1IsPresent ? orientedReadId1String : "") << "'>" - "<p><input type=submit value='Compute the meta-alignment'>" - "</form>"; - - // If the oriented reads are not present, do nothing. - if(not(orientedReadId0IsPresent and orientedReadId1IsPresent)) { - return; - } - const OrientedReadId orientedReadId0(orientedReadId0String); - const OrientedReadId orientedReadId1(orientedReadId1String); - - html << "<h1>Meta-alignment of oriented reads " << - orientedReadId0 << " " << orientedReadId1 << "</h1>"; - - // Access the pseudo-paths, that is the meta-sequences to be aligned. - const auto pseudoPath0 = assemblyGraph3.assemblyGraphJourneys[orientedReadId0.getValue()]; - const auto pseudoPath1 = assemblyGraph3.assemblyGraphJourneys[orientedReadId1.getValue()]; - const int n0 = int(pseudoPath0.size()); - const int n1 = int(pseudoPath1.size()); - - // Create a png file representing the alignment matrix. - PngImage image = PngImage(int(n0), int(n1)); - for(int i0=0; i0<n0; i0++) { - const uint64_t segmentId0 = pseudoPath0[i0].segmentId; - for(int i1=0; i1<n1; i1++) { - const uint64_t segmentId1 = pseudoPath1[i1].segmentId; - if(segmentId0 == segmentId1) { - image.setPixel(i0, i1, 255, 0, 0); - } - } - } - image.write("MetaAlignment.png"); - - // Create a base64 version of the png file. - const string command = "base64 MetaAlignment.png > MetaAlignment.png.base64"; - ::system(command.c_str()); - - // Display the picture with the alignment. - // image-rendering:crisp-edges; is currently supported on Firefox but not Chrome, - // so Chrome will display blurry pictures. 
- html << - "<h3>Alignment matrix</h3>" - "<p><img " - " style='width:" << 3*n0 << "px;height:auto;image-rendering:crisp-edges;'" - "src=\"data:image/png;base64,"; - ifstream png("MetaAlignment.png.base64"); - html << png.rdbuf(); - html << "\"/>"; - -} - - - -void Assembler::exploreMode3AssemblyPath( - const vector<string>& request, - ostream& html) -{ - // Get the parametersof the request. - - // The segment that the path will start from. - string pathStartString; - HttpServer::getParameterValue(request, "pathStart", pathStartString); - - // The path direction can be forward, backward, or bidirectional. - string pathDirection = "bidirectional"; - HttpServer::getParameterValue(request, "pathDirection", pathDirection); - - - - // Write the form. - html << - "<h2>Assembly path computation</h2>" - "<form>" - - "Start the path at segment <input type=text name=pathStart required size=8 style='text-align:center'" - " value='" << pathStartString << "'>" - - "<br><input type=radio name=pathDirection value=forward" << - (pathDirection=="forward" ? " checked=checked" : "") << "> Forward" - "<br><input type=radio name=pathDirection value=backward" << - (pathDirection=="backward" ? " checked=checked" : "") << "> Backward" - "<br><input type=radio name=pathDirection value=bidirectional" << - (pathDirection=="bidirectional" ? " checked=checked" : "") << "> Both directions" << - - "<p><input type=submit value='Compute the path and assemble its sequence'>" - "</form>"; - - // If the path start was not specified, stop here. - if(pathStartString.empty()) { - return; - } - - // Get the path start segment. - uint64_t pathStart; - try { - pathStart = boost::lexical_cast<uint64_t>(pathStartString); - } catch(std::exception&) { - throw runtime_error("Invalid path start segment id."); - } - - // Check that it is a valid segment id. 
- const mode3::AssemblyGraph& assemblyGraph = *assemblyGraph3Pointer; - if(pathStart >= assemblyGraph.markerGraphPaths.size()) { - throw runtime_error("Invalid path start segment id. The assembly graph has " + - to_string(assemblyGraph.markerGraphPaths.size()) + " segments."); - } - - // Write a header. - html << "<h1>Assembly path</h1>"; - - - - // Compute the assembly path. - AssemblyPath path; - if(pathDirection == "forward" or pathDirection == "backward") { - - // Forward or backward. - assemblyGraph.createAssemblyPath(pathStart, - (pathDirection == "forward") ? 0 : 1, path); - if(pathDirection == "backward") { - reverse(path.segments.begin(), path.segments.end()); - } - - } else { - - // Bidirectional. - AssemblyPath forwardPath; - AssemblyPath backwardPath; - assemblyGraph.createAssemblyPath(pathStart, 0, forwardPath); - assemblyGraph.createAssemblyPath(pathStart, 1, backwardPath); - - // Stitch them together, making sure not to repeat the starting segment. - path.segments.clear(); - copy(backwardPath.segments.rbegin(), backwardPath.segments.rend(), back_inserter(path.segments)); - copy(forwardPath.segments.begin() + 1, forwardPath.segments.end(), back_inserter(path.segments)); - - } - - html << "<p>This assembly path was created starting at segment " << pathStart << - " and moving "; - if(pathDirection == "forward") { - html << "forward."; - } else if(pathDirection == "backward") { - html << "backward."; - } else if(pathDirection == "bidirectional") { - html << "in both directions."; - } - - // Assemble sequence for this path. - path.assemble(assemblyGraph); - - // Write path details to html. - path.writeHtml(html, assemblyGraph); - - -} - - - -void Assembler::exploreMode3LinkAssembly( - const vector<string>& request, - ostream& html) -{ - // Access the AssemblyGraph. 
- using mode3::AssemblyGraph; // Hide shasta::AssemblyGraph; - SHASTA_ASSERT(assemblyGraph3Pointer); - const AssemblyGraph& assemblyGraph = *assemblyGraph3Pointer; - - // Get the parameters of the request. - uint64_t linkId = invalid<uint64_t>; - getParameterValue(request, "linkId", linkId); - SHASTA_ASSERT(linkId < assemblyGraph.links.size()); - uint64_t previousPrimarySegmentId = invalid<uint64_t>; - getParameterValue(request, "previousPrimarySegmentId", previousPrimarySegmentId); - SHASTA_ASSERT(previousPrimarySegmentId < assemblyGraph.markerGraphPaths.size()); - uint64_t nextPrimarySegmentId = invalid<uint64_t>; - getParameterValue(request, "nextPrimarySegmentId", nextPrimarySegmentId); - SHASTA_ASSERT(nextPrimarySegmentId < assemblyGraph.markerGraphPaths.size()); - - // Access the link. - if(linkId >= assemblyGraph.links.size()) { - html << "Invalid link id. There are " << assemblyGraph.links.size() << - " links in the assembly graph."; - return; - } - const AssemblyGraph::Link& link = assemblyGraph.links[linkId]; - - // If this is a trivial link, there is nothing to show. - if(link.segmentsAreAdjacent) { - html << "This is a trivial link. No assembly is required."; - return; - } - - - - html << "<h1>Details of link assembly</h1>"; - - // Create the segments and assemble them. - AssemblyPathSegment segment0(link.segmentId0, false); - AssemblyPathSegment segment1(link.segmentId1, false); - assembleMarkerGraphPath( - assemblyGraph.readRepresentation, - assemblyGraph.k, - assemblyGraph.markers, - assemblyGraph.markerGraph, - assemblyGraph.markerGraphPaths[segment0.id], - false, - segment0.assembledSegment); - assembleMarkerGraphPath( - assemblyGraph.readRepresentation, - assemblyGraph.k, - assemblyGraph.markers, - assemblyGraph.markerGraph, - assemblyGraph.markerGraphPaths[segment1.id], - false, - segment1.assembledSegment); - - // Create the AssemblyPathLink. 
- AssemblyPathLink assemblyPathLink; - assemblyPathLink.id = linkId; - assemblyPathLink.isTrivial = false; - assemblyPathLink.previousPrimarySegmentId = previousPrimarySegmentId; - assemblyPathLink.nextPrimarySegmentId = nextPrimarySegmentId; - - // Do the assembly. - AssemblyPath::assembleNonTrivialLink( - assemblyGraph, - segment0, - segment1, - assemblyPathLink, - html); -} diff --git a/src/AssemblerHttpServer-ReadGraph.cpp b/src/AssemblerHttpServer-ReadGraph.cpp index c5ba0a9..5281da6 100644 --- a/src/AssemblerHttpServer-ReadGraph.cpp +++ b/src/AssemblerHttpServer-ReadGraph.cpp @@ -413,6 +413,16 @@ void Assembler::exploreUndirectedReadGraph( + // Cross strand edges are drawn purple. + BGL_FORALL_EDGES(e, graph, LocalReadGraph) { + const LocalReadGraphEdge& edge = graph[e]; + if(edge.crossesStrands) { + graph[e].color = "Purple"; + } + } + + + // Triangle analysis of the local read graph, if requested. LocalReadGraphTriangles triangles; if(alignmentAnalysis == "triangles") { @@ -508,6 +518,7 @@ void Assembler::exploreUndirectedReadGraph( vertexScalingFactor, edgeThicknessScalingFactor, maxDistance, + *this, html); } diff --git a/src/AssemblerHttpServer-Reads.cpp b/src/AssemblerHttpServer-Reads.cpp index c1424a5..6ec17e2 100644 --- a/src/AssemblerHttpServer-Reads.cpp +++ b/src/AssemblerHttpServer-Reads.cpp @@ -654,6 +654,8 @@ void Assembler::exploreReadRle( if (marker.position < beginRlePosition || marker.position > endRlePosition-k) { continue; } + const Kmer kmer = getOrientedReadMarkerKmer(orientedReadId, ordinal); + const KmerId kmerId = KmerId(kmer.id(k)); // See if this marker is contained in a vertex of the marker graph. const MarkerGraph::VertexId vertexId = @@ -664,10 +666,9 @@ void Assembler::exploreReadRle( // Write the k-mer of this marker. 
- const Kmer kmer(marker.kmerId, k); html << "<a xlink:title='Marker " << ordinal << ", position " << marker.position << - ", k-mer id " << marker.kmerId; + ", k-mer id " << kmerId; if(hasMarkerGraphVertex) { html << ", coverage " << markerGraph.vertexCoverage(vertexId); } @@ -675,7 +676,7 @@ void Assembler::exploreReadRle( if(hasMarkerGraphVertex) { // Add a hyperlink to the marker graph vertex // that contains this marker. - const string url = "exploreMarkerGraph?vertexId=" + to_string(vertexId) + + const string url = "exploreMarkerGraph0?vertexId=" + to_string(vertexId) + "&maxDistance=2&detailed=on&minCoverage=3&minConsensus=3&sizePixels=320&timeout=30"; html << " xlink:href='" << url << "' style='cursor:pointer'"; } @@ -762,7 +763,7 @@ void Assembler::exploreReadRle( // Loop over all markers on this oriented read. for(uint32_t ordinal=0; ordinal<orientedReadMarkers.size(); ordinal++) { const CompressedMarker& marker = orientedReadMarkers[ordinal]; - const Kmer kmer(marker.kmerId, k); + const Kmer kmer = getOrientedReadMarkerKmer(orientedReadId, ordinal); const uint32_t rlePosition = marker.position; const uint32_t rawPosition = rawPositions[rlePosition]; @@ -783,7 +784,7 @@ void Assembler::exploreReadRle( html << "</code><td class=centered>" << rlePosition << "<td class=centered>" << rawPosition; if(hasMarkerGraphVertex) { - const string url = "exploreMarkerGraph?vertexId=" + to_string(vertexId) + + const string url = "exploreMarkerGraph0?vertexId=" + to_string(vertexId) + "&maxDistance=2&detailed=on&minCoverage=3&minConsensus=3&sizePixels=320&timeout=30"; html << "<td class=centered><a href='" << url << "'>" << vertexId << "</a>" "<td class=centered>" << markerGraph.vertexCoverage(vertexId); @@ -806,7 +807,7 @@ void Assembler::exploreReadRle( for(uint32_t ordinal=0; ordinal<uint32_t(orientedReadMarkers.size()); ordinal++) { const CompressedMarker& marker = orientedReadMarkers[ordinal]; if (marker.position >= beginRlePosition && marker.position <= 
endRlePosition - k) { - kmers.push_back(marker.kmerId); + kmers.push_back(getOrientedReadMarkerKmerId(orientedReadId, ordinal)); } } vector<uint32_t> kmerFrequency; @@ -1081,10 +1082,10 @@ void Assembler::exploreReadRaw( for(const uint64_t ordinal: markersOnThisRow) { const CompressedMarker& marker = orientedReadMarkers[ordinal]; const uint64_t position = marker.position - beginPosition; - const Kmer kmer(marker.kmerId, k); + const Kmer kmer = getOrientedReadMarkerKmer(orientedReadId, ordinal); // Write the required number of spaces. - SHASTA_ASSERT(position > oldPosition); // There must be at least a blank. + SHASTA_ASSERT((position==0) or (position > oldPosition)); // There must be at least a blank. for(uint64_t i=oldPosition; i<position; i++) { html << " "; } @@ -1100,7 +1101,7 @@ void Assembler::exploreReadRaw( // There is a marker graph vertex. // Write the marker as a link to that vertex. - const string url = "exploreMarkerGraph?vertexId=" + to_string(vertexId) + + const string url = "exploreMarkerGraph0?vertexId=" + to_string(vertexId) + "&maxDistance=6&detailed=on&sizePixels=600&timeout=30"; html << "<a href='" << url << "' title='Marker " << ordinal << ", position " << marker.position << diff --git a/src/AssemblerHttpServer.cpp b/src/AssemblerHttpServer.cpp index f128285..ee671b7 100644 --- a/src/AssemblerHttpServer.cpp +++ b/src/AssemblerHttpServer.cpp @@ -8,6 +8,7 @@ #include "platformDependent.hpp" #include "Reads.hpp" using namespace shasta; +using namespace mode0; // Boost libraries. 
#include <boost/tokenizer.hpp> @@ -230,9 +231,11 @@ void Assembler::fillServerFunctionTable() SHASTA_ADD_TO_FUNCTION_TABLE(alignSequencesInMarkerRepresentation); SHASTA_ADD_TO_FUNCTION_TABLE(assessAlignments); SHASTA_ADD_TO_FUNCTION_TABLE(exploreReadGraph); - SHASTA_ADD_TO_FUNCTION_TABLE(exploreMarkerGraph); + SHASTA_ADD_TO_FUNCTION_TABLE(exploreMarkerGraph0); + SHASTA_ADD_TO_FUNCTION_TABLE(exploreMarkerGraph1); SHASTA_ADD_TO_FUNCTION_TABLE(exploreMarkerGraphVertex); SHASTA_ADD_TO_FUNCTION_TABLE(exploreMarkerGraphEdge); + SHASTA_ADD_TO_FUNCTION_TABLE(exploreMarkerGraphEdgePair); SHASTA_ADD_TO_FUNCTION_TABLE(exploreMarkerCoverage); SHASTA_ADD_TO_FUNCTION_TABLE(exploreMarkerGraphInducedAlignment); SHASTA_ADD_TO_FUNCTION_TABLE(followReadInMarkerGraph); @@ -241,14 +244,8 @@ void Assembler::fillServerFunctionTable() SHASTA_ADD_TO_FUNCTION_TABLE(exploreAssemblyGraphEdge); SHASTA_ADD_TO_FUNCTION_TABLE(exploreAssemblyGraphEdgesSupport); SHASTA_ADD_TO_FUNCTION_TABLE(exploreCompressedAssemblyGraph); - SHASTA_ADD_TO_FUNCTION_TABLE(exploreMode3AssemblyGraph); - SHASTA_ADD_TO_FUNCTION_TABLE(exploreMode3AssemblyGraphSegment); - SHASTA_ADD_TO_FUNCTION_TABLE(exploreMode3AssemblyGraphSegmentPair); - SHASTA_ADD_TO_FUNCTION_TABLE(exploreMode3AssemblyGraphLink); - SHASTA_ADD_TO_FUNCTION_TABLE(exploreMode3MetaAlignment); - SHASTA_ADD_TO_FUNCTION_TABLE(exploreMode3AssemblyPath); - SHASTA_ADD_TO_FUNCTION_TABLE(exploreMode3LinkAssembly); + SHASTA_ADD_TO_FUNCTION_TABLE(fillMode3AssemblyPathStep); } #undef SHASTA_ADD_TO_FUNCTION_TABLE @@ -442,15 +439,37 @@ void Assembler::writeNavigation(ostream& html) const writeNavigation(html, "Read graph", { {"Read graph", "exploreReadGraph"}, }); - writeNavigation(html, "Marker graph", { - {"Local marker graph", "exploreMarkerGraph?useBubbleReplacementEdges=on"}, - {"Marker graph vertices", "exploreMarkerGraphVertex"}, - {"Marker graph edges", "exploreMarkerGraphEdge"}, - {"Marker coverage", "exploreMarkerCoverage"}, - {"Induced alignments", 
"exploreMarkerGraphInducedAlignment"}, - {"Follow a read in the marker graph", "followReadInMarkerGraph"}, - {"Marker connectivity", "exploreMarkerConnectivity"}, - }); + + + + if(assemblerInfo->assemblyMode == 3) { + writeNavigation(html, "Marker graph", { + {"Local marker graph", "exploreMarkerGraph0?useBubbleReplacementEdges=on"}, + {"Local marker graph for mode 3 assembly", "exploreMarkerGraph1"}, + {"Marker graph vertices", "exploreMarkerGraphVertex"}, + {"Marker graph edges", "exploreMarkerGraphEdge"}, + {"Marker graph edge pairs", "exploreMarkerGraphEdgePair"}, + {"Marker coverage", "exploreMarkerCoverage"}, + {"Induced alignments", "exploreMarkerGraphInducedAlignment"}, + {"Follow a read in the marker graph", "followReadInMarkerGraph"}, + {"Marker connectivity", "exploreMarkerConnectivity"}, + {"Assembly path step", "fillMode3AssemblyPathStep"}, + {"Path graph", "exploreMode3PathGraph"}, + }); + } else { + writeNavigation(html, "Marker graph", { + {"Local marker graph", "exploreMarkerGraph0?useBubbleReplacementEdges=on"}, + {"Marker graph vertices", "exploreMarkerGraphVertex"}, + {"Marker graph edges", "exploreMarkerGraphEdge"}, + {"Marker coverage", "exploreMarkerCoverage"}, + {"Induced alignments", "exploreMarkerGraphInducedAlignment"}, + {"Follow a read in the marker graph", "followReadInMarkerGraph"}, + {"Marker connectivity", "exploreMarkerConnectivity"}, + }); + } + + + if(assemblerInfo->assemblyMode == 0) { writeNavigation(html, "Assembly graph", { {"Local assembly graph", "exploreAssemblyGraph"}, @@ -459,16 +478,6 @@ void Assembler::writeNavigation(ostream& html) const {"Compressed assembly graph", "exploreCompressedAssemblyGraph"}, }); } - if(assemblerInfo->assemblyMode == 3) { - writeNavigation(html, "Assembly graph", { - {"Local assembly graph", "exploreMode3AssemblyGraph"}, - {"Assembly graph segments", "exploreMode3AssemblyGraphSegment"}, - {"Assembly graph segment pairs", "exploreMode3AssemblyGraphSegmentPair"}, - {"Assembly graph links", 
"exploreMode3AssemblyGraphLink"}, - {"Meta-alignments", "exploreMode3MetaAlignment"}, - {"Assembly paths", "exploreMode3AssemblyPath"}, - }); - } if (!httpServerData.docsDirectory.empty()) { writeNavigation(html, "Help", { @@ -566,6 +575,10 @@ void Assembler::writeGnuPlotPngToHtml( // Write the png file to html. writePngToHtml(html, pngFileName); + + // Remove the files we created. + std::filesystem::remove(gnuplotFileName); + std::filesystem::remove(pngFileName); } @@ -586,9 +599,9 @@ void Assembler::accessAllSoft() bool allDataAreAvailable = true; try { - accessKmers(); + accessKmerChecker(); } catch(const exception& e) { - cout << "K-mers are not accessible." << endl; + cout << "The k-mer checker is not accessible." << endl; allDataAreAvailable = false; } @@ -675,6 +688,17 @@ void Assembler::accessAllSoft() allDataAreAvailable = false; } +#if 0 + if(assemblerInfo->assemblyMode == 3) { + try { + accessMarkerGraphPrimaryJourneys(); + } catch(const exception& e) { + cout << "MarkerGraph graph primary journeys are not accessible." << endl; + allDataAreAvailable = false; + } + } +#endif + try { accessCompressedAlignments(); } catch(const exception& e) { @@ -718,18 +742,6 @@ void Assembler::accessAllSoft() - // Data specific to assembly mode 3. - if(assemblerInfo->assemblyMode == 3) { - try { - accessMode3AssemblyGraph(); - } catch(const exception& e) { - cout << "The mode 3 assembly graph is not accessible." << endl; - allDataAreAvailable = false; - } - } - - - if(!allDataAreAvailable) { cout << "Not all assembly data are accessible." << endl; cout << "Some functionality is not available." << endl; @@ -906,20 +918,7 @@ void Assembler::writeAssemblySummary(ostream& html) void Assembler::writeAssemblySummaryBody(ostream& html) { using std::setprecision; - AssemblyGraph& assemblyGraph = *assemblyGraphPointer; - - - // Compute the number of run-length k-mers used as markers. 
- uint64_t totalRleKmerCount = 0; - uint64_t markerRleKmerCount = 0; - for(const auto& tableEntry: kmerTable) { - if(tableEntry.isRleKmer) { - ++totalRleKmerCount; - if(tableEntry.isMarker) { - ++markerRleKmerCount; - } - } - } + mode0::AssemblyGraph& assemblyGraph = *assemblyGraphPointer; const uint64_t totalDiscardedReadCount = assemblerInfo->discardedInvalidBaseReadCount + @@ -1006,21 +1005,6 @@ void Assembler::writeAssemblySummaryBody(ostream& html) html << - "<h3>Marker <i>k</i>-mers</h3>" - "<table>" - "<tr><td>Length <i>k</i> of <i>k</i>-mers used as markers" - "<td class=right>" << assemblerInfo->k << - "<tr><td>Total number of <i>k</i>-mers" - "<td class=right>" << totalRleKmerCount << - "<tr><td>Number of <i>k</i>-mers used as markers" - "<td class=right>" << markerRleKmerCount << - "<tr><td>Fraction of <i>k</i>-mers used as markers" - "<td class=right>" << setprecision(3) << double(markerRleKmerCount) / double(totalRleKmerCount) << - "</table>" - "<ul><li>In the above table, all <i>k</i>-mer counts only include run-length encoded <i>k</i>-mers, " - "that is, <i>k</i>-mers without repeated bases.</ul>" - - "<h3>Markers</h3>" "<table>" @@ -1240,20 +1224,6 @@ void Assembler::writeAssemblySummaryJson(ostream& json) AssemblyGraph& assemblyGraph = *assemblyGraphPointer; using std::setprecision; - - - // Compute the number of run-length k-mers used as markers. 
- uint64_t totalRleKmerCount = 0; - uint64_t markerRleKmerCount = 0; - for(const auto& tableEntry: kmerTable) { - if(tableEntry.isRleKmer) { - ++totalRleKmerCount; - if(tableEntry.isMarker) { - ++markerRleKmerCount; - } - } - } - const uint64_t totalDiscardedReadCount = assemblerInfo->discardedInvalidBaseReadCount + assemblerInfo->discardedShortReadReadCount + @@ -1332,17 +1302,6 @@ void Assembler::writeAssemblySummaryJson(ostream& json) double(totalDiscardedBaseCount + assemblerInfo->baseCount) << "\n" " }\n" - " },\n"; - - - json << - " \"Marker k-mers\":\n" - " {\n" - " \"Length k of k-mers used as markers\": " << assemblerInfo->k << ",\n" - " \"Total number of k-mers\": " << totalRleKmerCount << ",\n" - " \"Number of k-mers used as markers\": " << markerRleKmerCount << ",\n" - " \"Fraction of k<-mers used as markers\": " << - setprecision(3) << double(markerRleKmerCount) / double(totalRleKmerCount) << " },\n" diff --git a/src/AssemblerLowHash.cpp b/src/AssemblerLowHash.cpp index 8e67654..afba1bf 100644 --- a/src/AssemblerLowHash.cpp +++ b/src/AssemblerLowHash.cpp @@ -1,6 +1,5 @@ #include "Assembler.hpp" #include "LowHash0.hpp" -#include "LowHash1.hpp" using namespace shasta; @@ -23,7 +22,7 @@ void Assembler::findAlignmentCandidatesLowHash0( { // Check that we have what we need. - checkKmersAreOpen(); + SHASTA_ASSERT(kmerChecker); checkMarkersAreOpen(); const ReadId readCount = ReadId(markers.size() / 2); SHASTA_ASSERT(readCount > 0); @@ -43,9 +42,8 @@ void Assembler::findAlignmentCandidatesLowHash0( maxBucketSize, minFrequency, threadCount, - kmerTable, getReads(), - markers, + markerKmerIds, alignmentCandidates.candidates, readLowHashStatistics, largeDataFileNamePrefix, @@ -132,52 +130,6 @@ void Assembler::writeOverlappingReads( -// New version that also stores alignmentCandidates.featureOrdinals. -// This can be used to filter the alignment candidates. 
-void Assembler::findAlignmentCandidatesLowHash1( - size_t m, // Number of consecutive k-mers that define a feature. - double hashFraction, // Low hash threshold. - size_t minHashIterationCount, // Number of lowHash iterations. - size_t log2MinHashBucketCount, // Base 2 log of number of buckets for lowHash. - size_t minBucketSize, // The minimum size for a bucket to be used. - size_t maxBucketSize, // The maximum size for a bucket to be used. - size_t minFrequency, // Minimum number of minHash hits for a pair to become a candidate. - size_t threadCount) -{ - // Check that we have what we need. - checkKmersAreOpen(); - checkMarkersAreOpen(); - const ReadId readCount = ReadId(markers.size() / 2); - SHASTA_ASSERT(readCount > 0); - - // Prepare storage. - alignmentCandidates.candidates.createNew( - largeDataName("AlignmentCandidates"), largeDataPageSize); - alignmentCandidates.featureOrdinals.createNew( - largeDataName("AlignmentCandidatesFeatureOrdinale"), largeDataPageSize); - - // Do the computation. - LowHash1 lowHash1( - m, - hashFraction, - minHashIterationCount, - log2MinHashBucketCount, - minBucketSize, - maxBucketSize, - minFrequency, - threadCount, - kmerTable, - getReads(), - markers, - alignmentCandidates, - largeDataFileNamePrefix, - largeDataPageSize); - - alignmentCandidates.unreserve(); -} - - - void Assembler::writeAlignmentCandidates(bool useReadName, bool verbose) const { diff --git a/src/AssemblerMarkerGraph.cpp b/src/AssemblerMarkerGraph.cpp index cb37ce0..9a73df2 100644 --- a/src/AssemblerMarkerGraph.cpp +++ b/src/AssemblerMarkerGraph.cpp @@ -6,12 +6,14 @@ #include "compressAlignment.hpp" #include "Coverage.hpp" #include "dset64-gccAtomic.hpp" +#include "extractKmer.hpp" #include "PeakFinder.hpp" #include "performanceLog.hpp" -#include "LocalMarkerGraph.hpp" +#include "LocalMarkerGraph0.hpp" #include "Reads.hpp" #include "timestamp.hpp" using namespace shasta; +using namespace mode0; // Spoa. 
#include "spoa/spoa.hpp" @@ -77,7 +79,7 @@ void Assembler::createMarkerGraphVertices( // Check that we have what we need. reads->checkReadsAreOpen(); reads->checkReadFlagsAreOpen(); - checkKmersAreOpen(); + SHASTA_ASSERT(kmerChecker); checkMarkersAreOpen(); checkAlignmentDataAreOpen(); SHASTA_ASSERT(compressedAlignments.isOpen()); @@ -221,6 +223,22 @@ void Assembler::createMarkerGraphVertices( ++histogram[markerCount]; } + // Store the disjoint sets histogram in a MemoryMapped::Vector. + // This is used when flagging primary marker graph edges for Mode 3 assembly. + // This stored pairs(coverage, frequency). + // Only pairs where the frequency is not zero are stored. + { + markerGraph.disjointSetsHistogram.createNew( + largeDataName("DisjointSetsHistogram"), + largeDataPageSize); + for(uint64_t coverage=0; coverage<histogram.size(); coverage++) { + const uint64_t frequency = histogram[coverage]; + if(frequency) { + markerGraph.disjointSetsHistogram.push_back({coverage, frequency}); + } + } + } + ofstream csv("DisjointSetsHistogram.csv"); csv << "Coverage,Frequency\n"; for(uint64_t coverage=0; coverage<histogram.size(); coverage++) { @@ -592,7 +610,6 @@ void Assembler::createMarkerGraphVerticesThreadFunction1(size_t threadId) const uint32_t ordinal1 = p[1]; const MarkerId markerId0 = getMarkerId(orientedReadIds[0], ordinal0); const MarkerId markerId1 = getMarkerId(orientedReadIds[1], ordinal1); - SHASTA_ASSERT(markers.begin()[markerId0].kmerId == markers.begin()[markerId1].kmerId); disjointSetsPointer->unite(markerId0, markerId1); // Also merge the reverse complemented markers. @@ -1238,6 +1255,11 @@ void Assembler::accessMarkerGraphReverseComplementVertex(bool readWriteAccess) readWriteAccess); } +void Assembler::accessDisjointSetsHistogram() +{ + markerGraph.disjointSetsHistogram.accessExistingReadOnly(largeDataName("DisjointSetsHistogram")); +} + // Find the reverse complement of each marker graph edge. 
@@ -1489,12 +1511,13 @@ void Assembler::checkMarkerGraphIsStrandSymmetricThreadFunction2(size_t threadId const MarkerGraph::Edge& edge0 = markerGraph.edges[e0]; const MarkerGraph::Edge& edge1 = markerGraph.edges[e1]; - SHASTA_ASSERT(edge0.coverage == edge1.coverage); + SHASTA_ASSERT(markerGraph.edgeCoverage(e0) == markerGraph.edgeCoverage(e1)); SHASTA_ASSERT( edge0.wasRemovedByTransitiveReduction == edge1.wasRemovedByTransitiveReduction); SHASTA_ASSERT(edge0.wasPruned == edge1.wasPruned); SHASTA_ASSERT(edge0.isSuperBubbleEdge == edge1.isSuperBubbleEdge); + SHASTA_ASSERT(edge0.isLowCoverageCrossEdge == edge1.isLowCoverageCrossEdge); #if 0 // This portion does not work if parallel edges are present, @@ -1618,103 +1641,6 @@ void Assembler::writeBadMarkerGraphVertices() const -// Compute marker graph vertex coverage statistics by KmerId. -void Assembler::vertexCoverageStatisticsByKmerId() const -{ - // Check that we have what we need. - checkKmersAreOpen(); - checkMarkersAreOpen(); - checkMarkerGraphVerticesAreAvailable(); - - const uint64_t k = assemblerInfo->k; - - // For each KmerId, maintain a histogram by coverage. - vector< vector<uint64_t> > histogram(kmerTable.size()); - - // Loop over all marker graph vertices. - for(MarkerGraph::VertexId vertexId=0; vertexId!=markerGraph.vertexCount(); vertexId++) { - - // Get the markers for this vertex. - const span<const MarkerId> markerIds = markerGraph.getVertexMarkerIds(vertexId); - const uint64_t coverage = markerIds.size(); - SHASTA_ASSERT(coverage > 0); - - // Find the KmerId. - const MarkerId firstMarkerId = markerIds.front(); - const CompressedMarker& compressedMarker = markers.begin()[firstMarkerId]; - const KmerId kmerId = compressedMarker.kmerId; - - // Increment the histogram. - SHASTA_ASSERT(kmerId < histogram.size()); - vector<uint64_t>& h = histogram[kmerId]; - if(h.size() <= coverage) { - h.resize(coverage + 1, 0ULL); - } - ++h[coverage]; - } - - - - // Find the maximum histogram size for any k-mer. 
- uint64_t hMaxSize = 0ULL; - for(uint64_t kmerId=0; kmerId<kmerTable.size(); kmerId++) { - if(not kmerTable[kmerId].isMarker) { - continue; - } - if(not kmerTable[kmerId].isRleKmer) { - continue; - } - const vector<uint64_t>& h = histogram[kmerId]; - hMaxSize = max(hMaxSize, uint64_t(h.size())); - } - - - - // Write it out. - ofstream csv("VertexCoverageByKmerId.csv"); - csv << "Kmer,Total,"; - for(uint64_t coverage=1; coverage<hMaxSize; coverage++) { - csv << coverage << ","; - } - csv << "\n"; - for(uint64_t kmerId=0; kmerId<kmerTable.size(); kmerId++) { - if(not kmerTable[kmerId].isMarker) { - continue; - } - if(not kmerTable[kmerId].isRleKmer) { - continue; - } - const Kmer kmer(kmerId, k); - - // Compute the total number of markers with this k-mer - // that are associated with a vertex. - const vector<uint64_t>& h = histogram[kmerId]; - uint64_t totalMarkerCount = 0ULL; - for(uint64_t coverage=1; coverage<hMaxSize; coverage++) { - uint64_t vertexCount = 0; - if(coverage < h.size()) { - vertexCount = h[coverage]; - } - const uint64_t markerCount = coverage * vertexCount; - totalMarkerCount += markerCount; - } - - kmer.write(csv, k); - csv << "," << totalMarkerCount << ","; - for(uint64_t coverage=1; coverage<hMaxSize; coverage++) { - uint64_t vertexCount = 0; - if(coverage < h.size()) { - vertexCount = h[coverage]; - } - const uint64_t markerCount = coverage * vertexCount; - csv << markerCount << ","; - } - csv << "\n"; - } -} - - - bool Assembler::extractLocalMarkerGraph( OrientedReadId orientedReadId, uint32_t ordinal, @@ -1727,7 +1653,7 @@ bool Assembler::extractLocalMarkerGraph( bool useSuperBubbleEdges, bool useLowCoverageCrossEdges, bool useRemovedSecondaryEdges, - LocalMarkerGraph& graph + LocalMarkerGraph0& graph ) { const MarkerGraph::VertexId startVertexId = @@ -1758,7 +1684,7 @@ bool Assembler::extractLocalMarkerGraph( bool useSuperBubbleEdges, bool useLowCoverageCrossEdges, bool useRemovedSecondaryEdges, - LocalMarkerGraph& graph + 
LocalMarkerGraph0& graph ) { // Sanity check. @@ -1766,8 +1692,8 @@ bool Assembler::extractLocalMarkerGraph( // Some shorthands. AssemblyGraph& assemblyGraph = *assemblyGraphPointer; - using vertex_descriptor = LocalMarkerGraph::vertex_descriptor; - using edge_descriptor = LocalMarkerGraph::edge_descriptor; + using vertex_descriptor = LocalMarkerGraph0::vertex_descriptor; + using edge_descriptor = LocalMarkerGraph0::edge_descriptor; // Start a timer. const auto startTime = steady_clock::now(); @@ -1800,7 +1726,7 @@ bool Assembler::extractLocalMarkerGraph( // Dequeue a vertex. const vertex_descriptor v0 = q.front(); q.pop(); - const LocalMarkerGraphVertex& vertex0 = graph[v0]; + const LocalMarkerGraph0Vertex& vertex0 = graph[v0]; const MarkerGraph::VertexId vertexId0 = vertex0.vertexId; const uint64_t distance0 = vertex0.distance; const uint64_t distance1 = distance0 + 1; @@ -1903,8 +1829,8 @@ bool Assembler::extractLocalMarkerGraph( // Create edges. - BGL_FORALL_VERTICES(v0, graph, LocalMarkerGraph) { - const LocalMarkerGraphVertex& vertex0 = graph[v0]; + BGL_FORALL_VERTICES(v0, graph, LocalMarkerGraph0) { + const LocalMarkerGraph0Vertex& vertex0 = graph[v0]; const MarkerGraph::VertexId vertexId0 = vertex0.vertexId; // Loop over the children that exist in the local marker graph @@ -1979,8 +1905,8 @@ bool Assembler::extractLocalMarkerGraph( // Store consensus repeat counts for all vertices. 
if(markerGraph.vertexRepeatCounts.isOpen) { const size_t k = assemblerInfo->k; - BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph) { - LocalMarkerGraphVertex& vertex = graph[v]; + BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph0) { + LocalMarkerGraph0Vertex& vertex = graph[v]; vertex.storedConsensusRepeatCounts.resize(k); const uint8_t* begin = markerGraph.vertexRepeatCounts.begin() + k * vertex.vertexId; copy(begin, begin+k, vertex.storedConsensusRepeatCounts.begin()); @@ -2004,8 +1930,8 @@ bool Assembler::extractLocalMarkerGraph( const int8_t gap = -1; auto spoaAlignmentEngine = spoa::AlignmentEngine::Create(alignmentType, match, mismatch, gap); spoa::Graph spoaAlignmentGraph; - BGL_FORALL_EDGES(e, graph, LocalMarkerGraph) { - LocalMarkerGraphEdge& edge = graph[e]; + BGL_FORALL_EDGES(e, graph, LocalMarkerGraph0) { + LocalMarkerGraph0Edge& edge = graph[e]; ComputeMarkerGraphEdgeConsensusSequenceUsingSpoaDetail detail; computeMarkerGraphEdgeConsensusSequenceUsingSpoa( edge.edgeId, @@ -2156,12 +2082,6 @@ void Assembler::createMarkerGraphEdgesThreadFunction0(size_t threadId) const auto vertex1 = p.first; const auto& markerIntervals = p.second; edge.target = vertex1; - size_t coverage = markerIntervals.size(); - if(coverage < 256) { - edge.coverage = uint8_t(coverage); - } else { - edge.coverage = 255; - } // Store the edge. thisThreadEdges.push_back(edge); @@ -2302,34 +2222,41 @@ void Assembler::transitiveReduction( edge.isSuperBubbleEdge = 0; } - // Gather edges for each coverage less than highCoverageThreshold. + // Compute maximum edge coverage. + uint64_t maximumEdgeCoverage = 0; + for(EdgeId edgeId=0; edgeId!=edges.size(); edgeId++) { + maximumEdgeCoverage = max(maximumEdgeCoverage, markerGraph.edgeCoverage(edgeId)); + } + cout << "Maximum edge coverage is " << maximumEdgeCoverage << endl; + + + + // Gather edges for each coverage up to maximumEdgeCoverage. // Only add to the list those with id less than the id of their reverse complement. 
- MemoryMapped::VectorOfVectors<EdgeId, EdgeId> edgesByCoverage; + MemoryMapped::VectorOfVectors<EdgeId, EdgeId> edgesByCoverage; edgesByCoverage.createNew( largeDataName("tmp-flagMarkerGraphWeakEdges-edgesByCoverage"), largeDataPageSize); - edgesByCoverage.beginPass1(highCoverageThreshold); + edgesByCoverage.beginPass1(maximumEdgeCoverage + 1); for(EdgeId edgeId=0; edgeId!=edges.size(); edgeId++) { if (markerGraph.reverseComplementEdge[edgeId] < edgeId) { continue; } - const MarkerGraph::Edge& edge = edges[edgeId]; - if(edge.coverage < highCoverageThreshold) { - edgesByCoverage.incrementCount(edge.coverage); - } + const uint64_t coverage = markerGraph.edgeCoverage(edgeId); + edgesByCoverage.incrementCount(coverage); } edgesByCoverage.beginPass2(); for(EdgeId edgeId=0; edgeId!=edges.size(); edgeId++) { if (markerGraph.reverseComplementEdge[edgeId] < edgeId) { continue; } - const MarkerGraph::Edge& edge = edges[edgeId]; - if(edge.coverage < highCoverageThreshold) { - edgesByCoverage.store(edge.coverage, edgeId); - } + const uint64_t coverage = markerGraph.edgeCoverage(edgeId); + edgesByCoverage.store(coverage, edgeId); } edgesByCoverage.endPass2(); + + // Check that there are no edges with coverage 0. SHASTA_ASSERT(edgesByCoverage[0].size() == 0); @@ -2351,12 +2278,8 @@ void Assembler::transitiveReduction( // Flag as weak all edges with coverage <= lowCoverageThreshold - for(size_t coverage=1; coverage<=lowCoverageThreshold; coverage++) { + for(size_t coverage=1; coverage<=min(lowCoverageThreshold, maximumEdgeCoverage); coverage++) { const auto& edgesWithThisCoverage = edgesByCoverage[coverage]; - if(edgesWithThisCoverage.size() > 0) { - cout << "Flagging as weak " << 2 * edgesWithThisCoverage.size() << " edges with coverage " - << coverage << "." 
<< endl; - } for(const EdgeId edgeId: edgesWithThisCoverage) { edges[edgeId].wasRemovedByTransitiveReduction = 1; edges[markerGraph.reverseComplementEdge[edgeId]].wasRemovedByTransitiveReduction = 1; @@ -2393,12 +2316,11 @@ void Assembler::transitiveReduction( // Process edges of intermediate coverage. for(size_t coverage=lowCoverageThreshold+1; - coverage<highCoverageThreshold; coverage++) { + coverage<min(highCoverageThreshold, maximumEdgeCoverage+1); coverage++) { const auto& edgesWithThisCoverage = edgesByCoverage[coverage]; if(edgesWithThisCoverage.size() == 0) { continue; } - size_t count = 0; // Loop over edges with this coverage. for(const EdgeId edgeId: edgesWithThisCoverage) { @@ -2454,7 +2376,6 @@ void Assembler::transitiveReduction( if(found) { edges[edgeId].wasRemovedByTransitiveReduction = 1; edges[markerGraph.reverseComplementEdge[edgeId]].wasRemovedByTransitiveReduction = 1; - count += 2; } // Clean up to be ready to process the next edge. @@ -2466,12 +2387,6 @@ void Assembler::transitiveReduction( } bfsVertices.clear(); } - - if(count) { - cout << "Flagged as weak " << count << - " edges with coverage " << coverage << - " out of "<< 2*edgesWithThisCoverage.size() << " total." << endl; - } } @@ -2500,179 +2415,6 @@ void Assembler::transitiveReduction( -// Approximate reverse transitive reduction of the marker graph. -// The goal is to remove local back-edges. -// This works similarly to transitive reduction, -// but in the opposite direction. -// This does the following: -// - Edges with coverage greater than lowCoverageThreshold -// and less then highCoverageThreshold are processed in -// ordered of increasing coverage: -// * For each such edge A->B, we look for a path of length -// at most maxDistance starting at B and ending at A -// that does not use edge A->B and also does not use any -// edges already marked wasRemovedByTransitiveReduction. -// * If such a path is found, the edge is marked -// wasRemovedByTransitiveReduction. 
-void Assembler::reverseTransitiveReduction( - size_t lowCoverageThreshold, - size_t highCoverageThreshold, - size_t maxDistance) -{ - // Some shorthands for readability. - auto& edges = markerGraph.edges; - using VertexId = MarkerGraph::VertexId; - using EdgeId = MarkerGraph::EdgeId; - using Edge = MarkerGraph::Edge; - - // Initial message. - cout << timestamp << "Reverse transitive reduction of the marker graph begins." << endl; - cout << "The marker graph has " << markerGraph.vertexCount() << " vertices and "; - cout << edges.size() << " edges." << endl; - - // Gather edges for each coverage less than highCoverageThreshold. - // Only add to the list those with id less than the id of their reverse complement. - MemoryMapped::VectorOfVectors<EdgeId, EdgeId> edgesByCoverage; - edgesByCoverage.createNew( - largeDataName("tmp-flagMarkerGraphWeakEdges-edgesByCoverage"), - largeDataPageSize); - edgesByCoverage.beginPass1(highCoverageThreshold); - for(EdgeId edgeId=0; edgeId!=edges.size(); edgeId++) { - if (markerGraph.reverseComplementEdge[edgeId] < edgeId) { - continue; - } - const MarkerGraph::Edge& edge = edges[edgeId]; - if(edge.coverage>lowCoverageThreshold && edge.coverage<highCoverageThreshold) { - edgesByCoverage.incrementCount(edge.coverage); - } - } - edgesByCoverage.beginPass2(); - for(EdgeId edgeId=0; edgeId!=edges.size(); edgeId++) { - if (markerGraph.reverseComplementEdge[edgeId] < edgeId) { - continue; - } - const MarkerGraph::Edge& edge = edges[edgeId]; - if(edge.coverage>lowCoverageThreshold && edge.coverage<highCoverageThreshold) { - edgesByCoverage.store(edge.coverage, edgeId); - } - } - edgesByCoverage.endPass2(); - - // Vector to contain vertex distances during each BFS. - // Is is set to -1 for vertices not reached by the BFS. 
- MemoryMapped::Vector<int> vertexDistances; - vertexDistances.createNew( - largeDataName("tmp-flagMarkerGraphWeakEdges-vertexDistances"), - largeDataPageSize); - vertexDistances.resize(markerGraph.vertexCount()); - fill(vertexDistances.begin(), vertexDistances.end(), -1); - - // Queue to be used for all BFSs. - std::queue<VertexId> q; - - // Vector to store vertices encountered during a BFS. - vector<VertexId> bfsVertices; - - - - // Process edges in the specified coverage range. - size_t removedCount = 0; - for(size_t coverage=lowCoverageThreshold+1; - coverage<highCoverageThreshold; coverage++) { - const auto& edgesWithThisCoverage = edgesByCoverage[coverage]; - if(edgesWithThisCoverage.size() == 0) { - continue; - } - size_t count = 0; - - // Loop over edges with this coverage. - for(const EdgeId edgeId: edgesWithThisCoverage) { - const Edge& edge = edges[edgeId]; - if(edge.wasRemovedByTransitiveReduction) { - continue; - } - const VertexId u0 = edge.target; - const VertexId u1 = edge.source; - - // Do a forward BFS starting at u0, up to distance maxDistance, - // using only edges currently marked as strong - // and without using this edge. - // If we encounter u1, u1 is reachable from u0 without - // using this edge, and so we can mark this edge as weak. - q.push(u0); - vertexDistances[u0] = 0; - bfsVertices.push_back(u0); - bool found = false; - while(!q.empty()) { - const VertexId v0 = q.front(); - q.pop(); - const int distance0 = vertexDistances[v0]; - const int distance1 = distance0 + 1; - for(const auto edgeId01: markerGraph.edgesBySource[v0]) { - if(edgeId01 == edgeId) { - continue; - } - const Edge& edge01 = markerGraph.edges[edgeId01]; - if(edge01.wasRemovedByTransitiveReduction) { - continue; - } - const VertexId v1 = edge01.target; - if(vertexDistances[v1] >= 0) { - continue; // We already encountered this vertex. - } - if(v1 == u1) { - // We found it! 
- found = true; - break; - } - vertexDistances[v1] = distance1; - bfsVertices.push_back(v1); - if(distance1 < int(maxDistance)) { - q.push(v1); - } - } - if(found) { - break; - } - } - - if(found) { - edges[edgeId].wasRemovedByTransitiveReduction = 1; - edges[markerGraph.reverseComplementEdge[edgeId]].wasRemovedByTransitiveReduction = 1; - count += 2; - } - - // Clean up to be ready to process the next edge. - while(!q.empty()) { - q.pop(); - } - for(const VertexId v: bfsVertices) { - vertexDistances[v] = -1; - } - bfsVertices.clear(); - } - - if(count) { - cout << timestamp << "Reverse transitive reduction removed " << count << - " edges with coverage " << coverage << - " out of "<< 2*edgesWithThisCoverage.size() << " total." << endl; - } - removedCount += count; - } - cout << timestamp << "Reverse transitive reduction removed " << removedCount <<" edges." << endl; - - - // Clean up our work areas. - edgesByCoverage.remove(); - // edgeFlags.remove(); - vertexDistances.remove(); - - cout << timestamp << "Reverse transitive reduction of the marker graph ends." << endl; - -} - - - // Return true if an edge disconnects the local subgraph. bool Assembler::markerGraphEdgeDisconnectsLocalStrongSubgraph( MarkerGraph::EdgeId startEdgeId, @@ -4514,7 +4256,7 @@ void Assembler::assembleMarkerGraphVertices(size_t threadCount) SHASTA_ASSERT(assemblerInfo->readRepresentation == 1); // Check that we have what we need. - checkKmersAreOpen(); + SHASTA_ASSERT(kmerChecker); reads->checkReadsAreOpen(); checkMarkersAreOpen(); checkMarkerGraphVerticesAreAvailable(); @@ -4582,7 +4324,7 @@ void Assembler::computeMarkerGraphVerticesCoverageData(size_t threadCount) performanceLog << timestamp<< "computeMarkerGraphVerticesCoverageData begins." << endl; // Check that we have what we need. 
- checkKmersAreOpen(); + SHASTA_ASSERT(kmerChecker); reads->checkReadsAreOpen(); checkMarkersAreOpen(); checkMarkerGraphVerticesAreAvailable(); @@ -4753,7 +4495,7 @@ void Assembler::assembleMarkerGraphEdges( performanceLog << timestamp << "assembleMarkerGraphEdges begins." << endl; // Check that we have what we need. - checkKmersAreOpen(); + SHASTA_ASSERT(kmerChecker); reads->checkReadsAreOpen(); checkMarkersAreOpen(); checkMarkerGraphVerticesAreAvailable(); @@ -4997,14 +4739,19 @@ void Assembler::assembleMarkerGraphEdgesThreadFunction(size_t threadId) void Assembler::accessMarkerGraphConsensus() { - if(assemblerInfo->readRepresentation == 1) { - markerGraph.vertexRepeatCounts.accessExistingReadOnly( - largeDataName("MarkerGraphVertexRepeatCounts")); + if(assemblerInfo->assemblyMode == 3) { + markerGraph.edgeSequence.accessExistingReadOnly(largeDataName("MarkerGraphEdgesSequence")); + + } else { + if(assemblerInfo->readRepresentation == 1) { + markerGraph.vertexRepeatCounts.accessExistingReadOnly( + largeDataName("MarkerGraphVertexRepeatCounts")); + } + markerGraph.edgeConsensus.accessExistingReadOnly( + largeDataName("MarkerGraphEdgesConsensus")); + markerGraph.edgeConsensusOverlappingBaseCount.accessExistingReadOnly( + largeDataName("MarkerGraphEdgesConsensusOverlappingBaseCount")); } - markerGraph.edgeConsensus.accessExistingReadOnly( - largeDataName("MarkerGraphEdgesConsensus")); - markerGraph.edgeConsensusOverlappingBaseCount.accessExistingReadOnly( - largeDataName("MarkerGraphEdgesConsensusOverlappingBaseCount")); } @@ -5065,7 +4812,8 @@ void Assembler::computeMarkerGraphCoverageHistogram() // Edges. vector<uint64_t> edgeCoverageHistogram; - for(const MarkerGraph::Edge& edge: markerGraph.edges) { + for(MarkerGraphEdgeId edgeId=0; edgeId<markerGraph.edges.size(); edgeId++) { + const MarkerGraph::Edge& edge = markerGraph.edges[edgeId]; // If this edge was removed, skip it. 
if(edge.wasRemoved()) { @@ -5073,7 +4821,7 @@ void Assembler::computeMarkerGraphCoverageHistogram() } // Increment the histogram. - const size_t coverage = edge.coverage; + const size_t coverage = markerGraph.edgeCoverage(edgeId); if(coverage >= edgeCoverageHistogram.size()) { edgeCoverageHistogram.resize(coverage+1, 0); } @@ -5410,153 +5158,12 @@ void Assembler::debugWriteMarkerGraph(const string& fileNamePrefix) const -// Assemble the RLE sequence of a path of the marker graph, under the assumption -// that, for each edge, all oriented reads have exactly the same sequence. -// This will be the case if edges were created by Assembler::createMarkerGraphEdgesStrict. -void Assembler::assembleMarkerGraphPathRleStrict( - span<const MarkerGraph::EdgeId> path, - vector<Base>& rleSequence - ) const -{ - using VertexId = MarkerGraph::VertexId; - using EdgeId = MarkerGraph::EdgeId; - const uint64_t k = assemblerInfo->k; - - // Start with no sequence. - rleSequence.clear(); - if(path.empty()) { - return; - } - - // Add the RLE sequence of the first vertex. - VertexId v0 = markerGraph.edges[path.front()].source; - const MarkerId firstMarkerId = markerGraph.getVertexMarkerIds(v0)[0]; - const CompressedMarker& firstMarker = markers.begin()[firstMarkerId]; - const KmerId kmerId = firstMarker.kmerId; - const Kmer kmer(kmerId, k); - for(uint64_t i=0; i<k; i++) { - rleSequence.push_back(kmer[i]); - } - - - - // Loop over edges of the path. - for(const EdgeId edgeId: path) { - const MarkerGraph::Edge& edge = markerGraph.edges[edgeId]; - SHASTA_ASSERT(edge.source == v0); - const VertexId v1 = edge.target; - - const span<const MarkerInterval> markerIntervals = markerGraph.edgeMarkerIntervals[edgeId]; - SHASTA_ASSERT(not markerIntervals.empty()); - - // Get the RLE sequence and check that all the MarkerIntervals agree. - // This will be the case if edges were created by Assembler::createMarkerGraphEdgesStrict. 
- uint64_t overlappingRleBaseCount; - vector<Base> edgeRleSequence; - getMarkerIntervalRleSequence( - markerIntervals.front(), - overlappingRleBaseCount, - edgeRleSequence); - uint64_t markerIntervalOverlappingRleBaseCount; - vector<Base> markerIntervalRleSequence; - for(const MarkerInterval& markerInterval: markerIntervals) { - getMarkerIntervalRleSequence( - markerInterval, - markerIntervalOverlappingRleBaseCount, - markerIntervalRleSequence); - SHASTA_ASSERT(markerIntervalOverlappingRleBaseCount == overlappingRleBaseCount); - SHASTA_ASSERT(markerIntervalRleSequence == edgeRleSequence); - } - - - - // Construct the sequence of the v1 vertex. - const MarkerId markerId1 = markerGraph.getVertexMarkerIds(v1)[0]; - const CompressedMarker& marker1 = markers.begin()[markerId1]; - const KmerId kmerId1 = marker1.kmerId; - const Kmer kmer1(kmerId1, k); - - - // Add the sequence of this edge and the v1 vertex. - if(overlappingRleBaseCount == 0) { - - // There is no overlap. - - // Add the edge sequence. - copy(edgeRleSequence.begin(), edgeRleSequence.end(), back_inserter(rleSequence)); - - // Add the entire sequence of v1. - for(uint64_t i=0; i<k; i++) { - rleSequence.push_back(kmer1[i]); - } - - } else { - - // There is overlap. - // Add the sequence of v1, excluding the overlapping bases. - for(uint64_t i=overlappingRleBaseCount; i<k; i++) { - rleSequence.push_back(kmer1[i]); - } - } - - - // Prepare to process the next edge. - v0 = v1; - } -} - - - -void Assembler::assembleAssemblyGraphEdgeRleStrict( - AssemblyGraph::EdgeId edgeId, - vector<Base>& rleSequence -) const -{ - const AssemblyGraph& assemblyGraph = *assemblyGraphPointer; - assembleMarkerGraphPathRleStrict( - assemblyGraph.edgeLists[edgeId], - rleSequence); -} - - - -// Get the RLE sequence implied by a MarkerInterval. -// If the markers overlap, returns the number of -// overlapping RLE bases in overlappingRleBaseCount -// and empty rleSequence. 
-// Otherwise, returns zero overlappingRleBaseCount -// and the intervening sequence in rleSequence -// (which can be empty if the two markers are exactly adjacent). -void Assembler::getMarkerIntervalRleSequence( - const MarkerInterval& markerInterval, - uint64_t& overlappingRleBaseCount, - vector<Base>& rleSequence) const +// Find the common KmerId for all the markers of a marker graph vertex. +KmerId Assembler::getMarkerGraphVertexKmerId(MarkerGraphVertexId vertexId) const { - const uint64_t k = assemblerInfo->k; - const OrientedReadId orientedReadId = markerInterval.orientedReadId; - - // Extract the k-mers and their RLE positions in this oriented read. - array<Kmer, 2> kmers; - array<uint32_t, 2> positions; - for(uint64_t i=0; i<2; i++) { - const MarkerId markerId = getMarkerId(orientedReadId, markerInterval.ordinals[i]); - const CompressedMarker& compressedMarker = markers.begin()[markerId]; - kmers[i] = Kmer(compressedMarker.kmerId, k); - positions[i] = compressedMarker.position; - } - - - if(positions[1] < positions[0] + k) { - // The two markers overlap. - overlappingRleBaseCount = (positions[0] + k) - positions[1]; - rleSequence.clear(); - } else { - // The two markers don't overlap. - overlappingRleBaseCount = 0; - rleSequence.clear(); - for(uint32_t position=positions[0]+uint32_t(k); position<positions[1]; position++) { - rleSequence.push_back(getReads().getOrientedReadBase(orientedReadId, position)); - } - } + return markerGraph.getVertexKmerId( + vertexId, + assemblerInfo->k, + *reads, + markers); } - diff --git a/src/AssemblerMarkerGraphEdges.cpp b/src/AssemblerMarkerGraphEdges.cpp index b5f6275..c3bd1e6 100644 --- a/src/AssemblerMarkerGraphEdges.cpp +++ b/src/AssemblerMarkerGraphEdges.cpp @@ -332,15 +332,11 @@ void Assembler::createMarkerGraphEdgesStrictPass3(size_t threadId) if( (strandCoverage[0] >= minEdgeCoveragePerStrand) and (strandCoverage[1] >= minEdgeCoveragePerStrand)) { - // If getting here, we actually generate an edge. 
- uint64_t coverage = candidateEdge.size(); - // Store the edge. MarkerGraph::Edge edge; edge.clearFlags(); edge.source = vertexId0; edge.target = vertexId1; - edge.coverage = (coverage > 255) ? 255 : uint8_t(coverage); thisThreadEdges.push_back(edge); // Store the marker intervals. @@ -572,12 +568,6 @@ void Assembler::createMarkerGraphSecondaryEdges( MarkerGraph::Edge edge; edge.source = v0; edge.target = v1; - const uint64_t coverage = markerIntervals.size(); - if(coverage < 256) { - edge.coverage = uint8_t(coverage); - } else { - edge.coverage = 255; - } edge.isSecondary = 1; markerGraph.edges.push_back(edge); markerGraph.edgeMarkerIntervals.appendVector(markerIntervals); @@ -630,10 +620,8 @@ vector< vector<uint64_t> > Assembler::clusterMarkerGraphEdgeOrientedReads( // The length of each marker sequence. const size_t k = assemblerInfo->k; - const MarkerGraph::Edge& edge = markerGraph.edges[edgeId]; const span<const MarkerInterval> markerIntervals = markerGraph.edgeMarkerIntervals[edgeId]; const uint64_t n = markerIntervals.size(); - SHASTA_ASSERT(edge.coverage == n); @@ -811,7 +799,6 @@ void Assembler::splitMarkerGraphSecondaryEdges( auto& newEdge = markerGraph.edges.back(); newEdge.source = Uint40(tmpNewEdge.source); newEdge.target = Uint40(tmpNewEdge.target); - newEdge.coverage = uint8_t(tmpNewEdge.markerIntervals.size()); newEdge.isSecondary = 1; markerGraph.edgeMarkerIntervals.appendVector(tmpNewEdge.markerIntervals); } @@ -950,3 +937,546 @@ void Assembler::splitMarkerGraphSecondaryEdgesThreadFunction(size_t threadId) __sync_fetch_and_add(&data.splitCount, splitCount); __sync_fetch_and_add(&data.createdCount, createdCount); } + + + + +// Assemble Mode 3 sequence for all marker graph edges. +// See the comments before MarkerGraph::edgeSequence for more information. +// The sequence of each edge is simply obtained from the first +// marker interval of the edge. +// For now this is done sequentially. 
+void Assembler::assembleMarkerGraphEdgesMode3() +{ + const uint64_t k = assemblerInfo->k; + SHASTA_ASSERT((k % 2) == 0); + const uint64_t kHalf = k / 2; + SHASTA_ASSERT(getReads().representation == 0); + + markerGraph.edgeSequence.createNew( + largeDataName("MarkerGraphEdgesSequence"), largeDataPageSize); + + // Loop over all marker graph edges. + for(MarkerGraphEdgeId edgeId=0; edgeId<markerGraph.edges.size(); edgeId++) { + markerGraph.edgeSequence.appendVector(); + + // Get the first marker interval for this edge. + const span<const MarkerInterval> markerIntervals = markerGraph.edgeMarkerIntervals[edgeId]; + const MarkerInterval& firstMarkerInterval = markerIntervals.front(); + const OrientedReadId orientedReadId = firstMarkerInterval.orientedReadId; + const uint64_t ordinal0 = firstMarkerInterval.ordinals[0]; + const uint64_t ordinal1 = firstMarkerInterval.ordinals[1]; + + // Get the position interval on the oriented read that corresponds to this + // marker interval, including k/2 bases on each of the adjacent markers. + const span<const CompressedMarker> orientedReadMarkers = markers[orientedReadId.getValue()]; + const uint64_t positionBegin = orientedReadMarkers[ordinal0].position + kHalf; + const uint64_t positionEnd = orientedReadMarkers[ordinal1].position + kHalf; + + for(uint64_t position=positionBegin; position!=positionEnd; position++) { + const Base base = getReads().getOrientedReadBase(orientedReadId, uint32_t(position)); + markerGraph.edgeSequence.append(base); + } + } + +} + + + +// Analyze and compare the read compositions of two marker graph edges. +// This can only be done if the two edges have no duplicate OrientedReadIds +// in the markers. In that case, each OrientedReadId of an edge +// corresponds to one and only one markerInterval for each edge. 
+bool Assembler::analyzeMarkerGraphEdgePair( + MarkerGraphEdgeId edgeIdA, + MarkerGraphEdgeId edgeIdB, + MarkerGraphEdgePairInfo& info + ) const +{ + + // Check for duplicate OrientedReadIds on the two edges. + if(markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdA)) { + return false; + } + if(markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdB)) { + return false; + } + + // Prepare for the joint loop over OrientedReadIds of the two edges. + const auto markerIntervalsA = markerGraph.edgeMarkerIntervals[edgeIdA]; + const auto markerIntervalsB = markerGraph.edgeMarkerIntervals[edgeIdB]; + const auto beginA = markerIntervalsA.begin(); + const auto beginB = markerIntervalsB.begin(); + const auto endA = markerIntervalsA.end(); + const auto endB = markerIntervalsB.end(); + + // Store the total number of OrientedReadIds on the two edges. + info.totalA = endA - beginA; + info.totalB = endB - beginB; + + + + // Joint loop over the MarkerIntervals of the two edges, + // to count the common reads and compute average offsets. + info.common = 0; + int64_t sumMarkerOffsets = 0; + int64_t sumTwiceBaseOffsets = 0; + auto itA = beginA; + auto itB = beginB; + while(itA != endA and itB != endB) { + + if(itA->orientedReadId < itB->orientedReadId) { + ++itA; + continue; + } + + if(itB->orientedReadId < itA->orientedReadId) { + ++itB; + continue; + } + + // We found a common OrientedReadId. + ++info.common; + const OrientedReadId orientedReadId = itA->orientedReadId; + const auto orientedReadMarkers = markers[orientedReadId.getValue()]; + + // Compute the offset in markers. + SHASTA_ASSERT(itA->ordinals[1] == itA->ordinals[0] + 1); + SHASTA_ASSERT(itB->ordinals[1] == itB->ordinals[0] + 1); + const uint32_t ordinalA = itA->ordinals[0]; + const uint32_t ordinalB = itB->ordinals[0]; + const int64_t markerOffset = int64_t(ordinalB) - int64_t(ordinalA); + sumMarkerOffsets += markerOffset; + + // Compute the offset in bases. 
+ const int64_t positionA0 = int64_t(orientedReadMarkers[ordinalA].position); + const int64_t positionA1 = int64_t(orientedReadMarkers[ordinalA+1].position); + const int64_t positionB0 = int64_t(orientedReadMarkers[ordinalB].position); + const int64_t positionB1 = int64_t(orientedReadMarkers[ordinalB+1].position); + sumTwiceBaseOffsets -= positionA0; + sumTwiceBaseOffsets -= positionA1; + sumTwiceBaseOffsets += positionB0; + sumTwiceBaseOffsets += positionB1; + + // Continue the joint loop. + ++itA; + ++itB; + + } + info.onlyA = info.totalA - info.common; + info.onlyB = info.totalB - info.common; + + // If there are no common reads, this is all we can do. + if(info.common == 0) { + info.offsetInMarkers = invalid<int64_t>; + info.offsetInBases = invalid<int64_t>; + info.onlyAShort = invalid<uint64_t>; + info.onlyBShort = invalid<uint64_t>; + return true; + } + + // Compute the estimated offsets. + info.offsetInMarkers = int64_t(std::round(double(sumMarkerOffsets) / double(info.common))); + info.offsetInBases = int64_t(0.5 * std::round(double(sumTwiceBaseOffsets) / double(info.common))); + + // Now do the joint loop again, and count the onlyA and onlyB oriented reads + // that are too short to appear in the other edge. + itA = beginA; + itB = beginB; + uint64_t onlyACheck = 0; + uint64_t onlyBCheck = 0; + info.onlyAShort = 0; + info.onlyBShort = 0; + while(true) { + if(itA == endA and itB == endB) { + break; + } + + else if(itB == endB or ((itA!=endA) and (itA->orientedReadId < itB->orientedReadId))) { + // This oriented read only appears in edge A. + ++onlyACheck; + const OrientedReadId orientedReadId = itA->orientedReadId; + const auto orientedReadMarkers = markers[orientedReadId.getValue()]; + const int64_t lengthInBases = int64_t(getReads().getReadRawSequenceLength(orientedReadId.getReadId())); + + // Get the positions of edge A in this oriented read. 
+ const uint32_t ordinalA0 = itA->ordinals[0]; + const uint32_t ordinalA1 = itA->ordinals[1]; + const int64_t positionA0 = int64_t(orientedReadMarkers[ordinalA0].position); + const int64_t positionA1 = int64_t(orientedReadMarkers[ordinalA1].position); + + // Find the hypothetical positions of edge B, assuming the estimated base offset. + const int64_t positionB0 = positionA0 + info.offsetInBases; + const int64_t positionB1 = positionA1 + info.offsetInBases; + + // If this ends up outside the read, this counts as onlyAShort. + if(positionB0 < 0 or positionB1 >= lengthInBases) { + ++info.onlyAShort; + } + + ++itA; + continue; + } + + else if(itA == endA or ((itB!=endB) and (itB->orientedReadId < itA->orientedReadId))) { + // This oriented read only appears in edge B. + ++onlyBCheck; + const OrientedReadId orientedReadId = itB->orientedReadId; + const auto orientedReadMarkers = markers[orientedReadId.getValue()]; + const int64_t lengthInBases = int64_t(getReads().getReadRawSequenceLength(orientedReadId.getReadId())); + + // Get the positions of edge B in this oriented read. + const uint32_t ordinalB0 = itB->ordinals[0]; + const uint32_t ordinalB1 = itB->ordinals[1]; + const int64_t positionB0 = int64_t(orientedReadMarkers[ordinalB0].position); + const int64_t positionB1 = int64_t(orientedReadMarkers[ordinalB1].position); + + // Find the hypothetical positions of edge A, assuming the estimated base offset. + const int64_t positionA0 = positionB0 - info.offsetInBases; + const int64_t positionA1 = positionB1 - info.offsetInBases; + + // If this ends up outside the read, this counts as onlyBShort. + if(positionA0 < 0 or positionA1 >= lengthInBases) { + ++info.onlyBShort; + } + + ++itB; + continue; + } + + else { + // This oriented read appears in both edges. In this loop, we + // don't need to do anything. 
+ ++itA; + ++itB; + } + } + SHASTA_ASSERT(onlyACheck == info.onlyA); + SHASTA_ASSERT(onlyBCheck == info.onlyB); + + + return true; +} + + + +#if 0 +// More detailed analysis for a pair of marker graph edges, +// both of which must be primary. +void Assembler::analyzePrimaryMarkerGraphEdgePair( + MarkerGraphEdgeId edgeIdA, + MarkerGraphEdgeId edgeIdB) const +{ + cout << "analyzePrimaryMarkerGraphEdgePair begins for " << edgeIdA << " " << edgeIdB << endl; + + // Sanity checks. + SHASTA_ASSERT(markerGraph.edges[edgeIdA].isPrimary == 1); + SHASTA_ASSERT(markerGraph.edges[edgeIdB].isPrimary == 1); + + // The MarkerIntervals on these two edges. + const auto markerIntervalsA = markerGraph.edgeMarkerIntervals[edgeIdA]; + const auto markerIntervalsB = markerGraph.edgeMarkerIntervals[edgeIdB]; + + // Find the position of edgeA on the primary journey of each oriented read on edgeA. + vector<uint64_t> positionInJourneyA(markerIntervalsA.size(), invalid<uint64_t>); + for(uint64_t i=0; i<markerIntervalsA.size(); i++) { + const OrientedReadId orientedReadId = markerIntervalsA[i].orientedReadId; + const auto journey = markerGraph.primaryJourneys[orientedReadId.getValue()]; + for(uint64_t position=0; position<journey.size(); position++) { + if(journey[position].edgeId == edgeIdA) { + positionInJourneyA[i] = position; + break; + } + } + SHASTA_ASSERT(positionInJourneyA[i] != invalid<uint64_t>); + } + + // Find the position of edgeB on the primary journey of each oriented read on edgeB. 
+ vector<uint64_t> positionInJourneyB(markerIntervalsB.size(), invalid<uint64_t>); + for(uint64_t i=0; i<markerIntervalsB.size(); i++) { + const OrientedReadId orientedReadId = markerIntervalsB[i].orientedReadId; + const auto journey = markerGraph.primaryJourneys[orientedReadId.getValue()]; + for(uint64_t position=0; position<journey.size(); position++) { + if(journey[position].edgeId == edgeIdB) { + positionInJourneyB[i] = position; + break; + } + } + SHASTA_ASSERT(positionInJourneyB[i] != invalid<uint64_t>); + } + + + // The MarkerGraphEdgeIds that we encountered so far by moving forward from edgeA on + // the primary journeys of oriented reads on edgeA. + std::set<MarkerGraphEdgeId> edgeIdsForwardA; + + // The MarkerGraphEdgeIds that we encountered so far by moving backward from edgeB on + // the primary journeys of oriented reads on edgeB. + std::set<MarkerGraphEdgeId> edgeIdsBackwardB; + + // Iterate over offsets in the primary journeys. + // For journeys of the oriented reads on edgeA, we use positive offsets. + // For journeys of the oriented reads on edgeB, we use negative offsets. 
+ for(uint64_t offset=1; ; ++offset) { + + uint64_t activeCountA = 0; + for(uint64_t i=0; i<markerIntervalsA.size(); i++) { + const OrientedReadId orientedReadId = markerIntervalsA[i].orientedReadId; + const auto journey = markerGraph.primaryJourneys[orientedReadId.getValue()]; + const uint64_t position = positionInJourneyA[i] + offset; + if(position >= journey.size()) { + continue; + } + ++activeCountA; + const MarkerGraphEdgeId edgeId = journey[position].edgeId; + + if(not edgeIdsForwardA.contains(edgeId)) { + edgeIdsForwardA.insert(edgeId); + + if(edgeIdsBackwardB.contains(edgeId)) { + MarkerGraphEdgePairInfo infoA; + analyzeMarkerGraphEdgePair(edgeIdA, edgeId, infoA); + MarkerGraphEdgePairInfo infoB; + analyzeMarkerGraphEdgePair(edgeId, edgeIdB, infoB); + cout << "At offset " << offset << " found " << edgeId << + ", common " << infoA.common << " " << infoB.common << ", total offset " << + infoA.offsetInBases+ infoB.offsetInBases << endl; + } + } + } + + uint64_t activeCountB = 0; + for(uint64_t i=0; i<markerIntervalsB.size(); i++) { + const OrientedReadId orientedReadId = markerIntervalsB[i].orientedReadId; + const auto journey = markerGraph.primaryJourneys[orientedReadId.getValue()]; + if(offset > positionInJourneyB[i]) { + continue; + } + const uint64_t position = positionInJourneyB[i] - offset; + ++activeCountB; + const MarkerGraphEdgeId edgeId = journey[position].edgeId; + + if(not edgeIdsBackwardB.contains(edgeId)) { + edgeIdsBackwardB.insert(edgeId); + + if(edgeIdsForwardA.contains(edgeId)) { + MarkerGraphEdgePairInfo infoA; + analyzeMarkerGraphEdgePair(edgeIdA, edgeId, infoA); + MarkerGraphEdgePairInfo infoB; + analyzeMarkerGraphEdgePair(edgeId, edgeIdB, infoB); + cout << "At offset " << offset << " found " << edgeId << + ", common " << infoA.common << " " << infoB.common << endl; + } + } + } + + if(activeCountA == 0 or activeCountB == 0) { + break; + } + } +} +#endif + + + +// Estimate the offset, in bases, between two marker graph edges. 
+// This assumes, WITHOUT CHECKING, that each of the two edges has no duplicate +// oriented reads. This assumption is satisfied for primary marker graph edges +// in Mode 3 assembly. +// If there are common oriented reads between the two edges, this uses +// analyzeMarkerGraphEdgePair. +// This can fail, in which case it returns invalid<uint64_t>. +uint64_t Assembler::estimateBaseOffsetUnsafe( + MarkerGraphEdgeId edgeIdA, + MarkerGraphEdgeId edgeIdB) const +{ + // If there are common oriented reads between the two edges, use + // analyzeMarkerGraphEdgePair. This is the most common case. + if(countCommonOrientedReadsUnsafe(edgeIdA, edgeIdB) > 0) { + MarkerGraphEdgePairInfo info; + SHASTA_ASSERT(analyzeMarkerGraphEdgePair(edgeIdA, edgeIdB, info)); + if(info.offsetInBases >= 0) { + return info.offsetInBases; + } else { + return invalid<uint64_t>; + } + } else { + return invalid<uint64_t>; + } + +#if 0 + // There are no common oriented reads between the two edges. + // Find a primary marker graph edge in-between that has common + // oriented reads with both edgeIdA and edgeIdB. + + // Sanity checks. + SHASTA_ASSERT(markerGraph.edges[edgeIdA].isPrimary == 1); + SHASTA_ASSERT(markerGraph.edges[edgeIdB].isPrimary == 1); + + // The MarkerIntervals on these two edges. + const auto markerIntervalsA = markerGraph.edgeMarkerIntervals[edgeIdA]; + const auto markerIntervalsB = markerGraph.edgeMarkerIntervals[edgeIdB]; + + // Find the position of edgeA on the primary journey of each oriented read on edgeA. 
+ vector<uint64_t> positionInJourneyA(markerIntervalsA.size(), invalid<uint64_t>); + for(uint64_t i=0; i<markerIntervalsA.size(); i++) { + const OrientedReadId orientedReadId = markerIntervalsA[i].orientedReadId; + const auto journey = markerGraph.primaryJourneys[orientedReadId.getValue()]; + for(uint64_t position=0; position<journey.size(); position++) { + if(journey[position].edgeId == edgeIdA) { + positionInJourneyA[i] = position; + break; + } + } + SHASTA_ASSERT(positionInJourneyA[i] != invalid<uint64_t>); + } + + // Find the position of edgeB on the primary journey of each oriented read on edgeB. + vector<uint64_t> positionInJourneyB(markerIntervalsB.size(), invalid<uint64_t>); + for(uint64_t i=0; i<markerIntervalsB.size(); i++) { + const OrientedReadId orientedReadId = markerIntervalsB[i].orientedReadId; + const auto journey = markerGraph.primaryJourneys[orientedReadId.getValue()]; + for(uint64_t position=0; position<journey.size(); position++) { + if(journey[position].edgeId == edgeIdB) { + positionInJourneyB[i] = position; + break; + } + } + SHASTA_ASSERT(positionInJourneyB[i] != invalid<uint64_t>); + } + + + // The MarkerGraphEdgeIds that we encountered so far by moving forward from edgeA on + // the primary journeys of oriented reads on edgeA. + std::set<MarkerGraphEdgeId> edgeIdsForwardA; + + // The MarkerGraphEdgeIds that we encountered so far by moving backward from edgeB on + // the primary journeys of oriented reads on edgeB. + std::set<MarkerGraphEdgeId> edgeIdsBackwardB; + + // The best edgeId we found, and the lowest of its common oriented reads + // with edgeIdA and edgeIdB. + uint64_t edgeIdBest = invalid<uint64_t>; + uint64_t commonBest = 0; + + // Iterate over offsets in the primary journeys. + // For journeys of the oriented reads on edgeA, we use positive offsets. + // For journeys of the oriented reads on edgeB, we use negative offsets. 
+ for(uint64_t offset=1; ; ++offset) { + + uint64_t activeCountA = 0; + for(uint64_t i=0; i<markerIntervalsA.size(); i++) { + const OrientedReadId orientedReadId = markerIntervalsA[i].orientedReadId; + const auto journey = markerGraph.primaryJourneys[orientedReadId.getValue()]; + const uint64_t position = positionInJourneyA[i] + offset; + if(position >= journey.size()) { + continue; + } + ++activeCountA; + const MarkerGraphEdgeId edgeId = journey[position].edgeId; + + if(not edgeIdsForwardA.contains(edgeId)) { + edgeIdsForwardA.insert(edgeId); + + if(edgeIdsBackwardB.contains(edgeId)) { + const uint64_t commonCountA = countCommonOrientedReadsUnsafe(edgeIdA, edgeId); + const uint64_t commonCountB = countCommonOrientedReadsUnsafe(edgeId, edgeIdB); + const uint64_t commonCountMin = min(commonCountA, commonCountB); + if(commonCountMin > commonBest) { + edgeIdBest = edgeId; + commonBest = commonCountMin; + } + } + } + } + + uint64_t activeCountB = 0; + for(uint64_t i=0; i<markerIntervalsB.size(); i++) { + const OrientedReadId orientedReadId = markerIntervalsB[i].orientedReadId; + const auto journey = markerGraph.primaryJourneys[orientedReadId.getValue()]; + if(offset > positionInJourneyB[i]) { + continue; + } + const uint64_t position = positionInJourneyB[i] - offset; + ++activeCountB; + const MarkerGraphEdgeId edgeId = journey[position].edgeId; + + if(not edgeIdsBackwardB.contains(edgeId)) { + edgeIdsBackwardB.insert(edgeId); + + if(edgeIdsForwardA.contains(edgeId)) { + const uint64_t commonCountA = countCommonOrientedReadsUnsafe(edgeIdA, edgeId); + const uint64_t commonCountB = countCommonOrientedReadsUnsafe(edgeId, edgeIdB); + const uint64_t commonCountMin = min(commonCountA, commonCountB); + if(commonCountMin > commonBest) { + edgeIdBest = edgeId; + commonBest = commonCountMin; + } + } + } + } + + if(activeCountA == 0 or activeCountB == 0) { + break; + } + } + + if(commonBest == 0) { + return invalid<uint64_t>; + } + + // edgeIdBest has common oriented reads with 
both edgeIdA and edgeIdB. + MarkerGraphEdgePairInfo infoA; + MarkerGraphEdgePairInfo infoB; + SHASTA_ASSERT(analyzeMarkerGraphEdgePair(edgeIdA, edgeIdBest, infoA)); + SHASTA_ASSERT(analyzeMarkerGraphEdgePair(edgeIdBest, edgeIdB, infoB)); + SHASTA_ASSERT(infoA.common > 0); + SHASTA_ASSERT(infoB.common > 0); + return infoA.offsetInBases + infoB.offsetInBases; +#endif +} + + + +// Count the number of common oriented reads between two marker graph edges. +// This assumes, WITHOUT CHECKING, that each of the two edges has no duplicate +// oriented reads. This assumption is satisfied for primary marker graph edges +// in Mode 3 assembly. +uint64_t Assembler::countCommonOrientedReadsUnsafe( + MarkerGraphEdgeId edgeIdA, + MarkerGraphEdgeId edgeIdB) const +{ + // Prepare for the joint loop over OrientedReadIds of the two edges. + const auto markerIntervalsA = markerGraph.edgeMarkerIntervals[edgeIdA]; + const auto markerIntervalsB = markerGraph.edgeMarkerIntervals[edgeIdB]; + const auto beginA = markerIntervalsA.begin(); + const auto beginB = markerIntervalsB.begin(); + const auto endA = markerIntervalsA.end(); + const auto endB = markerIntervalsB.end(); + + + // Joint loop over the MarkerIntervals of the two edges, + // to count the common reads and compute average offsets. + // This assumes that there are no duplicate oriented reads + // on the two edges. + uint64_t n = 0; + auto itA = beginA; + auto itB = beginB; + while(itA != endA and itB != endB) { + + if(itA->orientedReadId < itB->orientedReadId) { + ++itA; + } else if(itB->orientedReadId < itA->orientedReadId) { + ++itB; + continue; + } else { + // We found a common OrientedReadId. + ++n; + ++itA; + ++itB; + } + + } + return n; +} + diff --git a/src/AssemblerMarkers.cpp b/src/AssemblerMarkers.cpp index 4b7eb9f..7c34b6f 100644 --- a/src/AssemblerMarkers.cpp +++ b/src/AssemblerMarkers.cpp @@ -1,7 +1,10 @@ // Shasta. 
#include "Assembler.hpp" +#include "extractKmer.hpp" #include "findMarkerId.hpp" #include "MarkerFinder.hpp" +#include "performanceLog.hpp" +#include "timestamp.hpp" using namespace shasta; // Standard library. @@ -11,12 +14,12 @@ using namespace shasta; void Assembler::findMarkers(size_t threadCount) { reads->checkReadsAreOpen(); - checkKmersAreOpen(); + SHASTA_ASSERT(kmerChecker); markers.createNew(largeDataName("Markers"), largeDataPageSize); MarkerFinder markerFinder( assemblerInfo->k, - kmerTable, + *kmerChecker, getReads(), markers, threadCount); @@ -41,7 +44,7 @@ void Assembler::checkMarkersAreOpen() const void Assembler::writeMarkers(ReadId readId, Strand strand, const string& fileName) { // Check that we have what we need. - checkKmersAreOpen(); + SHASTA_ASSERT(kmerChecker); reads->checkReadsAreOpen(); checkMarkersAreOpen(); reads->checkReadId(readId); @@ -56,44 +59,25 @@ void Assembler::writeMarkers(ReadId readId, Strand strand, const string& fileNam for(uint32_t ordinal=0; ordinal<orientedReadMarkers.size(); ordinal++) { const CompressedMarker& marker = orientedReadMarkers[ordinal]; const MarkerId markerId = getMarkerId(orientedReadId, ordinal); + const KmerId kmerId = getOrientedReadMarkerKmerId(orientedReadId, ordinal); + const Kmer kmer(kmerId, assemblerInfo->k); csv << markerId << ","; csv << ordinal << ","; - csv << marker.kmerId << ","; - csv << Kmer(marker.kmerId, assemblerInfo->k) << ","; + csv << kmerId << ","; + csv << kmer << ","; csv << marker.position << "\n"; } } -vector<KmerId> Assembler::getMarkers(ReadId readId, Strand strand) -{ - const OrientedReadId orientedReadId(readId, strand); - const auto orientedReadMarkers = markers[orientedReadId.getValue()]; - - vector<KmerId> v; - for(const CompressedMarker& marker: orientedReadMarkers) { - v.push_back(marker.kmerId); - } - return v; -} - - // Get markers sorted by KmerId for a given OrientedReadId. 
void Assembler::getMarkersSortedByKmerId( OrientedReadId orientedReadId, vector<MarkerWithOrdinal>& markersSortedByKmerId) const { - const auto compressedMarkers = markers[orientedReadId.getValue()]; - markersSortedByKmerId.clear(); - markersSortedByKmerId.resize(compressedMarkers.size()); - - for(uint32_t ordinal=0; ordinal<compressedMarkers.size(); ordinal++) { - const CompressedMarker& compressedMarker = compressedMarkers[ordinal]; - markersSortedByKmerId[ordinal] = MarkerWithOrdinal(compressedMarker, ordinal); - } - - // Sort by kmerId. + markersSortedByKmerId.resize(markers.size(orientedReadId.getValue())); + getOrientedReadMarkers(orientedReadId, markersSortedByKmerId); sort(markersSortedByKmerId.begin(), markersSortedByKmerId.end()); } @@ -154,28 +138,482 @@ MarkerId Assembler::findReverseComplement(MarkerId markerId) const -// Write the frequency of markers in oriented reads. -void Assembler::writeMarkerFrequency() +void Assembler::computeMarkerKmerIds(uint64_t threadCount) { - const uint64_t k = assemblerInfo->k; - const uint64_t kmerCount = 1ULL << (2ULL*k); - SHASTA_ASSERT(markers.isOpen()); - vector<uint64_t> frequency(kmerCount, 0); + performanceLog << timestamp << "Gathering marker KmerIds." << endl; - const CompressedMarker* compressedMarker = markers.begin(); - const CompressedMarker* end = markers.end(); - for(; compressedMarker!=end; ++compressedMarker) { - ++frequency[compressedMarker->kmerId]; + // Check that we have what we need. + checkMarkersAreOpen(); + const uint64_t readCount = reads->readCount(); + + // Adjust the numbers of threads, if necessary. + if(threadCount == 0) { + threadCount = std::thread::hardware_concurrency(); + } + + // Do it. + // The layout is identical to that used by the markers. 
+ markerKmerIds.createNew(largeDataName("MarkerKmerIds"), largeDataPageSize); + for(uint64_t readId=0; readId<readCount; readId++) { + const OrientedReadId orientedReadId0(uint32_t(readId), 0); + const OrientedReadId orientedReadId1(uint32_t(readId), 1); + const uint64_t readMarkerCount = markers.size(orientedReadId0.getValue()); + SHASTA_ASSERT(markers.size(orientedReadId1.getValue()) == readMarkerCount); + for(uint64_t strand=0; strand<2; strand++) { + markerKmerIds.appendVector(readMarkerCount); + } + } + markerKmerIds.unreserve(); + const uint64_t batchSize = 100; + setupLoadBalancing(readCount, batchSize); + runThreads(&Assembler::computeMarkerKmerIdsThreadFunction, threadCount); + + + +#if 0 + // Test the low level functions to extract Kmers/KmerIds. + const uint64_t k = assemblerInfo->k; + vector<Kmer> kmerVector; + vector<KmerId> kmerIdVector; + performanceLog << timestamp << "Testing." << endl; + for(uint64_t readId=0; readId<readCount; readId++) { + for(uint64_t strand=0; strand<2; strand++) { + + const OrientedReadId orientedReadId = OrientedReadId(ReadId(readId), Strand(strand)); + const auto orientedReadMarkers = markers[orientedReadId.getValue()]; + const auto orientedReadMarkerKmerIds = markerKmerIds[orientedReadId.getValue()]; + const uint64_t orientedReadMarkerCount = orientedReadMarkers.size(); + SHASTA_ASSERT(orientedReadMarkerKmerIds.size() == orientedReadMarkerCount); + + kmerVector.resize(orientedReadMarkerCount); + kmerIdVector.resize(orientedReadMarkerCount); + const span<Kmer> kmerSpan(kmerVector); + const span<KmerId> kmerIdSpan(kmerIdVector); + + getOrientedReadMarkerKmers(orientedReadId, kmerSpan); + getOrientedReadMarkerKmerIds(orientedReadId, kmerIdSpan); + + for(uint64_t ordinal=0; ordinal<orientedReadMarkerCount; ordinal++) { + SHASTA_ASSERT(kmerVector[ordinal].id(k) == orientedReadMarkers[ordinal].kmerId); + SHASTA_ASSERT(kmerIdVector[ordinal] == orientedReadMarkers[ordinal].kmerId); + + SHASTA_ASSERT(kmerVector[ordinal] == 
getOrientedReadMarkerKmer(orientedReadId, ordinal)); + SHASTA_ASSERT(kmerIdVector[ordinal] == getOrientedReadMarkerKmerId(orientedReadId, ordinal)); + } + } } +#endif - ofstream csv("MarkerFrequency.csv"); - for(uint64_t kmerId=0; kmerId<kmerCount; kmerId++) { - const uint64_t n = frequency[kmerId]; - if(n== 0) { - continue; +} + + + +void Assembler::cleanupMarkerKmerIds() +{ + markerKmerIds.remove(); +} + + + +void Assembler::computeMarkerKmerIdsThreadFunction(size_t threadId) +{ + + // Loop over all batches assigned to this thread. + uint64_t begin, end; + while(getNextBatch(begin, end)) { + + // Loop over reads in this batch. + for(uint64_t readId=begin; readId!=end; ++readId) { + + const OrientedReadId orientedReadId0(uint32_t(readId), 0); + const OrientedReadId orientedReadId1(uint32_t(readId), 1); + + getReadMarkerKmerIds( + ReadId(readId), + markerKmerIds[orientedReadId0.getValue()], + markerKmerIds[orientedReadId1.getValue()]); } - const Kmer kmer(kmerId, k); - kmer.write(csv, k); - csv << "," << n << "\n"; } + +} + + + +Kmer Assembler::getOrientedReadMarkerKmer(OrientedReadId orientedReadId, uint32_t ordinal) const +{ + const ReadId readId = orientedReadId.getReadId(); + const Strand strand = orientedReadId.getStrand(); + + if(strand == 0) { + return getOrientedReadMarkerKmerStrand0(readId, ordinal); + } else { + return getOrientedReadMarkerKmerStrand1(readId, ordinal); + } + +} + + + +Kmer Assembler::getOrientedReadMarkerKmerStrand0(ReadId readId, uint32_t ordinal0) const +{ + const uint64_t k = assemblerInfo->k; + const auto read = reads->getRead(uint32_t(readId)); + const OrientedReadId orientedReadId0(readId, 0); + const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()]; + + Kmer kmer0; + extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0); + + return kmer0; +} + + + +Kmer Assembler::getOrientedReadMarkerKmerStrand1(ReadId readId, uint32_t ordinal1) const +{ + const uint64_t k = assemblerInfo->k; + + // We only 
have the read stored without reverse complement, so get it from there... + const auto read = reads->getRead(uint32_t(readId)); + const OrientedReadId orientedReadId0(readId, 0); + const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()]; + const uint64_t readMarkerCount = orientedReadMarkers0.size(); + const uint64_t ordinal0 = readMarkerCount - 1 - ordinal1; + Kmer kmer0; + extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0); + + // ... then do the reverse complement. + const Kmer kmer1 = kmer0.reverseComplement(k); + return kmer1; +} + + + +KmerId Assembler::getOrientedReadMarkerKmerId(OrientedReadId orientedReadId, uint32_t ordinal) const +{ + const Kmer kmer = getOrientedReadMarkerKmer(orientedReadId, ordinal); + return KmerId(kmer.id(assemblerInfo->k)); +} + + + +// Get all marker Kmers for an oriented read. +void Assembler::getOrientedReadMarkerKmers( + OrientedReadId orientedReadId, + const span<Kmer>& kmers) const +{ + const ReadId readId = orientedReadId.getReadId(); + const Strand strand = orientedReadId.getStrand(); + + if(strand == 0) { + getOrientedReadMarkerKmersStrand0(readId, kmers); + } else { + getOrientedReadMarkerKmersStrand1(readId, kmers); + } +} + + + +void Assembler::getOrientedReadMarkerKmersStrand0(ReadId readId, const span<Kmer>& kmers0) const +{ + const uint64_t k = assemblerInfo->k; + + const auto read = reads->getRead(uint32_t(readId)); + const OrientedReadId orientedReadId0(readId, 0); + const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()]; + const uint64_t readMarkerCount = orientedReadMarkers0.size(); + SHASTA_ASSERT(kmers0.size() == readMarkerCount); + + // Loop over all markers. 
+ for(uint64_t ordinal0=0; ordinal0<readMarkerCount; ordinal0++) { + Kmer kmer0; + extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0); + kmers0[ordinal0] = kmer0; + } + +} + + + +void Assembler::getOrientedReadMarkerKmersStrand1(ReadId readId, const span<Kmer>& kmers1) const +{ + const uint64_t k = assemblerInfo->k; + + const auto read = reads->getRead(uint32_t(readId)); + const OrientedReadId orientedReadId0(readId, 0); + const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()]; + const uint64_t readMarkerCount = orientedReadMarkers0.size(); + SHASTA_ASSERT(kmers1.size() == readMarkerCount); + + // Loop over all markers. + for(uint64_t ordinal0=0; ordinal0<readMarkerCount; ordinal0++) { + Kmer kmer0; + extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0); + const Kmer kmer1 = kmer0.reverseComplement(k); + const uint64_t ordinal1 = readMarkerCount - 1 - ordinal0; + kmers1[ordinal1] = kmer1; + } + } + + + +// Get all marker KmerIds for an oriented read. +void Assembler::getOrientedReadMarkerKmerIds( + OrientedReadId orientedReadId, + const span<KmerId>& kmerIds) const +{ + const ReadId readId = orientedReadId.getReadId(); + const Strand strand = orientedReadId.getStrand(); + + if(strand == 0) { + getOrientedReadMarkerKmerIdsStrand0(readId, kmerIds); + } else { + getOrientedReadMarkerKmerIdsStrand1(readId, kmerIds); + } +} + + + +void Assembler::getOrientedReadMarkerKmerIdsStrand0(ReadId readId, const span<KmerId>& kmerIds0) const +{ + const uint64_t k = assemblerInfo->k; + + const auto read = reads->getRead(uint32_t(readId)); + const OrientedReadId orientedReadId0(readId, 0); + const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()]; + const uint64_t readMarkerCount = orientedReadMarkers0.size(); + SHASTA_ASSERT(kmerIds0.size() == readMarkerCount); + + // Loop over all markers. 
+ for(uint64_t ordinal0=0; ordinal0<readMarkerCount; ordinal0++) { + Kmer kmer0; + extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0); + kmerIds0[ordinal0] = KmerId(kmer0.id(k)); + } + +} + + + +void Assembler::getOrientedReadMarkerKmerIdsStrand1(ReadId readId, const span<KmerId>& kmerIds1) const +{ + const uint64_t k = assemblerInfo->k; + + const auto read = reads->getRead(uint32_t(readId)); + const OrientedReadId orientedReadId0(readId, 0); + const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()]; + const uint64_t readMarkerCount = orientedReadMarkers0.size(); + SHASTA_ASSERT(kmerIds1.size() == readMarkerCount); + + // Loop over all markers. + for(uint64_t ordinal0=0; ordinal0<readMarkerCount; ordinal0++) { + Kmer kmer0; + extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0); + const Kmer kmer1 = kmer0.reverseComplement(k); + const uint64_t ordinal1 = readMarkerCount - 1 - ordinal0; + kmerIds1[ordinal1] = KmerId(kmer1.id(k)); + } + +} + + + +void Assembler::getOrientedReadMarkers( + OrientedReadId orientedReadId, + const span<MarkerWithOrdinal>& markers) const +{ + const ReadId readId = orientedReadId.getReadId(); + const Strand strand = orientedReadId.getStrand(); + + if(strand == 0) { + getOrientedReadMarkersStrand0(readId, markers); + } else { + getOrientedReadMarkersStrand1(readId, markers); + } + +} + + + +void Assembler::getOrientedReadMarkersStrand0( + ReadId readId, + const span<MarkerWithOrdinal>& markers0) const +{ + const uint64_t k = assemblerInfo->k; + + const auto read = reads->getRead(uint32_t(readId)); + const OrientedReadId orientedReadId0(readId, 0); + const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()]; + const uint64_t readMarkerCount = orientedReadMarkers0.size(); + SHASTA_ASSERT(markers0.size() == readMarkerCount); + + // Loop over all markers. 
+ for(uint64_t ordinal0=0; ordinal0<readMarkerCount; ordinal0++) { + const CompressedMarker& compressedMarker0 = orientedReadMarkers0[ordinal0]; + const uint32_t position = compressedMarker0.position; + Kmer kmer0; + extractKmer(read, uint64_t(position), k, kmer0); + markers0[ordinal0] = MarkerWithOrdinal(KmerId(kmer0.id(k)), position, uint32_t(ordinal0)); + } + +} + + + +void Assembler::getOrientedReadMarkersStrand1( + ReadId readId, + const span<MarkerWithOrdinal>& markers1) const +{ + const uint64_t k = assemblerInfo->k; + + const auto read = reads->getRead(uint32_t(readId)); + const OrientedReadId orientedReadId0(readId, 0); + const OrientedReadId orientedReadId1(readId, 1); + const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()]; + const auto orientedReadMarkers1 = markers[orientedReadId1.getValue()]; + const uint64_t readMarkerCount = orientedReadMarkers0.size(); + SHASTA_ASSERT(markers1.size() == readMarkerCount); + + // Loop over all markers. + for(uint64_t ordinal0=0; ordinal0<readMarkerCount; ordinal0++) { + const uint64_t ordinal1 = readMarkerCount - 1 - ordinal0; + const CompressedMarker& compressedMarker1 = orientedReadMarkers1[ordinal1]; + const uint32_t position1 = compressedMarker1.position; + Kmer kmer0; + extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0); + const Kmer kmer1 = kmer0.reverseComplement(k); + markers1[ordinal1] = MarkerWithOrdinal(KmerId(kmer1.id(k)), position1, uint32_t(ordinal1)); + } + +} + + + +// Get all marker Kmers for a read in both orientations. +void Assembler::getReadMarkerKmers( + ReadId readId, + const span<Kmer>& kmers0, + const span<Kmer>& kmers1) const +{ + const uint64_t k = assemblerInfo->k; + + // Access the information we need for this read. 
+ const auto read = reads->getRead(uint32_t(readId)); + const OrientedReadId orientedReadId0(uint32_t(readId), 0); + const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()]; + const uint64_t readMarkerCount = orientedReadMarkers0.size(); + SHASTA_ASSERT(kmers0.size() == readMarkerCount); + SHASTA_ASSERT(kmers1.size() == readMarkerCount); + + // Loop over all markers. + for(uint64_t ordinal0=0; ordinal0<readMarkerCount; ordinal0++) { + + // Strand 0. + Kmer kmer0; + extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0); + kmers0[ordinal0] = kmer0; + + // Strand 1. + const Kmer kmer1 = kmer0.reverseComplement(k); + const uint64_t ordinal1 = readMarkerCount - 1 - ordinal0; + kmers1[ordinal1] = kmer1; + } + +} + + + +// Get all marker KmerIds for a read in both orientations. +void Assembler::getReadMarkerKmerIds( + ReadId readId, + const span<KmerId>& kmerIds0, + const span<KmerId>& kmerIds1) const +{ + // Get the marker length. + const uint64_t k = assemblerInfo->k; + + // Access the information we need for this read. + const auto read = reads->getRead(uint32_t(readId)); + const OrientedReadId orientedReadId0(uint32_t(readId), 0); + const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()]; + const uint64_t readMarkerCount = orientedReadMarkers0.size(); + SHASTA_ASSERT(kmerIds0.size() == readMarkerCount); + SHASTA_ASSERT(kmerIds1.size() == readMarkerCount); + + // Loop over all markers. + for(uint64_t ordinal0=0; ordinal0<readMarkerCount; ordinal0++) { + + // Strand 0. + Kmer kmer0; + extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0); + kmerIds0[ordinal0] = KmerId(kmer0.id(k)); + + // Strand 1. + const Kmer kmer1 = kmer0.reverseComplement(k); + const uint64_t ordinal1 = readMarkerCount - 1 - ordinal0; + kmerIds1[ordinal1] = KmerId(kmer1.id(k)); + } + +} + + + +// Get the Kmer for an oriented read at a given marker ordinal. 
+Kmer Assembler::getOrientedReadMarkerKmer(OrientedReadId orientedReadId, uint64_t ordinal) const +{ + const uint64_t k = assemblerInfo->k; + + const ReadId readId = orientedReadId.getReadId(); + const Strand strand = orientedReadId.getStrand(); + const auto read = reads->getRead(readId); + const OrientedReadId orientedReadId0(uint32_t(readId), 0); + const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()]; + + if(strand == 0) { + + const uint64_t ordinal0 = ordinal; + Kmer kmer0; + extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0); + return kmer0; + + } else { + + const uint64_t ordinal0 = orientedReadMarkers0.size() - 1 - ordinal; + Kmer kmer0; + extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0); + return kmer0.reverseComplement(k); + + } +} + + + +// Get the KmerId for an oriented read at a given marker ordinal. +KmerId Assembler::getOrientedReadMarkerKmerId(OrientedReadId orientedReadId, uint64_t ordinal) const +{ + const uint64_t k = assemblerInfo->k; + + const ReadId readId = orientedReadId.getReadId(); + const Strand strand = orientedReadId.getStrand(); + const auto read = reads->getRead(readId); + const OrientedReadId orientedReadId0(uint32_t(readId), 0); + const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()]; + + if(strand == 0) { + + const uint64_t ordinal0 = ordinal; + Kmer kmer0; + extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0); + return KmerId(kmer0.id(k)); + + } else { + + const uint64_t ordinal0 = orientedReadMarkers0.size() - 1 - ordinal; + Kmer kmer0; + extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0); + return KmerId(kmer0.reverseComplement(k).id(k)); + + } +} + diff --git a/src/AssemblerMode3.cpp b/src/AssemblerMode3.cpp index 4c4199f..0990b8b 100644 --- a/src/AssemblerMode3.cpp +++ b/src/AssemblerMode3.cpp @@ -1,82 +1,236 @@ +// Shasta #include "Assembler.hpp" -#include "mode3.hpp" -#include 
"mode3-Detangler.hpp" -#include "mode3-PathGraph.hpp" +#include "LocalMarkerGraph1.hpp" +#include "mode3-LocalAssembly.hpp" +#include "mode3-AssemblyGraph.hpp" +#include "mode3-PrimaryGraph.hpp" +#include "Mode3Assembler.hpp" #include "Reads.hpp" using namespace shasta; -using namespace mode3; +// Boost libraries. +#include <boost/graph/iteration_macros.hpp> +// Standard library. +#include "fstream.hpp" +#include <map> -void Assembler::mode3Assembly( - size_t threadCount) -{ - // EXPOSE WHEN CODE STABILIZES. - // const uint64_t minClusterSize = 3; - // Adjust the numbers of threads, if necessary. - if(threadCount == 0) { - threadCount = std::thread::hardware_concurrency(); - } - assemblyGraph3Pointer = std::make_shared<mode3::AssemblyGraph>( - largeDataFileNamePrefix, - largeDataPageSize, - threadCount, - assemblerInfo->readRepresentation, - assemblerInfo->k, - *reads, +void Assembler::flagPrimaryMarkerGraphEdges( + uint64_t minPrimaryCoverage, + uint64_t maxPrimaryCoverage, + uint64_t threadCount) +{ + // Check that we have what we need. 
+ SHASTA_ASSERT(markers.isOpen()); + checkMarkerGraphVerticesAreAvailable(); + SHASTA_ASSERT(markerGraph.edges.isOpenWithWriteAccess); + SHASTA_ASSERT(markerGraph.disjointSetsHistogram.isOpen); + + markerGraph.flagPrimaryEdges( + minPrimaryCoverage, + maxPrimaryCoverage, markers, - markerGraph, - *consensusCaller); - auto& assemblyGraph3 = *assemblyGraph3Pointer; - assemblyGraph3.writeGfa("AssemblyGraph"); - // assemblyGraph3.clusterSegments(threadCount, minClusterSize); - assemblyGraph3.createJaccardGraph(threadCount); - // assemblyGraph3.assembleJaccardGraphPaths(); - assemblyGraph3.createDeBruijnGraph(); - + threadCount); } -void Assembler::accessMode3AssemblyGraph() +void Assembler::mode3Assembly( + uint64_t threadCount, + const Mode3AssemblyOptions& options, + bool debug + ) { - assemblyGraph3Pointer = std::make_shared<mode3::AssemblyGraph>( - largeDataFileNamePrefix, - assemblerInfo->readRepresentation, - assemblerInfo->k, - *reads, markers, markerGraph, *consensusCaller); + mode3Assembler = make_shared<Mode3Assembler>(*this, threadCount, options, debug); } -void Assembler::analyzeMode3Subgraph(const vector<uint64_t>& segmentIds) + +void Assembler::mode3AssembleComponent( + const string& fileName, + uint64_t threadCount, + const Mode3AssemblyOptions& options, + bool assembleSequence, + bool debug) const { - SHASTA_ASSERT(assemblyGraph3Pointer); - vector<mode3::AssemblyGraph::AnalyzeSubgraphClasses::Cluster> clusters; - assemblyGraph3Pointer->analyzeSubgraph(segmentIds, clusters, true); + mode3::AssemblyGraph(fileName, *this, threadCount, options, assembleSequence, debug); } -void Assembler::createMode3PathGraph() +// Assemble sequence between two primary edges. +void Assembler::fillMode3AssemblyPathStep(const vector<string>& request, ostream& html) { - SHASTA_ASSERT(assemblyGraph3Pointer); - const mode3::AssemblyGraph& assemblyGraph = *assemblyGraph3Pointer; + // Check that our assumptions are satisfied. 
+ if(assemblerInfo->assemblyMode != 3) { + throw runtime_error("This is only available for assembly mode 3."); + } + SHASTA_ASSERT(getReads().representation == 0); // No RLE. + SHASTA_ASSERT((assemblerInfo->k % 2) == 0); // Marker length is even. - mode3::PathGraph pathGraph(assemblyGraph); + mode3::LocalAssemblyDisplayOptions options(html); -} + // Get the parameters for the request. + uint64_t edgeIdA = invalid<uint64_t>; + getParameterValue(request, "edgeIdA", edgeIdA); + uint64_t edgeIdB = invalid<uint64_t>; + getParameterValue(request, "edgeIdB", edgeIdB); + string useAString; + const bool useA = getParameterValue(request, "useA", useAString); -void Assembler::createMode3Detangler() -{ - SHASTA_ASSERT(assemblyGraph3Pointer); - const mode3::AssemblyGraph& assemblyGraph = *assemblyGraph3Pointer; + string useBString; + const bool useB = getParameterValue(request, "useB", useBString); - mode3::Detangler detangler(assemblyGraph); + uint64_t minVertexCoverage = 0; + getParameterValue(request, "minVertexCoverage", minVertexCoverage); -} + string showOrientedReadsString; + options.showOrientedReads = getParameterValue(request, "showOrientedReads", showOrientedReadsString); + + string showMarkersString; + options.showMarkers = getParameterValue(request, "showMarkers", showMarkersString); + + string showGraphString; + options.showGraph = getParameterValue(request, "showGraph", showGraphString); + + string showVerticesString; + options.showVertices = getParameterValue(request, "showVertices", showVerticesString); + + string showVertexLabelsString; + options.showVertexLabels = getParameterValue(request, "showVertexLabels", showVertexLabelsString); + + string showEdgeLabelsString; + options.showEdgeLabels = getParameterValue(request, "showEdgeLabels", showEdgeLabelsString); + + string showAssemblyDetailsString; + options.showAssemblyDetails = getParameterValue(request, "showAssemblyDetails", showAssemblyDetailsString); + + string showDebugInformationString; + 
options.showDebugInformation = getParameterValue(request, "showDebugInformation", showDebugInformationString); + + + + // Write the form. + html << + "<form>" + "<table>" + + "<tr><th class=left>Edge A<td class=centered>" + "<input type=text required name=edgeIdA size=8 style='text-align:center' " << + ((edgeIdA == invalid<uint64_t>) ? "" : ("value='" + to_string(edgeIdA) + "'")) << ">" + + "<tr><th class=left>Edge B<td class=centered>" + "<input type=text required name=edgeIdB size=8 style='text-align:center' " << + ((edgeIdB == invalid<uint64_t>) ? "" : ("value='" + to_string(edgeIdB) + "'")) << ">" + + "<tr>" + "<th class=left>Use for assembly oriented reads that appear only on edge A" + "<td class=centered><input type=checkbox name=useA" << + (useA ? " checked" : "") << ">" + + "<tr>" + "<th class=left>Use for assembly oriented reads that appear only on edge B" + "<td class=centered><input type=checkbox name=useB" << + (useB ? " checked" : "") << ">" + "<tr><th class=left>Minimum vertex coverage<br>(0 = automatic)<td class=centered>" + "<input type=text required name=minVertexCoverage size=8 style='text-align:center' " + "value='" << minVertexCoverage << "'>" + + "<tr>" + "<th class=left>Display the oriented reads" + "<td class=centered><input type=checkbox name=showOrientedReads" << + (options.showOrientedReads ? " checked" : "") << ">" + + "<tr>" + "<th class=left>Display the markers" + "<td class=centered><input type=checkbox name=showMarkers" << + (options.showMarkers ? " checked" : "") << ">" + + "<tr>" + "<th class=left>Display the graph" + "<td class=centered><input type=checkbox name=showGraph" << + (options.showGraph ? " checked" : "") << ">" + + "<tr>" + "<th class=left>Display the vertices" + "<td class=centered><input type=checkbox name=showVertices" << + (options.showVertices ? 
" checked" : "") << ">" + + "<tr>" + "<th class=left>Display vertex labels" + "<td class=centered><input type=checkbox name=showVertexLabels" << + (options.showVertexLabels ? " checked" : "") << ">" + + "<tr>" + "<th class=left>Display edge labels" + "<td class=centered><input type=checkbox name=showEdgeLabels" << + (options.showEdgeLabels ? " checked" : "") << ">" + + "<tr>" + "<th class=left>Display assembly details" + "<td class=centered><input type=checkbox name=showAssemblyDetails" << + (options.showAssemblyDetails ? " checked" : "") << ">" + + "<tr>" + "<th class=left>Display debug information" + "<td class=centered><input type=checkbox name=showDebugInformation" << + (options.showDebugInformation ? " checked" : "") << ">" + + "</table>" + "<br><input type=submit value='Do it'>" + "</form>"; + + + + // If the edge ids are missing, do nothing. + if(edgeIdA == invalid<uint64_t> or edgeIdB == invalid<uint64_t>) { + return; + } + + // Sanity checks on the edge ids. + if(edgeIdA >= markerGraph.edges.size()) { + throw runtime_error("Marker graph edge " + to_string(edgeIdA) + + " is not valid. Maximum valid edge id is " + to_string(markerGraph.edges.size())); + } + if(edgeIdB >= markerGraph.edges.size()) { + throw runtime_error("Marker graph edge " + to_string(edgeIdB) + + " is not valid. Maximum valid edge id is " + to_string(markerGraph.edges.size())); + } + + // Sanity check that the two edges are distinct. + if(edgeIdA == edgeIdB) { + html << "<p>Specify two distinct edges."; + return; + } + + // This analysis can only be done if both edges have no duplicate OrientedReadIds + // in their MarkerIntervals. + if(markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdA)) { + html << "<p>Marker graph edge " << edgeIdA << " has duplicate oriented reads."; + return; + } + if(markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdB)) { + html << "<p>Marker graph edge " << edgeIdB << " has duplicate oriented reads."; + return; + } + + // Check that there are common reads. 
+ MarkerGraphEdgePairInfo info; + SHASTA_ASSERT(analyzeMarkerGraphEdgePair(edgeIdA, edgeIdB, info)); + if(info.common == 0) { + html << "<p>The two edges have no common oriented reads."; + return; + } + + // Local assembly for this assembly step. + mode3::LocalAssembly localAssembly(*this, edgeIdA, edgeIdB, minVertexCoverage, + options, + httpServerData.assemblerOptions->assemblyOptions.mode3Options.localAssemblyOptions, + useA, useB); +} diff --git a/src/AssemblerOptions.cpp b/src/AssemblerOptions.cpp index abc112e..d4cc7a0 100644 --- a/src/AssemblerOptions.cpp +++ b/src/AssemblerOptions.cpp @@ -14,7 +14,11 @@ using namespace shasta; -// Constructor. +// Constructor from a command line. +// If the command line includes a --config option, +// the specified built-in configuration or configuration file +// is used to fill the AssemblyOptions, +// but values specified on the command line take precedence. AssemblerOptions::AssemblerOptions(int argumentCount, const char** arguments) : commandLineOnlyOptionsDescription("Options allowed only on the command line"), configurableOptionsDescription("Options allowed on the command line and in the config file") @@ -124,6 +128,41 @@ AssemblerOptions::AssemblerOptions(int argumentCount, const char** arguments) : +// Constructor from a configuration file. +// This only fills in the configurable options specified in +// the given configuration file. Command line only options +// are left at their defaults. 
+AssemblerOptions::AssemblerOptions(const string& fileName) +{ + + using boost::program_options::positional_options_description; + using boost::program_options::value; + using boost::program_options::variables_map; + using boost::program_options::command_line_parser; + + addConfigurableOptions(); + + ifstream configFile(fileName); + if (not configFile) { + throw runtime_error("Invalid configuration file " + fileName + " specified.\n"); + } + variables_map variablesMap; + store(parse_config_file(configFile, configurableOptionsDescription), variablesMap); + notify(variablesMap); + + // Parse MarkerGraph.simplifyMaxLength. + markerGraphOptions.parseSimplifyMaxLength(); + + // Parse ReadOptions.desiredCoverageString into its numeric value. + readsOptions.parseDesiredCoverageString(); + + // Unpack the consensuscaller and replace relative path with the absolute + // one if necessary. + assemblyOptions.parseConsensusCallerString(); +} + + + // Add non-configurable options to the Boost option description object. // These are options that can only be used on the command line // (not in the configuration file). @@ -252,6 +291,12 @@ void AssemblerOptions::addConfigurableOptions() "This is done by specifying the O_DIRECT flag when opening " "input files containing reads.") + ("Reads.handleDuplicates", + value<string>(&readsOptions.handleDuplicates)-> + default_value("useOneCopy"), + "Controls handling of reads with duplicate names. " + "Can be one of: useAllCopies, useOneCopy, useNone, forbid.") + ("Reads.palindromicReads.skipFlagging", bool_switch(&readsOptions.palindromicReads.skipFlagging)-> default_value(false), @@ -327,8 +372,7 @@ void AssemblerOptions::addConfigurableOptions() ("MinHash.version", value<int>(&minHashOptions.version)-> default_value(0), - "Controls the version of the LowHash algorithm to use. Can be 0 (default) " - "or 1.(experimental).") + "Controls the version of the LowHash algorithm to use. 
Must be 0 (default).") ("MinHash.m", value<int>(&minHashOptions.m)-> @@ -357,12 +401,16 @@ void AssemblerOptions::addConfigurableOptions() ("MinHash.minBucketSize", value<int>(&minHashOptions.minBucketSize)-> default_value(0), - "The minimum bucket size to be used by the LowHash algorithm.") + "The minimum bucket size to be used by the LowHash algorithm. " + "If minBucketSize and maxBucketSize are both 0, they are adjusted automatically " + "at each iteration using simple heuristics.") ("MinHash.maxBucketSize", value<int>(&minHashOptions.maxBucketSize)-> default_value(10), - "The maximum bucket size to be used by the LowHash algorithm.") + "The maximum bucket size to be used by the LowHash algorithm. " + "If minBucketSize and maxBucketSize are both 0, they are adjusted automatically " + "at each iteration using simple heuristics.") ("MinHash.minFrequency", value<int>(&minHashOptions.minFrequency)-> @@ -381,7 +429,7 @@ void AssemblerOptions::addConfigurableOptions() value<int>(&alignOptions.alignMethod)-> default_value(3), "The alignment method to be used to create the read graph & the marker graph. 
" - "0 = old Shasta method, 1 = SeqAn (slow), 3 = banded SeqAn, 4 = new Shasta method (experimental).") + "0 = old Shasta method, 1 = SeqAn (slow), 3 = banded SeqAn, 4 and 5 = experimental.") ("Align.maxSkip", value<int>(&alignOptions.maxSkip)-> @@ -488,6 +536,16 @@ void AssemblerOptions::addConfigurableOptions() default_value(100), "Only used for alignment method 4 (experimental).") + ("Align.align5.driftRateTolerance", + value<double>(&alignOptions.align5DriftRateTolerance)-> + default_value(0.02), + "Maximum allowed drift rate for alignment method 5.") + + ("Align.align5.minBandExtend", + value<uint64_t>(&alignOptions.align5MinBandExtend)-> + default_value(10), + "Minimum band extension for alignment method 5.") + ("ReadGraph.creationMethod", value<int>(&readGraphOptions.creationMethod)-> default_value(0), @@ -501,7 +559,7 @@ void AssemblerOptions::addConfigurableOptions() ("ReadGraph.maxChimericReadDistance", value<int>(&readGraphOptions.maxChimericReadDistance)-> default_value(2), - "Used for chimeric read detection.") + "Used for chimeric read detection. Set to 0 to turn off chimera detection.") ("ReadGraph.strandSeparationMethod", value<uint64_t>(&readGraphOptions.strandSeparationMethod)-> @@ -599,7 +657,7 @@ void AssemblerOptions::addConfigurableOptions() default_value(6), "Minimum edge coverage (number of supporting oriented reads) " "for a marker graph edge to be created." - "Experimental. Only used with --Assembly.mode 1.") + "Only used with --Assembly.mode 2.") ("MarkerGraph.minEdgeCoveragePerStrand", value<uint64_t>(&markerGraphOptions.minEdgeCoveragePerStrand)-> @@ -607,7 +665,7 @@ void AssemblerOptions::addConfigurableOptions() "Minimum edge coverage (number of supporting oriented reads) " "on each strand " "for a marker graph edge to be created." - "Experimental. 
Only used with --Assembly.mode 1.") + "Only used with --Assembly.mode 2.") ("MarkerGraph.allowDuplicateMarkers", bool_switch(&markerGraphOptions.allowDuplicateMarkers)-> @@ -667,11 +725,6 @@ void AssemblerOptions::addConfigurableOptions() "corresponding marker graph edges. A cross edge is defined as an edge v0->v1 " "with out-degree(v0)>1, in-degree(v1)>1.") - ("MarkerGraph.reverseTransitiveReduction", - bool_switch(&markerGraphOptions.reverseTransitiveReduction)-> - default_value(false), - "Perform approximate reverse transitive reduction of the marker graph.") - ("MarkerGraph.peakFinder.minAreaFraction", value<double>(&markerGraphOptions.peakFinderMinAreaFraction)-> default_value(0.08), @@ -924,6 +977,178 @@ void AssemblerOptions::addConfigurableOptions() default_value(false), "Suppress output of haploid representation of the assembly (Mode 2 assembly only).") + ("Assembly.mode3.minPrimaryCoverage", + value<uint64_t>(&assemblyOptions.mode3Options.minPrimaryCoverage)-> + default_value(0), + "Minimum primary coverage. " + "If minPrimaryCoverage and maxPrimaryCoverage are both 0, " + "they are set automatically to appropriate values using a simple heuristic." + "Only used with --Assembly.mode 3.") + + ("Assembly.mode3.maxPrimaryCoverage", + value<uint64_t>(&assemblyOptions.mode3Options.maxPrimaryCoverage)-> + default_value(0), + "Maximum primary coverage. " + "If minPrimaryCoverage and maxPrimaryCoverage are both 0, " + "they are set automatically to appropriate values using a simple heuristic." + "Only used with --Assembly.mode 3.") + + ("Assembly.mode3.primaryGraph.maxLoss", + value<double>(&assemblyOptions.mode3Options.primaryGraphOptions.maxLoss)-> + default_value(0.1), + "Use for weak edge removal in the primary graph. 
" + "(Mode 3 assembly only).") + + ("Assembly.mode3.primaryGraph.crossEdgesLowCoverageThreshold", + value<uint64_t>(&assemblyOptions.mode3Options.primaryGraphOptions.crossEdgesLowCoverageThreshold)-> + default_value(1), + "Low coverage threshold for cross edge removal in the primary graph. " + "(Mode 3 assembly only).") + + ("Assembly.mode3.primaryGraph.crossEdgesHighCoverageThreshold", + value<uint64_t>(&assemblyOptions.mode3Options.primaryGraphOptions.crossEdgesHighCoverageThreshold)-> + default_value(3), + "High coverage threshold for cross edge removal in the primary graph. " + "(Mode 3 assembly only).") + + ("Assembly.mode3.assemblyGraph.detangleToleranceLow", + value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.detangleToleranceLow)-> + default_value(0), + "Used for detangling of the assembly graph " + "(Mode 3 assembly only).") + + ("Assembly.mode3.assemblyGraph.detangleToleranceHigh", + value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.detangleToleranceHigh)-> + default_value(2), + "Used for detangling of the assembly graph " + "(Mode 3 assembly only).") + + ("Assembly.mode3.assemblyGraph.epsilon", + value<double>(&assemblyOptions.mode3Options.assemblyGraphOptions.epsilon)-> + default_value(0.1), + "Epsilon value for the Bayesian model used for detangling the assembly graph " + "(Mode 3 assembly only).") + + ("Assembly.mode3.assemblyGraph.minLogP", + value<double>(&assemblyOptions.mode3Options.assemblyGraphOptions.minLogP)-> + default_value(20.), + "MinLogP value (in dB) for the Bayesian model used for detangling the assembly graph " + "(Mode 3 assembly only).") + + ("Assembly.mode3.assemblyGraph.longBubbleThreshold", + value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.longBubbleThreshold)-> + default_value(5000), + "Long bubble threshold " + "(Mode 3 assembly only).") + + ("Assembly.mode3.assemblyGraph.phaseErrorThreshold", + 
value<double>(&assemblyOptions.mode3Options.assemblyGraphOptions.phaseErrorThreshold)-> + default_value(0.1), + "Phase error threshold for phasing " + "(Mode 3 assembly only).") + + ("Assembly.mode3.assemblyGraph.bubbleErrorThreshold", + value<double>(&assemblyOptions.mode3Options.assemblyGraphOptions.bubbleErrorThreshold)-> + default_value(0.03), + "Bubble error threshold for bubble cleanup " + "(Mode 3 assembly only).") + + ("Assembly.mode3.assemblyGraph.bubbleCleanupMaxOffset", + value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.bubbleCleanupMaxOffset)-> + default_value(1000), + "Maximum bubble offset for bubble cleanup " + "(Mode 3 assembly only).") + + ("Assembly.mode3.assemblyGraph.chainTerminalCommonThreshold", + value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.chainTerminalCommonThreshold)-> + default_value(3), + "Used for bubble cleanup " + "(Mode 3 assembly only).") + + ("Assembly.mode3.assemblyGraph.superbubbleLengthThreshold1", + value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.superbubbleLengthThreshold1)-> + default_value(30000), + "Length threshold used for superbubble cleanup " + "(Mode 3 assembly only).") + + ("Assembly.mode3.assemblyGraph.superbubbleLengthThreshold2", + value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.superbubbleLengthThreshold2)-> + default_value(10000), + "Low length threshold used for superbubble removal " + "(Mode 3 assembly only).") + + ("Assembly.mode3.assemblyGraph.superbubbleLengthThreshold3", + value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.superbubbleLengthThreshold3)-> + default_value(30000), + "High length threshold used for superbubble removal " + "(Mode 3 assembly only).") + + ("Assembly.mode3.assemblyGraph.superbubbleLengthThreshold4", + value<uint64_t>(&assemblyOptions.mode3Options.assemblyGraphOptions.superbubbleLengthThreshold4)-> + default_value(30000), + "Length threshold used for superbubble detangling " + "(Mode 3 
assembly only).") + + ("Assembly.mode3.localAssembly.estimatedOffsetRatio", + value<double>(&assemblyOptions.mode3Options.localAssemblyOptions.estimatedOffsetRatio)-> + default_value(1.1), + "For local assembly, the estimated offset between edgeIdA and edgeIdB gets " + "extended by this ratio to decide how much to extend reads that only appear in edgeIdA or edgeIdB. " + "(Mode 3 assembly only).") + + ("Assembly.mode3.localAssembly.vertexSamplingRate", + value<double>(&assemblyOptions.mode3Options.localAssemblyOptions.vertexSamplingRate)-> + default_value(0.8), + "Vertex sampling rate for local assembly, used to set minVertexCoverage. " + "Only used if minVertexCoverage is 0 on input to mode3::LocalAssembly constructor. " + "(Mode 3 assembly only).") + + ("Assembly.mode3.localAssembly.matchScore", + value<int64_t>(&assemblyOptions.mode3Options.localAssemblyOptions.matchScore)-> + default_value(6), + "Match score for local assembly. (Mode 3 assembly only).") + + ("Assembly.mode3.localAssembly.mismatchScore", + value<int64_t>(&assemblyOptions.mode3Options.localAssemblyOptions.mismatchScore)-> + default_value(-1), + "Mismatch score for local assembly. (Mode 3 assembly only).") + + ("Assembly.mode3.localAssembly.gapScore", + value<int64_t>(&assemblyOptions.mode3Options.localAssemblyOptions.gapScore)-> + default_value(-1), + "Gap score for local assembly. (Mode 3 assembly only).") + + ("Assembly.mode3.localAssembly.maxSkipBases", + value<uint64_t>(&assemblyOptions.mode3Options.localAssemblyOptions.maxSkipBases)-> + default_value(500), + "Number of bases (not markers) that can be skipped by an alignment in local assembly. " + "(Mode 3 assembly only).") + + ("Assembly.mode3.localAssembly.maxDrift", + value<double>(&assemblyOptions.mode3Options.localAssemblyOptions.maxDrift)-> + default_value(0.005), + "The maximum tolerated length drift of each read. " + "Used to compute the band for banded alignments in local assembly. 
" + "(Mode 3 assembly only).") + + ("Assembly.mode3.localAssembly.minHalfBand", + value<uint64_t>(&assemblyOptions.mode3Options.localAssemblyOptions.minHalfBand)-> + default_value(100), + "Minimum half band, in markers, for local assembly. " + "(Mode 3 assembly only).") + + ("Assembly.mode3.localAssembly.minScoreRatio", + value<double>(&assemblyOptions.mode3Options.localAssemblyOptions.minScoreRatio)-> + default_value(0.7), + "Score threshold for discarding alignments in for local assembly. " + "(Mode 3 assembly only).") + + ("Assembly.mode3.localAssembly.maxMsaLength", + value<uint64_t>(&assemblyOptions.mode3Options.localAssemblyOptions.maxMsaLength)-> + default_value(5000), + "Maximum length of a multiple sequence alignment for local assembly. " + "(Mode 3 assembly only).") + ; } @@ -950,6 +1175,7 @@ void ReadsOptions::write(ostream& s) const s << "desiredCoverage = " << desiredCoverageString << "\n"; s << "noCache = " << convertBoolToPythonString(noCache) << "\n"; + s << "handleDuplicates = " << handleDuplicates << "\n"; palindromicReads.write(s); } @@ -1009,6 +1235,8 @@ void AlignOptions::write(ostream& s) const s << "align4.deltaY = " << align4DeltaY << "\n"; s << "align4.minEntryCountPerCell = " << align4MinEntryCountPerCell << "\n"; s << "align4.maxDistanceFromBoundary = " << align4MaxDistanceFromBoundary << "\n"; + s << "align5.driftRateTolerance = " << align5DriftRateTolerance << "\n"; + s << "align5.minBandExtend = " << align5MinBandExtend << "\n"; } @@ -1060,8 +1288,6 @@ void MarkerGraphOptions::write(ostream& s) const s << "pruneIterationCount = " << pruneIterationCount << "\n"; s << "simplifyMaxLength = " << simplifyMaxLength << "\n"; s << "crossEdgeCoverageThreshold = " << crossEdgeCoverageThreshold << "\n"; - s << "reverseTransitiveReduction = " << - convertBoolToPythonString(reverseTransitiveReduction) << "\n"; s << "peakFinder.minAreaFraction = " << peakFinderMinAreaFraction << "\n"; s << "peakFinder.areaStartIndex = " << peakFinderAreaStartIndex 
<< "\n"; @@ -1105,6 +1331,7 @@ void AssemblyOptions::write(ostream& s) const s << "iterative.bridgeRemovalMaxDistance = " << iterativeBridgeRemovalMaxDistance << "\n"; mode2Options.write(s); + mode3Options.write(s); } @@ -1134,6 +1361,68 @@ void Mode2AssemblyOptions::write(ostream& s) const +void Mode3AssemblyOptions::write(ostream& s) const +{ + s << "minPrimaryCoverage = " << minPrimaryCoverage << "\n"; + s << "maxPrimaryCoverage = " << maxPrimaryCoverage << "\n"; + primaryGraphOptions.write(s); + assemblyGraphOptions.write(s); + localAssemblyOptions.write(s); +} + + + +void Mode3AssemblyOptions::PrimaryGraphOptions::write(ostream& s) const +{ + s << "mode3.primaryGraph.maxLoss = " << maxLoss << "\n"; + s << "mode3.primaryGraph.crossEdgesLowCoverageThreshold = " << crossEdgesLowCoverageThreshold << "\n"; + s << "mode3.primaryGraph.crossEdgesHighCoverageThreshold = " << crossEdgesHighCoverageThreshold << "\n"; + +} + + + +void Mode3AssemblyOptions::AssemblyGraphOptions::write(ostream& s) const +{ + s << "mode3.assemblyGraph.detangleToleranceLow = " << detangleToleranceLow << "\n"; + s << "mode3.assemblyGraph.detangleToleranceHigh = " << detangleToleranceHigh << "\n"; + s << "mode3.assemblyGraph.epsilon = " << epsilon << "\n"; + s << "mode3.assemblyGraph.minLogP = " << minLogP << "\n"; + s << "mode3.assemblyGraph.longBubbleThreshold = " << longBubbleThreshold << "\n"; + s << "mode3.assemblyGraph.phaseErrorThreshold = " << phaseErrorThreshold << "\n"; + s << "mode3.assemblyGraph.bubbleErrorThreshold = " << bubbleErrorThreshold << "\n"; + s << "mode3.assemblyGraph.bubbleCleanupMaxOffset = " << bubbleCleanupMaxOffset << "\n"; + s << "mode3.assemblyGraph.chainTerminalCommonThreshold = " << chainTerminalCommonThreshold << "\n"; + s << "mode3.assemblyGraph.superbubbleLengthThreshold1 = " << superbubbleLengthThreshold1 << "\n"; + s << "mode3.assemblyGraph.superbubbleLengthThreshold2 = " << superbubbleLengthThreshold2 << "\n"; + s << 
"mode3.assemblyGraph.superbubbleLengthThreshold3 = " << superbubbleLengthThreshold3 << "\n"; + s << "mode3.assemblyGraph.superbubbleLengthThreshold4 = " << superbubbleLengthThreshold4 << "\n"; +} + + + +void Mode3AssemblyOptions::LocalAssemblyOptions::write(ostream& s) const +{ + s << "mode3.localAssembly.estimatedOffsetRatio = " << estimatedOffsetRatio << "\n"; + s << "mode3.localAssembly.vertexSamplingRate = " << vertexSamplingRate << "\n"; + + s << "mode3.localAssembly.matchScore = " << matchScore << "\n"; + s << "mode3.localAssembly.mismatchScore = " << mismatchScore << "\n"; + s << "mode3.localAssembly.gapScore = " << gapScore << "\n"; + + s << "mode3.localAssembly.maxSkipBases = " << maxSkipBases << "\n"; + + s << "mode3.localAssembly.maxDrift = " << maxDrift << "\n"; + + s << "mode3.localAssembly.minHalfBand = " << minHalfBand << "\n"; + + s << "mode3.localAssembly.minScoreRatio = " << minScoreRatio << "\n"; + + s << "mode3.localAssembly.maxMsaLength = " << maxMsaLength << "\n"; +} + + + void AssemblerOptions::write(ostream& s) const { readsOptions.write(s); diff --git a/src/AssemblerOptions.hpp b/src/AssemblerOptions.hpp index b8b74e9..e1a2f65 100644 --- a/src/AssemblerOptions.hpp +++ b/src/AssemblerOptions.hpp @@ -76,6 +76,7 @@ namespace shasta { class MarkerGraphOptions; class MinHashOptions; class Mode2AssemblyOptions; + class Mode3AssemblyOptions; class PalindromicReadOptions; class ReadsOptions; class ReadGraphOptions; @@ -127,6 +128,16 @@ public: bool noCache; string desiredCoverageString; uint64_t desiredCoverage; + + // String to control handling of duplicate reads. + // Can be one of: + // useAllCopies + // useOneCopy + // useNone + // forbid + // See ReadFlags.hpp for the meaning of each option. 
+ string handleDuplicates; + PalindromicReadOptions palindromicReads; void write(ostream&) const; @@ -195,6 +206,8 @@ public: uint64_t align4DeltaY; uint64_t align4MinEntryCountPerCell; uint64_t align4MaxDistanceFromBoundary; + double align5DriftRateTolerance; + uint64_t align5MinBandExtend; void write(ostream&) const; }; @@ -246,7 +259,6 @@ public: string simplifyMaxLength; double crossEdgeCoverageThreshold; vector<size_t> simplifyMaxLengthVector; - bool reverseTransitiveReduction; double peakFinderMinAreaFraction; uint64_t peakFinderAreaStartIndex; @@ -307,6 +319,104 @@ public: +// Assembly options that are specific to Mode 3 assembly. +// See source code in the mode3 namespace +// (source files with a mode3-) prefix for more information +class shasta::Mode3AssemblyOptions { +public: + + uint64_t minPrimaryCoverage; + uint64_t maxPrimaryCoverage; + + // Options used to clean up the PrimaryGraph. + class PrimaryGraphOptions { + public: + + // Parameter to control removal of weak edges. + double maxLoss; + + // Parameters to control removal of cross edges. + uint64_t crossEdgesLowCoverageThreshold; + uint64_t crossEdgesHighCoverageThreshold; + + void write(ostream&) const; + }; + PrimaryGraphOptions primaryGraphOptions; + + + + class AssemblyGraphOptions { + public: + + // Detangle tolerances. + uint64_t detangleToleranceLow; + uint64_t detangleToleranceHigh; + + // Bayesian model. 
+ double epsilon; + double minLogP; + + // Other thresholds used by the mode3::AssemblyGraph + uint64_t longBubbleThreshold; + double phaseErrorThreshold; + double bubbleErrorThreshold; + uint64_t bubbleCleanupMaxOffset; + uint64_t chainTerminalCommonThreshold; + uint64_t superbubbleLengthThreshold1; + uint64_t superbubbleLengthThreshold2; + uint64_t superbubbleLengthThreshold3; + uint64_t superbubbleLengthThreshold4; + + void write(ostream&) const; + }; + AssemblyGraphOptions assemblyGraphOptions; + + + + // Options used by class mode3::LocalAssembly + class LocalAssemblyOptions { + public: + + // The estimated offset gets extended by this ratio to + // decide how much to extend reads that only appear in edgeIdA or edgeIdB. + double estimatedOffsetRatio; + + // Vertex sampling rate, used to set minVertexCoverage. + // Only used if minVertexCoverage is 0 on input to + // mode3::LocalAssembly constructor. + double vertexSamplingRate; + + // Alignment parameters. + int64_t matchScore; + int64_t mismatchScore; + int64_t gapScore; + + // Number of bases (not markers) that can be skipped by an alignment. + uint64_t maxSkipBases; + + // The maximum tolerated length drift of each read. + // Used to compute the band for banded alignments. + double maxDrift; + + // Minimum half band, in markers. + uint64_t minHalfBand; + + // Minimum ration of scorew to best possible score for + // an alignment to be used. + double minScoreRatio; + + // The maximum length of an MSA alignment we are willing to compute. + uint64_t maxMsaLength; + + void write(ostream&) const; + }; + LocalAssemblyOptions localAssemblyOptions; + + void write(ostream&) const; +}; + + + // Options in the [Assembly] section of the configuration file. // Can also be entered on the command line with option names // beginning with "Assembly.". @@ -343,6 +453,9 @@ public: // Mode 2 assembly options. Mode2AssemblyOptions mode2Options; + // Mode 3 assembly options. 
+ Mode3AssemblyOptions mode3Options; + void write(ostream&) const; // If a relative path is provided for a Bayesian consensus caller @@ -365,9 +478,19 @@ public: MarkerGraphOptions markerGraphOptions; AssemblyOptions assemblyOptions; - // Constructor. + // Constructor from a command line. + // If the command line includes a --config option, + // the specified built-in configuration or configuration file + // is used to fill the AssemblyOptions, + // but values specified on the command line take precedence. AssemblerOptions(int argumentCount, const char** arguments); + // Constructor from a configuration file. + // This only fills in the configurable options specified in + // the given configuration file. Command line only options + // are left at their defaults. + AssemblerOptions(const string& fileName); + // Add configurable options to the Boost option description object. void addCommandLineOnlyOptions(); void addConfigurableOptions(); diff --git a/src/AssemblerReadGraph.cpp b/src/AssemblerReadGraph.cpp index 5389a68..70c4125 100644 --- a/src/AssemblerReadGraph.cpp +++ b/src/AssemblerReadGraph.cpp @@ -642,8 +642,7 @@ void Assembler::computeReadGraphConnectedComponents() const componentMap[componentId].push_back(orientedReadId); } } - cout << "The read graph has " << componentMap.size() << - " connected components." << endl; + // cout << "The read graph has " << componentMap.size() << " connected components." << endl; @@ -1215,8 +1214,7 @@ void Assembler::flagCrossStrandReadGraphEdges2() componentMap[componentId].push_back(orientedReadId); } } - cout << "The read graph has " << componentMap.size() << - " connected components." << endl; + // cout << "The read graph has " << componentMap.size() << " connected components." 
<< endl; diff --git a/src/AssemblerReads.cpp b/src/AssemblerReads.cpp index 22cbc79..da78c56 100644 --- a/src/AssemblerReads.cpp +++ b/src/AssemblerReads.cpp @@ -297,3 +297,13 @@ void Assembler::computeReadIdsSortedByName() reads->computeReadIdsSortedByName(); } + + + +// Find duplicate reads, as determined by name (not sequence). +// This also sets the isDuplicate and discardDueToDuplicates read flags +// and summarizes what it found Duplicates.csv. +void Assembler::findDuplicateReads(const string& handleDuplicates) +{ + reads->findDuplicates(handleDuplicates); +} diff --git a/src/AssemblyGraph.cpp b/src/AssemblyGraph.cpp index 38cd478..9bbbae1 100644 --- a/src/AssemblyGraph.cpp +++ b/src/AssemblyGraph.cpp @@ -1,6 +1,7 @@ #include "AssemblyGraph.hpp" #include "deduplicate.hpp" using namespace shasta; +using namespace mode0; #include "fstream.hpp" #include "iterator.hpp" diff --git a/src/AssemblyGraph.hpp b/src/AssemblyGraph.hpp index bffee58..38d7df1 100644 --- a/src/AssemblyGraph.hpp +++ b/src/AssemblyGraph.hpp @@ -25,12 +25,14 @@ vertex in the assembly graph. #include <limits> namespace shasta { - class AssemblyGraph; + namespace mode0 { + class AssemblyGraph; + } } -class shasta::AssemblyGraph { +class shasta::mode0::AssemblyGraph { public: // Use the same vertex and edge ids of the marker graph. 
diff --git a/src/AssemblyGraph2.cpp b/src/AssemblyGraph2.cpp index 5b41dfa..07b9d89 100644 --- a/src/AssemblyGraph2.cpp +++ b/src/AssemblyGraph2.cpp @@ -45,6 +45,7 @@ AssemblyGraph2::AssemblyGraph2( uint64_t readRepresentation, uint64_t k, // Marker length const MemoryMapped::Vector<ReadFlags>& readFlags, + const Reads& reads, const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, MarkerGraph& markerGraph, uint64_t pruneLength, @@ -57,6 +58,7 @@ AssemblyGraph2::AssemblyGraph2( readRepresentation(readRepresentation), k(k), readFlags(readFlags), + reads(reads), markers(markers), markerGraph(markerGraph) { @@ -649,7 +651,7 @@ AssemblyGraph2::edge_descriptor AssemblyGraph2::addEdge( (*this)[e].storeReadInformation(markerGraph); } if(assemble) { - AssemblyGraph2::assemble(e); + AssemblyGraph2::assemble(e, reads); } return e; @@ -744,7 +746,7 @@ void AssemblyGraph2::assemble() // Use assembled sequence from the marker graph to obtain // assembled sequence for all edges. BGL_FORALL_EDGES(e, g, G) { - assemble(e); + assemble(e, reads); } performanceLog << timestamp << "AssemblyGraph2::assemble ends." << endl; @@ -785,7 +787,7 @@ void AssemblyGraph2::assembleThreadFunction(size_t threadId) // Loop over all edges in this batch. for(uint64_t i=begin; i!=end; i++) { const edge_descriptor e = assembleParallelData.allEdges[i]; - assemble(e); + assemble(e, reads); } } } @@ -793,7 +795,7 @@ void AssemblyGraph2::assembleThreadFunction(size_t threadId) // Assemble sequence for every marker graph path of a given edge. 
-void AssemblyGraph2::assemble(edge_descriptor e) +void AssemblyGraph2::assemble(edge_descriptor e, const Reads& reads) { G& g = *this; @@ -807,7 +809,7 @@ void AssemblyGraph2::assemble(edge_descriptor e) MarkerGraph::EdgeId const * const end = begin + path.size(); const span<const MarkerGraph::EdgeId> pathSpan(begin, end); assembleMarkerGraphPath(readRepresentation, k, - markers, markerGraph, pathSpan, false, assembledSegment); + reads, markers, markerGraph, pathSpan, false, assembledSegment); @@ -2058,43 +2060,6 @@ uint64_t AssemblyGraph2Edge::countCommonSuffixBases() const - -// Figure out if this is a bubble is caused by copy number -// differences in repeats of period up to maxPeriod. -// If this is the case, returns the shortest period for which this is true. -// Otherwise, returns 0. -void AssemblyGraph2Edge::computeCopyNumberDifferencePeriod(uint64_t maxPeriod) -{ - if(not isBubble()) { - period = 0; - } - - // Check all pairs of branches. - vector<uint64_t> periods; - for(uint64_t i=0; i<branches.size()-1; i++) { - const vector<Base>& iSequence = branches[i].rawSequence; - for(uint64_t j=i+1; j<branches.size(); j++) { - const vector<Base>& jSequence = branches[j].rawSequence; - const uint64_t pairPeriod = shasta::isCopyNumberDifference(iSequence, jSequence, maxPeriod); - if(pairPeriod == 0) { - period = 0; - return; - } - periods.push_back(pairPeriod); - } - } - deduplicate(periods); - - - if(periods.size() == 1) { - period = periods.front(); - } else { - period = 0; - } -} - - - // Compute the edit distance between the sequences of the two branches. // This can only be called for a diploid bubble (2 branches). uint64_t AssemblyGraph2Edge::bubbleEditDistance() const @@ -2686,7 +2651,7 @@ AssemblyGraph2::edge_descriptor AssemblyGraph2::mergeWithPreviousIfPossible(edge newBranch.storeReadInformation(markerGraph); // Compute sequence for the updated edge. - assemble(eNew); + assemble(eNew, reads); // Remove the edges we are merging. 
boost::remove_edge(e, g); @@ -2754,7 +2719,7 @@ AssemblyGraph2::edge_descriptor AssemblyGraph2::mergeWithFollowingIfPossible(edg newBranch.storeReadInformation(markerGraph); // Compute sequence for the updated edge. - assemble(eNew); + assemble(eNew, reads); // Remove the edges we are merging. boost::remove_edge(e, g); @@ -3725,7 +3690,7 @@ void AssemblyGraph2::handleSuperbubble1( g[eNew].storeReadInformation(markerGraph); } if(assemble) { - AssemblyGraph2::assemble(eNew); + AssemblyGraph2::assemble(eNew, reads); } } diff --git a/src/AssemblyGraph2.hpp b/src/AssemblyGraph2.hpp index d475659..2b372d8 100644 --- a/src/AssemblyGraph2.hpp +++ b/src/AssemblyGraph2.hpp @@ -214,11 +214,6 @@ public: uint64_t backwardTransferCount = 0; uint64_t forwardTransferCount = 0; - // Figure out if this is a bubble is caused by copy number - // differences in repeats of period up to maxPeriod. - // If this is the case, stores the shortest period for which this is true. - // Otherwise, stores 0 as the period. - void computeCopyNumberDifferencePeriod(uint64_t maxPeriod); uint64_t period = 0; string color(uint64_t branchId) const; @@ -258,6 +253,7 @@ public: uint64_t readRepresentation, uint64_t k, // Marker length const MemoryMapped::Vector<ReadFlags>& readFlags, + const Reads& reads, const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, MarkerGraph&, uint64_t pruneLength, @@ -314,6 +310,7 @@ private: uint64_t readRepresentation; uint64_t k; const MemoryMapped::Vector<ReadFlags>& readFlags; + const Reads& reads; const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers; public: uint64_t getReadCount() const @@ -447,7 +444,7 @@ private: AssembleParallelData assembleParallelData; // Assemble sequence for every marker graph path of a given edge. - void assemble(edge_descriptor); + void assemble(edge_descriptor, const Reads&); // Store GFA sequence in each edge. 
void storeGfaSequence(); diff --git a/src/AssemblyPathGraph.cpp b/src/AssemblyPathGraph.cpp index c482831..792b586 100644 --- a/src/AssemblyPathGraph.cpp +++ b/src/AssemblyPathGraph.cpp @@ -3,6 +3,7 @@ #include "deduplicate.hpp" #include "html.hpp" using namespace shasta; +using namespace mode0; // Boost libraries. #include <boost/graph/graphviz.hpp> diff --git a/src/AssemblyPathGraph.hpp b/src/AssemblyPathGraph.hpp index 8cc8d56..e863f7f 100644 --- a/src/AssemblyPathGraph.hpp +++ b/src/AssemblyPathGraph.hpp @@ -128,15 +128,14 @@ namespace shasta { ostream&, const AssemblyPathGraphEdge&); - class AssemblyGraph; } class shasta::AssemblyPathGraphVertex { public: - AssemblyGraph::VertexId vertexId; - AssemblyPathGraphVertex(AssemblyGraph::VertexId vertexId) : + mode0::AssemblyGraph::VertexId vertexId; + AssemblyPathGraphVertex(mode0::AssemblyGraph::VertexId vertexId) : vertexId(vertexId) {} AssemblyPathGraphBaseClass::vertex_descriptor reverseComplementVertex = @@ -149,7 +148,7 @@ class shasta::AssemblyPathGraphEdge { public: // The AsssemblyGraph path corresponding to this edge. - vector <AssemblyGraph::EdgeId> path; + vector <mode0::AssemblyGraph::EdgeId> path; // The length of the path, as measured on the marker graph. uint64_t pathLength = 0; @@ -172,7 +171,7 @@ public: // Initialize the path to a single AssemblyGraph edge. - AssemblyPathGraphEdge(AssemblyGraph::EdgeId edgeId) : + AssemblyPathGraphEdge(mode0::AssemblyGraph::EdgeId edgeId) : path(1, edgeId) {} AssemblyPathGraphEdge() {} @@ -261,7 +260,7 @@ public: // The constructor does not fill in the oriented read ids for each edge. // This must be done separately (see Assembler::detangle). - AssemblyPathGraph(const AssemblyGraph&); + AssemblyPathGraph(const mode0::AssemblyGraph&); // The tangles currently present in the graph, keyed by their ids. 
TangleId nextTangleId = 0; @@ -273,7 +272,7 @@ public: void fillReverseComplementNewEdges( const vector<edge_descriptor>& newEdges, - const AssemblyGraph&); + const mode0::AssemblyGraph&); // Initial creation of all tangles. void createTangles(); @@ -302,7 +301,7 @@ public: // for GFA output. void detangle( double basesPerMarker, - const AssemblyGraph&); + const mode0::AssemblyGraph&); // Detangle a single tangle. // This does not fill in the reverseComplementEdge of newly created edges, diff --git a/src/AssemblyPathGraph2.cpp b/src/AssemblyPathGraph2.cpp index 00c8cd4..497b242 100644 --- a/src/AssemblyPathGraph2.cpp +++ b/src/AssemblyPathGraph2.cpp @@ -3,6 +3,7 @@ #include "deduplicate.hpp" #include "html.hpp" using namespace shasta; +using namespace mode0; // Boost libraries. #include <boost/graph/graphviz.hpp> diff --git a/src/AssemblyPathGraph2.hpp b/src/AssemblyPathGraph2.hpp index 1c8ae26..786c818 100644 --- a/src/AssemblyPathGraph2.hpp +++ b/src/AssemblyPathGraph2.hpp @@ -132,15 +132,17 @@ namespace shasta { ostream&, const AssemblyPathGraph2Edge&); - class AssemblyGraph; + namespace mode0 { + class AssemblyGraph; + } } class shasta::AssemblyPathGraph2Vertex { public: - AssemblyGraph::VertexId vertexId; - AssemblyPathGraph2Vertex(AssemblyGraph::VertexId vertexId) : + mode0::AssemblyGraph::VertexId vertexId; + AssemblyPathGraph2Vertex(mode0::AssemblyGraph::VertexId vertexId) : vertexId(vertexId) {} AssemblyPathGraph2BaseClass::vertex_descriptor reverseComplementVertex = @@ -153,7 +155,7 @@ class shasta::AssemblyPathGraph2Edge { public: // The AsssemblyGraph path corresponding to this edge. - vector <AssemblyGraph::EdgeId> path; + vector <mode0::AssemblyGraph::EdgeId> path; // The length of the path, as measured on the marker graph. uint64_t pathLength = 0; @@ -176,7 +178,7 @@ public: // Initialize the path to a single AssemblyGraph edge. 
- AssemblyPathGraph2Edge(AssemblyGraph::EdgeId edgeId) : + AssemblyPathGraph2Edge(mode0::AssemblyGraph::EdgeId edgeId) : path(1, edgeId) {} AssemblyPathGraph2Edge() {} @@ -282,7 +284,7 @@ public: // The constructor does not fill in the oriented read ids for each edge. // This must be done separately (see Assembler::detangle2). AssemblyPathGraph2( - const AssemblyGraph&, + const mode0::AssemblyGraph&, uint64_t diagonalReadCountMin, uint64_t offDiagonalReadCountMax, double detangleOffDiagonalRatio); @@ -302,7 +304,7 @@ public: void fillReverseComplementNewEdges( const vector<edge_descriptor>& newEdges, - const AssemblyGraph&); + const mode0::AssemblyGraph&); // Initial creation of all tangles. void createTangles(); @@ -331,7 +333,7 @@ public: // for GFA output. void detangle( double basesPerMarker, - const AssemblyGraph&); + const mode0::AssemblyGraph&); // Detangle a single tangle. // This does not fill in the reverseComplementEdge of newly created edges, diff --git a/src/CompressedAssemblyGraph.cpp b/src/CompressedAssemblyGraph.cpp index d120643..5e5767e 100644 --- a/src/CompressedAssemblyGraph.cpp +++ b/src/CompressedAssemblyGraph.cpp @@ -10,6 +10,7 @@ #include "runCommandWithTimeout.hpp" #include "subgraph.hpp" using namespace shasta; +using namespace mode0; // Boost libraries. #include <boost/algorithm/string.hpp> diff --git a/src/CompressedAssemblyGraph.hpp b/src/CompressedAssemblyGraph.hpp index e74a3cf..ca7e6ba 100644 --- a/src/CompressedAssemblyGraph.hpp +++ b/src/CompressedAssemblyGraph.hpp @@ -36,9 +36,9 @@ namespace shasta { class shasta::CompressedAssemblyGraphVertex { public: - AssemblyGraph::VertexId vertexId; + mode0::AssemblyGraph::VertexId vertexId; - CompressedAssemblyGraphVertex(AssemblyGraph::VertexId vertexId) : + CompressedAssemblyGraphVertex(mode0::AssemblyGraph::VertexId vertexId) : vertexId(vertexId) {} }; @@ -51,10 +51,10 @@ public: // with this edge. 
// This includes the assembly graph vertices // associated with the source and target of this edge. - vector<AssemblyGraph::VertexId> vertices; + vector<mode0::AssemblyGraph::VertexId> vertices; // The chain of sets of parallel assembly graph edges. - vector< vector<AssemblyGraph::EdgeId> > edges; + vector< vector<mode0::AssemblyGraph::EdgeId> > edges; // An id assigned to this edge of the compressed assembly graph // and used in gfa and other output. @@ -68,7 +68,7 @@ public: { return double(minMarkerCount + maxMarkerCount) / 2.; } - void fillMarkerCounts(const AssemblyGraph&); + void fillMarkerCounts(const mode0::AssemblyGraph&); // Find the oriented reads that appear in marker graph vertices // internal to this edge of the compressed assembly graph. @@ -99,8 +99,8 @@ private: class shasta::CompressedAssemblyGraph : public CompressedAssemblyGraphBaseClass { public: - using VertexId = AssemblyGraph::VertexId; - using EdgeId = AssemblyGraph::EdgeId; + using VertexId = mode0::AssemblyGraph::VertexId; + using EdgeId = mode0::AssemblyGraph::EdgeId; // Create the CompressedAssemblyGraph from the AssemblyGraph. @@ -180,7 +180,7 @@ private: // Create an edge for each set of parallel edges of the assembly graph. void createEdges( - const AssemblyGraph&, + const mode0::AssemblyGraph&, const vector<vertex_descriptor>& vertexTable ); @@ -195,10 +195,10 @@ private: // Fill in the assembly graph edges that go into each // edge of the compressed assembly graph. - void fillContributingEdges(const AssemblyGraph&); + void fillContributingEdges(const mode0::AssemblyGraph&); // Fill in minimum and maximum marker counts for each edge. - void fillMarkerCounts(const AssemblyGraph&); + void fillMarkerCounts(const mode0::AssemblyGraph&); // Find the oriented reads that appear in marker graph vertices // internal to each edge of the compressed assembly graph. 
diff --git a/src/ConfigurationTable.cpp b/src/ConfigurationTable.cpp index 8e69180..7e8d064 100644 --- a/src/ConfigurationTable.cpp +++ b/src/ConfigurationTable.cpp @@ -1609,6 +1609,51 @@ mode2.bubbleRemoval.minConcordantReadCount = 2 +)zzz"}, + {"Nanopore-ncm23-May2024", R"zzz(# This assembly configuration is for nanopore reads generated using the +# "Experimental extremely high-accuracy, ultra-long sequencing kit" +# from the ONT December 2023 data release: +# https://labs.epi2me.io/gm24385_ncm23_preview/ + +# It uses Mode 3 assembly to create a phased assembly. +# It was only tested for a human genome at coverage 40x to 60x, +# but it should work at lower or higher coverage, +# within reasonable limits, because it includes some +# provisions for coverage adaptivity. + +[Reads] +representation = 0 +minReadLength = 10000 +noCache = True +palindromicReads.deltaThreshold = 300 + +[Kmers] +k = 30 +probability = 0.05 + +[MinHash] +minHashIterationCount = 50 +minBucketSize = 0 +maxBucketSize = 0 +minFrequency = 5 + +[Align] +alignMethod = 5 +sameChannelReadAlignment.suppressDeltaThreshold = 30 +minAlignedMarkerCount = 1000 +minAlignedFraction = 0.9 +maxSkip = 20 +maxDrift = 10 +maxTrim = 20 + +[ReadGraph] +maxAlignmentCount = 20 +strandSeparationMethod = 2 + +[Assembly] +mode = 3 + + )zzz"} }; } diff --git a/src/HashedKmerChecker.cpp b/src/HashedKmerChecker.cpp new file mode 100644 index 0000000..6fc3c03 --- /dev/null +++ b/src/HashedKmerChecker.cpp @@ -0,0 +1,118 @@ +// Shasta. +#include "HashedKmerChecker.hpp" +#include "Kmer.hpp" +#include "MemoryMappedObject.hpp" +using namespace shasta; + +// MurmurHash. +#include "MurmurHash2.hpp" + +// Standard library. +#include <cmath> + + + +// We must guarantee that if a KmerId is a marker +// its reverse complement is also a marker. +// To do this we check both. +// This will usually require two calls to MurmurHash2, +// but this is probably still faster than two cache misses +// in the old k-mer table. 
+bool HashedKmerChecker::isMarker(KmerId kmerId) const +{ + // Check the KmerId. + if(MurmurHash2(&kmerId, sizeof(kmerId), 267457831) < hashThreshold) { + return true; + } + + // Check its reverse complement. + const Kmer kmer(kmerId, k); + const Kmer kmerRc = kmer.reverseComplement(k); + const KmerId kmerIdRc = KmerId(kmerRc.id(k)); + return MurmurHash2(&kmerIdRc, sizeof(kmerId), 267457831) < hashThreshold; +} + + + +// Initial creation. +HashedKmerChecker::HashedKmerChecker( + uint64_t k, + double markerDensity, + const MappedMemoryOwner& mappedMemoryOwner) : + MappedMemoryOwner(mappedMemoryOwner), + k(k) +{ + // Sanity check on the marker density. + if(markerDensity<0. || markerDensity>1.) { + throw runtime_error("Invalid marker density " + + to_string(markerDensity) + " requested."); + } + + + + // Compute the hash threshold that achieves the required marker density. + + // In this computation, we neglect self-complementary k-mers, + // which are a small minority of total. + + // Call: + // - hashMax the maximum possible value of a hash + // - hashValue the hash value for a given KmerId + // - hashValueRc the hash value for its reverse complement (for length k) + // - p = hashThreshold / hashMax + + // A KmerId is a marker if + // Event A: hashValue < hashThreshold + // OR + // Event B: hashValueRc < hashThreshold + // Event A occurs with probability P(A) = hashThreshold / hashMax = p. + // Event B also occurs with probability P(B) = hashThreshold / hashMax = p. + + // If we use a good hash function, we can consider A and B uncorrelated. + // Therefore we can use the standard formula: + // P(A or B) = 1 - P(not(A or B)) = + // 1 - P((not A) and (not B)) = + // 1 - P(not A) * P(not B) = + // 1 - (1 - P(A)) * (1 - P(B)) + // It can also be verified by simple algebra that this is equal to the standard formula + // P(A or B) = + // P(A) + P(B) - P(A and B) = + // P(A) + P(B) - P(A) * P(B) + // but we don't need this part. 
+ + // Using the above we get: + // markerDensity = + // P(A or B) = + // 1 - (1 - P(A)) * (1 - P(B)) = + // 1 - (1 - p)^2 + // (Because P(A) = P(B) = p). + // From + // markerDensity = 1 - (1 - p)^2 + // we get + // p = 1 - sqrt(1 - markerDensity) + // And finally hashThreshold = hashMax * p. + + const double p = 1. - std::sqrt(1. - markerDensity); + const double hashMax = std::numeric_limits<uint32_t> :: max(); + hashThreshold = uint32_t(std::round(double(hashMax) * p)); + + // Store k and the hash threshold in binary data. + MemoryMapped::Object<HashedKmerCheckerData> data; + data.createNew(largeDataName("HashedKmerChecker"), largeDataPageSize); + data->k = k; + data->hashThreshold = hashThreshold; + +} + + + +// Creation from binary data. +HashedKmerChecker::HashedKmerChecker( + const MappedMemoryOwner& mappedMemoryOwner) : + MappedMemoryOwner(mappedMemoryOwner) +{ + MemoryMapped::Object<HashedKmerCheckerData> data; + data.accessExistingReadOnly(largeDataName("HashedKmerChecker")); + k = data->k; + hashThreshold = data->hashThreshold; +} diff --git a/src/HashedKmerChecker.hpp b/src/HashedKmerChecker.hpp new file mode 100644 index 0000000..36b1f4c --- /dev/null +++ b/src/HashedKmerChecker.hpp @@ -0,0 +1,43 @@ +#ifndef SHASTA_HASHED_KMER_CHECKER_HPP +#define SHASTA_HASHED_KMER_CHECKER_HPP + +#include "KmerChecker.hpp" +#include "MappedMemoryOwner.hpp" + +namespace shasta { + class HashedKmerChecker; +} + + +// The new implementation of the KmerChecker is not table based +// and uses hashing instead. +// It only supports marker generation method 0 (random generation) +// but allows marker lengths k<32. +class shasta::HashedKmerChecker : + public KmerChecker, + public MappedMemoryOwner { +public: + bool isMarker(KmerId) const; + + // Initial creation. + HashedKmerChecker(uint64_t k, double markerDensity, const MappedMemoryOwner&); + + // Creation from binary data. 
+ HashedKmerChecker(const MappedMemoryOwner&); + +private: + uint64_t k; + uint32_t hashThreshold; + + // This is used to store the hashThreshold in binary data. + class HashedKmerCheckerData { + public: + uint64_t k; + uint32_t hashThreshold; + }; +}; + + + +#endif + diff --git a/src/HttpServer.hpp b/src/HttpServer.hpp index 186d7a8..51dc138 100644 --- a/src/HttpServer.hpp +++ b/src/HttpServer.hpp @@ -114,7 +114,14 @@ public: } try { std::istringstream s(next); - s >> value; + if constexpr(std::is_same_v<string, T>) { + // For string use getline to process correctly strings containing spaces. + // The constexpr is needed to force evaluation at compile time + // (otherwise this branch does not compile for types other than string). + std::getline(s, value); + } else { + s >> value; + } } catch (...) { return false; } diff --git a/src/Kmer.hpp b/src/Kmer.hpp index aae6cfd..a7da37b 100644 --- a/src/Kmer.hpp +++ b/src/Kmer.hpp @@ -10,31 +10,12 @@ namespace shasta { // Type used to represent a k-mer. // This limits the maximum k-mer length that can be used. // If this changes, KmerId must also be changed. - using Kmer = ShortBaseSequence16; + using Kmer16 = ShortBaseSequence16; + using Kmer32 = ShortBaseSequence32; + using Kmer = Kmer32; static_assert( std::numeric_limits<KmerId>::digits == 2*Kmer::capacity, "Kmer and KmerId types are inconsistent."); - - class KmerInfo; } - - -class shasta::KmerInfo { -public: - - // Frequency of this k-mer in input reads. - // Only filled in if selectKmersBasedOnFrequency - // is used. - uint64_t frequency = 0; - - KmerId reverseComplementedKmerId; - bool isMarker; - bool isRleKmer; - - // Hash function of the KmerId, used for downsampling markers - // for alignments using method 3. 
- uint32_t hash; -}; - #endif diff --git a/src/KmerChecker.hpp b/src/KmerChecker.hpp new file mode 100644 index 0000000..c7b17fd --- /dev/null +++ b/src/KmerChecker.hpp @@ -0,0 +1,23 @@ +#ifndef SHASTA_KMER_CHECKER_HPP +#define SHASTA_KMER_CHECKER_HPP + +// Shasta. +#include "shastaTypes.hpp" + +namespace shasta { + class KmerChecker; + class HashedKmerChecker; +} + + + +// The KmerChecker is an abstract class that knows how to find +// out if a k-mer is a marker. +// All implementations must guarantee that if a KmerId is a marker +// its reverse complement is also a marker. +class shasta::KmerChecker { +public: + virtual bool isMarker(KmerId) const = 0; +}; + +#endif diff --git a/src/KmerCheckerFactory.cpp b/src/KmerCheckerFactory.cpp new file mode 100644 index 0000000..ab22c53 --- /dev/null +++ b/src/KmerCheckerFactory.cpp @@ -0,0 +1,118 @@ +#include "KmerCheckerFactory.hpp" +#include "Kmer.hpp" +#include "KmerTable.hpp" +#include "HashedKmerChecker.hpp" +#include "AssemblerOptions.hpp" +#include "Reads.hpp" +using namespace shasta; + + + +std::shared_ptr<KmerChecker> KmerCheckerFactory::createNew( + const KmersOptions& kmersOptions, + uint64_t threadCount, + const Reads& reads, + const MappedMemoryOwner& mappedMemoryOwner) +{ + // For generation method 0, always use the HashedKmerChecker. + if(kmersOptions.generationMethod == 0) { + return make_shared<HashedKmerChecker>( + kmersOptions.k, + kmersOptions.probability, + mappedMemoryOwner); + } + + // In all other cases, we are limited to k<=16. 
+ if(kmersOptions.k > int(Kmer16::capacity)) { + throw runtime_error("Kmer generation method " + + to_string(kmersOptions.generationMethod) + + " is only supported for a maximum marker length of 15."); + } + + const int seed = 231; + switch(kmersOptions.generationMethod) { + case 0: + return make_shared<KmerTable0>( + kmersOptions.k, + kmersOptions.probability, + seed, + mappedMemoryOwner); + + case 1: + return make_shared<KmerTable1>( + kmersOptions.k, + kmersOptions.probability, + seed, + kmersOptions.enrichmentThreshold, + reads, + threadCount, + mappedMemoryOwner); + + case 2: + return make_shared<KmerTable2>( + kmersOptions.k, + kmersOptions.probability, + seed, + kmersOptions.enrichmentThreshold, + reads, + threadCount, + mappedMemoryOwner); + + case 3: + return make_shared<KmerTable3>( + kmersOptions.k, + reads.representation, + kmersOptions.file, + mappedMemoryOwner); + + case 4: + return make_shared<KmerTable4>( + kmersOptions.k, + kmersOptions.probability, + seed, + kmersOptions.distanceThreshold, + reads, + threadCount, + mappedMemoryOwner); + + default: + throw runtime_error("Invalid --Kmers generationMethod. " + "Specify a value between 0 and 4, inclusive."); + } +} + + + +std::shared_ptr<shasta::KmerChecker> KmerCheckerFactory::createFromBinaryData( + uint64_t k, + uint64_t generationMethod, + const Reads& reads, + const MappedMemoryOwner& mappedMemoryOwner) +{ + // For generation method 0, always use the HashedKmerChecker. 
+ if(generationMethod == 0) { + return make_shared<HashedKmerChecker>(mappedMemoryOwner); + } + + switch(generationMethod) { + case 0: + return make_shared<KmerTable0>(k, mappedMemoryOwner); + + case 1: + return make_shared<KmerTable1>(k, reads, mappedMemoryOwner); + + case 2: + return make_shared<KmerTable2>(k, reads, mappedMemoryOwner); + + case 3: + return make_shared<KmerTable3>(k, mappedMemoryOwner); + + case 4: + return make_shared<KmerTable4>(k, reads, mappedMemoryOwner); + + + default: + throw runtime_error("Invalid --Kmers generationMethod. " + "Specify a value between 0 and 4, inclusive."); + } +} diff --git a/src/KmerCheckerFactory.hpp b/src/KmerCheckerFactory.hpp new file mode 100644 index 0000000..b934727 --- /dev/null +++ b/src/KmerCheckerFactory.hpp @@ -0,0 +1,39 @@ +#ifndef SHASTA_KMER_CHECKER_FACTORY_HPP +#define SHASTA_KMER_CHECKER_FACTORY_HPP + +// Shasta. +#include "KmerChecker.hpp" +#include "memory.hpp" + +namespace shasta { + class KmerCheckerFactory; + + class KmerChecker; + class KmersOptions; + class Reads; + class MappedMemoryOwner; + +} + + + +// The KmerCheckerFactory knows how to create the appropriate +// type of KmerChecker for the options used. +class shasta::KmerCheckerFactory { +public: + + static shared_ptr<KmerChecker> createNew( + const KmersOptions&, + uint64_t threadCount, + const Reads&, + const MappedMemoryOwner&); + + static shared_ptr<KmerChecker> createFromBinaryData( + uint64_t k, + uint64_t generationMethod, + const Reads&, + const MappedMemoryOwner&); +}; + +#endif + diff --git a/src/AssemblerKmers.cpp b/src/KmerTable.cpp index 37ce89f..01fd0ad 100644 --- a/src/AssemblerKmers.cpp +++ b/src/KmerTable.cpp @@ -1,46 +1,31 @@ // Shasta. -#include "Assembler.hpp" +#include "KmerTable.hpp" +#include "AssemblerOptions.hpp" #include "deduplicate.hpp" -#include "MurmurHash2.hpp" +#include "Kmer.hpp" #include "Reads.hpp" -#include "timestamp.hpp" using namespace shasta; // Standard library. 
#include "fstream.hpp" #include <random> - - -void Assembler::accessKmers() -{ - kmerTable.accessExistingReadOnly(largeDataName("Kmers")); - if(kmerTable.size() != (1ULL<< (2*assemblerInfo->k))) { - throw runtime_error("Size of k-mer vector is inconsistent with stored value of k."); - } -} - -void Assembler::checkKmersAreOpen()const -{ - if(!kmerTable.isOpen) { - throw runtime_error("Kmers are not accessible."); - } -} +// Explicit template instantiations. +#include "MultithreadedObject.tpp" +template class MultithreadedObject<KmerTable1>; +template class MultithreadedObject<KmerTable2>; +template class MultithreadedObject<KmerTable4>; // Randomly select the k-mers to be used as markers. -void Assembler::randomlySelectKmers( - size_t k, // k-mer length. +KmerTable0::KmerTable0( + uint64_t k, double probability, // The probability that a k-mer is selected as a marker. - int seed // For random number generator. -) + int seed , // For random number generator. + const MappedMemoryOwner& mappedMemoryOwner + ) : KmerTable(k, true, mappedMemoryOwner) { - // Sanity check on the value of k, then store it. - if(k > Kmer::capacity) { - throw runtime_error("K-mer capacity exceeded."); - } - assemblerInfo->k = k; // The total number of k-mers of this length. // This includes both RLE and non-RLE k-mers. @@ -57,12 +42,6 @@ void Assembler::randomlySelectKmers( - // Fill in the fields of the k-mer table - // that depends only on k. - initializeKmerTable(); - - - // Compute the probability p with which we select // each k-mer and its reverse complement // in order to achieve the required k-mer fraction. @@ -98,57 +77,31 @@ void Assembler::randomlySelectKmers( } - // Do some counting. 
- uint64_t rleKmerCount = 0; - uint64_t markerKmerCount = 0; - uint64_t markerRleKmerCount = 0; - for(uint64_t kmerId=0; kmerId<kmerCount; kmerId++) { - const KmerInfo& kmerInfo = kmerTable[kmerId]; - if(kmerInfo.isRleKmer) { - ++rleKmerCount; - } - if(kmerInfo.isMarker) { - ++markerKmerCount; - } - if(kmerInfo.isRleKmer and kmerInfo.isMarker) { - ++markerRleKmerCount; - } - } - - - - // Summary messages. - if(assemblerInfo->readRepresentation == 0) { +} - // We are using the raw representation of the reads. - cout << "Total number of k-mers of length " << k << " is " << kmerCount << endl; - cout << "Of those, " << markerKmerCount << " will be used as markers." << endl; - cout << "Fraction of k-mers used as markers: requested " << - probability << ", actual " << - double(markerKmerCount)/double(kmerCount) << "." << endl; +KmerTable::KmerTable( + uint64_t k, + bool createNew, + const MappedMemoryOwner& mappedMemoryOwner) : + MappedMemoryOwner(mappedMemoryOwner), k(k) +{ + if(createNew) { + createKmerTable(); } else { - - // We are using the RLE representation of the reads. - cout << "Total number of k-mers of length " << k << " is " << kmerCount << endl; - cout << "Number of RLE k-mers of length " << k << " is " << rleKmerCount << endl; - cout << "Of those, " << markerRleKmerCount << " will be used as markers." << endl; - cout << "Fraction of k-mers used as markers: requested " << - probability << ", actual " << - double(markerRleKmerCount)/double(rleKmerCount) << "." << endl; - + accessKmerTable(); } - } -void Assembler::initializeKmerTable() +void KmerTable::createKmerTable() { + SHASTA_ASSERT(k <= Kmer16::capacity); + // Create the kmer table with the necessary size. 
kmerTable.createNew(largeDataName("Kmers"), largeDataPageSize); - const size_t k = assemblerInfo->k; const size_t kmerCount = 1ULL << (2ULL*k); kmerTable.resize(kmerCount); @@ -156,7 +109,7 @@ void Assembler::initializeKmerTable() for(uint64_t kmerId=0; kmerId<kmerCount; kmerId++) { const Kmer kmer(kmerId, k); const Kmer reverseComplementedKmer = kmer.reverseComplement(k); - kmerTable[kmerId].reverseComplementedKmerId = KmerId(reverseComplementedKmer.id(k)); + kmerTable[kmerId].reverseComplementedKmerId = KmerId16(reverseComplementedKmer.id(k)); } for(uint64_t kmerId=0; kmerId<kmerCount; kmerId++) { const uint64_t reverseComplementedKmerId = kmerTable[kmerId].reverseComplementedKmerId; @@ -178,48 +131,23 @@ void Assembler::initializeKmerTable() } } - - // Fill in hash values used for downsampling. - for(uint64_t kmerId=0; kmerId<kmerCount; kmerId++) { - const uint64_t n = kmerId + kmerTable[kmerId].reverseComplementedKmerId; - kmerTable[kmerId].hash = MurmurHash2(&n, sizeof(n), 13477); - } - } -void Assembler::writeKmers(const string& fileName) const +void KmerTable::accessKmerTable() { - checkKmersAreOpen(); - - // Get the k-mer length. - const size_t k = assemblerInfo->k; - const size_t kmerCount = 1ULL << (2ULL*k); - SHASTA_ASSERT(kmerTable.size() == kmerCount); - - // Open the output file and write the header line. - ofstream file(fileName); - file << "KmerId,Kmer,IsMarker,ReverseComplementedKmerId,ReverseComplementedKmer\n"; - - // Write a line for each k-mer. 
- for(uint64_t kmerId=0; kmerId<kmerCount; kmerId++) { - file << kmerId << ","; - file << Kmer(kmerId, k) << ","; - file << int(kmerTable[kmerId].isMarker) << ","; - file << kmerTable[kmerId].reverseComplementedKmerId << ","; - file << Kmer(kmerTable[kmerId].reverseComplementedKmerId, k) << "\n"; - } + kmerTable.accessExistingReadOnly(largeDataName("Kmers")); + SHASTA_ASSERT(kmerTable.size() == 1ULL << (2ULL*k)); } // Select marker k-mers randomly, but excluding // the ones that have high frequency in the reads. -void Assembler::selectKmersBasedOnFrequency( +KmerTable1::KmerTable1( - // k-mer length. - size_t k, + uint64_t k, // The desired marker density double markerDensity, @@ -232,15 +160,16 @@ void Assembler::selectKmersBasedOnFrequency( // over what a random distribution would give. double enrichmentThreshold, - size_t threadCount -) -{ + const Reads& reads, - // Sanity check on the value of k, then store it. - if(k > Kmer::capacity) { - throw runtime_error("K-mer capacity exceeded."); - } - assemblerInfo->k = k; + size_t threadCount, + + const MappedMemoryOwner& mappedMemoryOwner) : + + KmerTable(k, true, mappedMemoryOwner), + MultithreadedObject<KmerTable1>(*this), + reads(reads) +{ // Sanity check. if(markerDensity<0. || markerDensity>1.) { @@ -253,13 +182,9 @@ void Assembler::selectKmersBasedOnFrequency( threadCount = std::thread::hardware_concurrency(); } - // Fill in the fields of the k-mer table - // that depends only on k. - initializeKmerTable(); - // Compute the frequency of all k-mers in oriented reads. - setupLoadBalancing(reads->readCount(), 1000); - runThreads(&Assembler::computeKmerFrequency, threadCount); + setupLoadBalancing(reads.readCount(), 1000); + runThreads(&KmerTable1::computeKmerFrequency, threadCount); // Compute the total number of k-mer occurrences in reads // and the number of k-mers that can possibly occur. 
@@ -271,7 +196,7 @@ void Assembler::selectKmersBasedOnFrequency( for(uint64_t kmerId=0; kmerId!=kmerTable.size(); kmerId++) { const KmerInfo& info = kmerTable[kmerId]; totalKmerOccurrences += info.frequency; - if(assemblerInfo->readRepresentation == 0) { + if(reads.representation == 0) { ++possibleKmerCount; } else { if(info.isRleKmer) { @@ -284,7 +209,7 @@ void Assembler::selectKmersBasedOnFrequency( - if(assemblerInfo->readRepresentation == 0) { + if(reads.representation == 0) { // We are using raw read representation. cout << @@ -345,7 +270,7 @@ void Assembler::selectKmersBasedOnFrequency( vector<KmerId> candidateKmers; for(uint64_t kmerId=0; kmerId<kmerTable.size(); kmerId++) { const KmerInfo& info = kmerTable[kmerId]; - if((assemblerInfo->readRepresentation==1) and (not info.isRleKmer)) { + if((reads.representation==1) and (not info.isRleKmer)) { continue; } const uint64_t frequency = info.frequency; @@ -376,6 +301,7 @@ void Assembler::selectKmersBasedOnFrequency( // until we have enough. uint64_t kmerOccurrencesCount = 0; uint64_t kmerCount = 0; + const uint64_t giveUpCount = uint64_t(0.9 * double(candidateKmers.size())); const uint64_t desiredKmerOccurrencesCount = uint64_t(markerDensity * double(totalKmerOccurrences)); while(kmerOccurrencesCount < desiredKmerOccurrencesCount) { @@ -408,6 +334,10 @@ void Assembler::selectKmersBasedOnFrequency( reverseComplementedInfo.isMarker = true; kmerOccurrencesCount += reverseComplementedInfo.frequency; ++kmerCount; + + if(kmerCount >= giveUpCount) { + throw runtime_error("Giving up after selecting as markers 90% of the candidate kmers."); + } } cout << "Selected " << kmerCount << " k-mers as markers." << endl; @@ -415,7 +345,8 @@ void Assembler::selectKmersBasedOnFrequency( } -void Assembler::computeKmerFrequency(size_t threadId) + +void KmerTable1::computeKmerFrequency(size_t threadId) { // Create a frequency vector for this thread. 
MemoryMapped::Vector<uint64_t> frequency; @@ -428,7 +359,6 @@ void Assembler::computeKmerFrequency(size_t threadId) // Loop over all batches assigned to this thread. - const size_t k = assemblerInfo->k; uint64_t begin, end; while(getNextBatch(begin, end)) { @@ -436,7 +366,7 @@ void Assembler::computeKmerFrequency(size_t threadId) for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) { // Access the sequence of this read. - const LongBaseSequenceView read = reads->getRead(readId); + const LongBaseSequenceView read = reads.getRead(readId); // If the read is pathologically short, it has no k-mers. if(read.baseCount < k) { @@ -489,17 +419,18 @@ void Assembler::computeKmerFrequency(size_t threadId) // Read the k-mers from file. -void Assembler::readKmersFromFile(uint64_t k, const string& fileName) +KmerTable3::KmerTable3( + uint64_t k, + uint64_t readRepresentation, + const string& fileName, + const MappedMemoryOwner& mappedMemoryOwner) : + KmerTable(k, true, mappedMemoryOwner) { - // Sanity check on the value of k, then store it. - if(k > Kmer::capacity) { - throw runtime_error("K-mer capacity exceeded."); + if(fileName.empty() or + fileName[0] != '/') { + throw runtime_error("Option --Kmers.file must specify an absolute path. " + "A relative path is not accepted."); } - assemblerInfo->k = k; - - // Fill in the fields of the k-mer table - // that depends only on k. - initializeKmerTable(); // Open the file. 
ifstream file(fileName); @@ -541,7 +472,7 @@ void Assembler::readKmersFromFile(uint64_t k, const string& fileName) const KmerId kmerId = KmerId(kmer.id(k)); SHASTA_ASSERT(kmerId < kmerTable.size()); KmerInfo& kmerInfo = kmerTable[kmerId]; - if((assemblerInfo->readRepresentation==1) and (not kmerInfo.isRleKmer)) { + if((readRepresentation==1) and (not kmerInfo.isRleKmer)) { throw runtime_error("Non-RLE k-mer (duplicate consecutive bases) in " + fileName + ":\n" + line); } @@ -561,7 +492,7 @@ void Assembler::readKmersFromFile(uint64_t k, const string& fileName) if(kmerInfo.isMarker) { ++usedKmerCount; } - if(assemblerInfo->readRepresentation == 0) { + if(readRepresentation == 0) { ++possibleKmerCount; } else { if(kmerInfo.isRleKmer) { @@ -577,31 +508,19 @@ void Assembler::readKmersFromFile(uint64_t k, const string& fileName) // In this version, marker k-mers are selected randomly, but excluding // any k-mer that is over-enriched even in a single oriented read. -void Assembler::selectKmers2( - - // k-mer length. - size_t k, - - // The desired marker density +KmerTable2::KmerTable2( + uint64_t k, double markerDensity, - - // Seed for random number generator. int seed, - - // Exclude k-mers enriched by more than this amount, - // even in a single oriented read. - // Enrichment is the ratio of k-mer frequency in reads - // over what a random distribution would give. double enrichmentThreshold, - - size_t threadCount -) + const Reads& reads, + uint64_t threadCount, + const MappedMemoryOwner& mappedMemoryOwner) : + KmerTable(k, true, mappedMemoryOwner), + MultithreadedObject<KmerTable2>(*this), + reads(reads), + enrichmentThreshold(enrichmentThreshold) { - // Sanity check on the value of k, then store it. - if(k > Kmer::capacity) { - throw runtime_error("K-mer capacity exceeded."); - } - assemblerInfo->k = k; // Sanity check. if(markerDensity<0. || markerDensity>1.) 
{ @@ -614,31 +533,24 @@ void Assembler::selectKmers2( threadCount = std::thread::hardware_concurrency(); } - // Fill in the fields of the k-mer table - // that depends only on k. - initializeKmerTable(); - - // Store the enrichmentThreshold so all threads can see it. - selectKmers2Data.enrichmentThreshold = enrichmentThreshold; - // For each KmerId that is an RLE k-mer, compute the // global frequency (total number of occurrences in all // oriented reads) and the number of reads in // which the k-mer is over-enriched. - selectKmers2Data.globalFrequency.createNew( + globalFrequency.createNew( largeDataName("tmp-SelectKmers2-GlobalFrequency"), largeDataPageSize); - selectKmers2Data.overenrichedReadCount.createNew( + overenrichedReadCount.createNew( largeDataName("tmp-SelectKmers2-OverenrichedReadCount"), largeDataPageSize); - selectKmers2Data.globalFrequency.resize(kmerTable.size()); - selectKmers2Data.overenrichedReadCount.resize(kmerTable.size()); + globalFrequency.resize(kmerTable.size()); + overenrichedReadCount.resize(kmerTable.size()); fill( - selectKmers2Data.globalFrequency.begin(), - selectKmers2Data.globalFrequency.end(), 0); + globalFrequency.begin(), + globalFrequency.end(), 0); fill( - selectKmers2Data.overenrichedReadCount.begin(), - selectKmers2Data.overenrichedReadCount.end(), 0); - setupLoadBalancing(reads->readCount(), 100); - runThreads(&Assembler::selectKmers2ThreadFunction, threadCount); + overenrichedReadCount.begin(), + overenrichedReadCount.end(), 0); + setupLoadBalancing(reads.readCount(), 100); + runThreads(&KmerTable2::threadFunction, threadCount); @@ -647,8 +559,8 @@ void Assembler::selectKmers2( uint64_t totalKmerOccurrences = 0; uint64_t possibleKmerCount = 0; for(uint64_t kmerId=0; kmerId!=kmerTable.size(); kmerId++) { - totalKmerOccurrences += selectKmers2Data.globalFrequency[kmerId]; - if(assemblerInfo->readRepresentation == 0) { + totalKmerOccurrences += globalFrequency[kmerId]; + if(reads.representation == 0) { ++ possibleKmerCount; 
} else { if(kmerTable[kmerId].isRleKmer) { @@ -667,7 +579,7 @@ void Assembler::selectKmers2( "GlobalFrequency,GlobalEnrichment,NumberOfReadsOverenriched\n"; for(uint64_t kmerId=0; kmerId<kmerTable.size(); kmerId++) { const KmerInfo& info = kmerTable[kmerId]; - const uint64_t frequency = selectKmers2Data.globalFrequency[kmerId]; + const uint64_t frequency = globalFrequency[kmerId]; const Kmer kmer(kmerId, k); const Kmer reverseComplementedKmer(info.reverseComplementedKmerId, k); @@ -680,17 +592,20 @@ void Assembler::selectKmers2( csv << frequency << ","; csv << double(frequency) / averageOccurrenceCount; csv << ","; - csv << selectKmers2Data.overenrichedReadCount[kmerId]; + csv << overenrichedReadCount[kmerId]; csv << "\n"; } + csv.close(); + // Gather k-mers that are not overenriched in any read and therefore // can be used as markers. vector<KmerId> candidateKmers; for(uint64_t kmerId=0; kmerId<kmerTable.size(); kmerId++) { - if(kmerTable[kmerId].isRleKmer and selectKmers2Data.overenrichedReadCount[kmerId] == 0) { + const bool readIsUsable = (reads.representation==0) ? true : kmerTable[kmerId].isRleKmer; + if(readIsUsable and overenrichedReadCount[kmerId] == 0) { candidateKmers.push_back(KmerId(kmerId)); } } @@ -705,7 +620,7 @@ void Assembler::selectKmers2( " corresponds to one occurrence every " << double(possibleKmerCount) / enrichmentThreshold << " bases"; - if(assemblerInfo->readRepresentation == 1) { + if(reads.representation == 1) { cout << " (in RLE representation)"; } cout << "." << endl; @@ -728,6 +643,7 @@ void Assembler::selectKmers2( uint64_t kmerCount = 0; const uint64_t desiredKmerOccurrencesCount = uint64_t(markerDensity * double(totalKmerOccurrences)); + const uint64_t giveUpCount = uint64_t(0.9 * double(candidateKmers.size())); while(kmerOccurrencesCount < desiredKmerOccurrencesCount) { // Generate a random index into the candidateKmers vector. @@ -743,7 +659,7 @@ void Assembler::selectKmers2( // This k-mer is not already selected as a marker. 
// Let's add it. info.isMarker = true; - kmerOccurrencesCount += selectKmers2Data.globalFrequency[kmerId]; + kmerOccurrencesCount += globalFrequency[kmerId]; ++kmerCount; // If this k-mer is palindromic, we are done. @@ -756,8 +672,12 @@ void Assembler::selectKmers2( SHASTA_ASSERT(!reverseComplementedInfo.isMarker); SHASTA_ASSERT(reverseComplementedInfo.frequency == info.frequency); reverseComplementedInfo.isMarker = true; - kmerOccurrencesCount += selectKmers2Data.globalFrequency[info.reverseComplementedKmerId]; + kmerOccurrencesCount += globalFrequency[info.reverseComplementedKmerId]; ++kmerCount; + + if(kmerCount >= giveUpCount) { + throw runtime_error("Giving up after selecting as markers 90% of the candidate kmers."); + } } cout << "Selected " << kmerCount << " k-mers as markers." << endl; cout << "These k-mers have a total " << kmerOccurrencesCount << @@ -768,36 +688,33 @@ void Assembler::selectKmers2( -void Assembler::selectKmers2ThreadFunction(size_t threadId) +void KmerTable2::threadFunction(size_t threadId) { // Initialize globalFrequency for this thread. - MemoryMapped::Vector<uint64_t> globalFrequency; - globalFrequency.createNew( - largeDataName("tmp-SelectKmers2-GlobalFrequency-" + to_string(threadId)), + MemoryMapped::Vector<uint64_t> threadGlobalFrequency; + threadGlobalFrequency.createNew( + largeDataName("tmp-KmerTable2-GlobalFrequency-" + to_string(threadId)), largeDataPageSize); - globalFrequency.resize(kmerTable.size()); - fill(globalFrequency.begin(), globalFrequency.end(), 0); + threadGlobalFrequency.resize(kmerTable.size()); + fill(threadGlobalFrequency.begin(), threadGlobalFrequency.end(), 0); // Initialize overenrichedReadCount for this thread. 
- MemoryMapped::Vector<ReadId> overenrichedReadCount; - overenrichedReadCount.createNew( - largeDataName("tmp-SelectKmers2-OverenrichedReadCount-" + to_string(threadId)), + MemoryMapped::Vector<ReadId> threadOverenrichedReadCount; + threadOverenrichedReadCount.createNew( + largeDataName("tmp-KmerTable2-OverenrichedReadCount-" + to_string(threadId)), largeDataPageSize); - overenrichedReadCount.resize(kmerTable.size()); - fill(overenrichedReadCount.begin(), overenrichedReadCount.end(), 0); + threadOverenrichedReadCount.resize(kmerTable.size()); + fill(threadOverenrichedReadCount.begin(), threadOverenrichedReadCount.end(), 0); // Vectors to hold KmerIds and their frequencies for a single read. vector<KmerId> readKmerIds; vector<uint32_t> readKmerIdFrequencies; - // Access the enrichmentThreshold. - const double enrichmentThreshold = selectKmers2Data.enrichmentThreshold; - // Compute the total number of possible k-mers. // It is needed below for overenrichment computations. uint64_t possibleKmerCount = 0; for(const KmerInfo& kmerInfo: kmerTable) { - if(assemblerInfo->readRepresentation == 0) { + if(reads.representation == 0) { ++possibleKmerCount; } else { if(kmerInfo.isRleKmer) { @@ -808,7 +725,6 @@ void Assembler::selectKmers2ThreadFunction(size_t threadId) // Loop over all batches assigned to this thread. - const size_t k = assemblerInfo->k; uint64_t begin, end; while(getNextBatch(begin, end)) { @@ -816,7 +732,7 @@ void Assembler::selectKmers2ThreadFunction(size_t threadId) for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) { // Access the sequence of this read. - const LongBaseSequenceView read = reads->getRead(readId); + const LongBaseSequenceView read = reads.getRead(readId); // If the read is pathologically short, it has no k-mers. if(read.baseCount < k) { @@ -836,10 +752,10 @@ void Assembler::selectKmers2ThreadFunction(size_t threadId) readKmerIds.push_back(kmerId); // Increment its global frequency. 
- ++globalFrequency[kmerId]; + ++threadGlobalFrequency[kmerId]; // Also increment the frequency of the reverse complemented k-mer. - ++globalFrequency[kmerTable[kmerId].reverseComplementedKmerId]; + ++threadGlobalFrequency[kmerTable[kmerId].reverseComplementedKmerId]; // Check if we reached the end of the read. if(position+k == read.baseCount) { @@ -866,30 +782,30 @@ void Assembler::selectKmers2ThreadFunction(size_t threadId) const KmerId kmerId = readKmerIds[i]; const uint32_t frequency = readKmerIdFrequencies[i]; if(frequency > frequencyThreshold) { - ++overenrichedReadCount[kmerId]; - ++overenrichedReadCount[kmerTable[kmerId].reverseComplementedKmerId]; + ++threadOverenrichedReadCount[kmerId]; + ++threadOverenrichedReadCount[kmerTable[kmerId].reverseComplementedKmerId]; } } } } + // Add our globalFrequency and overenrichedReadCount // to the values computer by the other threads. { std::lock_guard<std::mutex> lock(mutex); for(uint64_t kmerId=0; kmerId!=globalFrequency.size(); kmerId++) { - selectKmers2Data.globalFrequency[kmerId] += globalFrequency[kmerId]; - selectKmers2Data.overenrichedReadCount[kmerId] += overenrichedReadCount[kmerId]; + globalFrequency[kmerId] += threadGlobalFrequency[kmerId]; + overenrichedReadCount[kmerId] += threadOverenrichedReadCount[kmerId]; } } - globalFrequency.remove(); - overenrichedReadCount.remove(); + threadGlobalFrequency.remove(); + threadOverenrichedReadCount.remove(); } - // In this version, marker k-mers are selected randomly, but excluding // k-mers that appear repeated at short distances in any oriented read. // More precisely, for each k-mer we compute the minimum distance @@ -897,32 +813,19 @@ void Assembler::selectKmers2ThreadFunction(size_t threadId) // K-mers for which this minimum distance is less than distanceThreshold // are not used as markers. Marker k-mers are selected randomly among the // remaining k-mers, until the desired marker density is achieved. -void Assembler::selectKmers4( - - // k-mer length. 
+KmerTable4::KmerTable4( uint64_t k, - - // The desired marker density double markerDensity, - - // Seed for random number generator. - uint64_t seed, - - // Exclude k-mers that appear in any read in two copies, - // with the two copies closer than this distance (in RLE bases). + int seed, uint64_t distanceThreshold, - - size_t threadCount -) + const Reads& reads, + uint64_t threadCount, + const MappedMemoryOwner& mappedMemoryOwner) : + KmerTable(k, true, mappedMemoryOwner), + MultithreadedObject<KmerTable4>(*this), + reads(reads) { const bool debug = false; - cout << timestamp << "Begin selectKmers4." << endl; - - // Sanity check on the value of k, then store it. - if(k > Kmer::capacity) { - throw runtime_error("K-mer capacity exceeded."); - } - assemblerInfo->k = k; // Sanity check. if(markerDensity<0. || markerDensity>1.) { @@ -935,41 +838,37 @@ void Assembler::selectKmers4( threadCount = std::thread::hardware_concurrency(); } - // Fill in the fields of the k-mer table - // that depends only on k. - initializeKmerTable(); - // Initialize the global frequency of all k-mers. - selectKmers4Data.globalFrequency.createNew( - largeDataName("tmp-SelectKmers4-GlobalFrequency"), largeDataPageSize); - selectKmers4Data.globalFrequency.resize(kmerTable.size()); + globalFrequency.createNew( + largeDataName("tmp-KmerTable44-GlobalFrequency"), largeDataPageSize); + globalFrequency.resize(kmerTable.size()); fill( - selectKmers4Data.globalFrequency.begin(), - selectKmers4Data.globalFrequency.end(), 0); + globalFrequency.begin(), + globalFrequency.end(), 0); // Initialize the minimumDistance vector, which stores // the minimum RLE distance between any two copies of each k-mer // in any oriented read. 
- selectKmers4Data.minimumDistance.createNew( - largeDataName("tmp-selectKmers4-minimumDistance"), largeDataPageSize); + minimumDistance.createNew( + largeDataName("tmp-KmerTable4-minimumDistance"), largeDataPageSize); const uint64_t kmerCount = kmerTable.size(); - selectKmers4Data.minimumDistance.resize(kmerCount); + minimumDistance.resize(kmerCount); for(uint64_t i=0; i<kmerCount; i++) { - selectKmers4Data.minimumDistance[i].second = std::numeric_limits<uint32_t>::max(); + minimumDistance[i].second = std::numeric_limits<uint32_t>::max(); } // Compute the minimumDistance vector. - setupLoadBalancing(reads->readCount(), 100); - runThreads(&Assembler::selectKmers4ThreadFunction, threadCount); + setupLoadBalancing(reads.readCount(), 100); + runThreads(&KmerTable4::threadFunction, threadCount); // Write out what we found. if(debug) { const uint64_t totalFrequency = std::accumulate( - selectKmers4Data.globalFrequency.begin(), - selectKmers4Data.globalFrequency.end(), 0ULL); + globalFrequency.begin(), + globalFrequency.end(), 0ULL); cout << "Total number of k-mer occurrences in all oriented reads is " << totalFrequency << endl; ofstream csv("KmerInfo.csv"); csv << "KmerId,Kmer,KmerIdRc,KmerRc,Frequency,FrequencyRc,TotalFrequency," @@ -980,13 +879,13 @@ void Assembler::selectKmers4( continue; } - const uint64_t frequency = selectKmers4Data.globalFrequency[kmerId]; - const uint64_t frequencyReverseComplement = selectKmers4Data.globalFrequency[info.reverseComplementedKmerId]; + const uint64_t frequency = globalFrequency[kmerId]; + const uint64_t frequencyReverseComplement = globalFrequency[info.reverseComplementedKmerId]; const uint64_t totalFrequency = frequency + frequencyReverseComplement; - const uint32_t minimumDistance = selectKmers4Data.minimumDistance[kmerId].second; - const uint32_t minimumDistanceReverseComplement = - selectKmers4Data.minimumDistance[info.reverseComplementedKmerId].second; + const uint32_t kmerMinimumDistance = minimumDistance[kmerId].second; + 
const uint32_t kmerMinimumDistanceReverseComplement = + minimumDistance[info.reverseComplementedKmerId].second; const Kmer kmer(kmerId, k); const Kmer reverseComplementedKmer(info.reverseComplementedKmerId, k); @@ -999,9 +898,9 @@ void Assembler::selectKmers4( csv << frequency << ","; csv << frequencyReverseComplement << ","; csv << totalFrequency << ","; - csv << minimumDistance << ","; - csv << minimumDistanceReverseComplement << ","; - csv << min(minimumDistance, minimumDistanceReverseComplement) << "\n"; + csv << kmerMinimumDistance << ","; + csv << kmerMinimumDistanceReverseComplement << ","; + csv << min(kmerMinimumDistance, kmerMinimumDistanceReverseComplement) << "\n"; } } @@ -1013,11 +912,11 @@ void Assembler::selectKmers4( uint64_t rleKmerCount = 0; for(uint64_t kmerId=0; kmerId!=kmerTable.size(); kmerId++) { const KmerInfo& info = kmerTable[kmerId]; - if(not info.isRleKmer) { - SHASTA_ASSERT(selectKmers4Data.globalFrequency[kmerId] == 0); + if((reads.representation==1) and (not info.isRleKmer)) { + SHASTA_ASSERT(globalFrequency[kmerId] == 0); continue; } - totalKmerOccurrences += selectKmers4Data.globalFrequency[kmerId]; + totalKmerOccurrences += globalFrequency[kmerId]; if(kmerTable[kmerId].isRleKmer) { ++rleKmerCount; } @@ -1039,7 +938,7 @@ void Assembler::selectKmers4( for(uint64_t kmerId=0; kmerId<kmerTable.size(); kmerId++) { const KmerInfo& info = kmerTable[kmerId]; const KmerId kmerIdRc = info.reverseComplementedKmerId; - if(not info.isRleKmer) { + if((reads.representation==1) and (not info.isRleKmer)) { continue; } if(kmerIdRc == kmerId) { @@ -1050,18 +949,18 @@ void Assembler::selectKmers4( if(kmerId > kmerIdRc) { continue; } - if(selectKmers4Data.minimumDistance[kmerId].second < distanceThreshold) { + if(minimumDistance[kmerId].second < distanceThreshold) { // Too close. skip. continue; } - if(selectKmers4Data.minimumDistance[kmerIdRc].second < distanceThreshold) { + if(minimumDistance[kmerIdRc].second < distanceThreshold) { // Too close. Skip. 
continue; } candidateKmers.push_back(KmerId(kmerId)); - candidateFrequency += selectKmers4Data.globalFrequency[kmerId]; - candidateFrequency += selectKmers4Data.globalFrequency[kmerIdRc]; + candidateFrequency += globalFrequency[kmerId]; + candidateFrequency += globalFrequency[kmerIdRc]; } cout << "Markers will be chosen randomly from the a pool of " << 2*candidateKmers.size() << " RLE k-mers." << endl; @@ -1105,8 +1004,8 @@ void Assembler::selectKmers4( // Increment counters. markerCount += 2; - markerOccurrencesCount += selectKmers4Data.globalFrequency[kmerId]; - markerOccurrencesCount += selectKmers4Data.globalFrequency[kmerIdRc]; + markerOccurrencesCount += globalFrequency[kmerId]; + markerOccurrencesCount += globalFrequency[kmerIdRc]; // Remove kmerId from the vector of candidates. if(i != candidateKmers.size()-1) { @@ -1124,28 +1023,23 @@ void Assembler::selectKmers4( // Clean up. - selectKmers4Data.minimumDistance.remove(); - selectKmers4Data.globalFrequency.remove(); + minimumDistance.remove(); + globalFrequency.remove(); - // Done. - cout << timestamp << "End selectKmers4." << endl; } -void Assembler::selectKmers4ThreadFunction(size_t threadId) +void KmerTable4::threadFunction(size_t threadId) { - // K-mer length. - const size_t k = assemblerInfo->k; - // Initialize globalFrequency for this thread. // Having all threads accumulate atomically on the global frequency vector is too slow. 
- MemoryMapped::Vector<uint64_t> globalFrequency; - globalFrequency.createNew( - largeDataName("tmp-SelectKmers4-GlobalFrequency-" + to_string(threadId)), + MemoryMapped::Vector<uint64_t> threadGlobalFrequency; + threadGlobalFrequency.createNew( + largeDataName("tmp-KmerTable4-GlobalFrequency-" + to_string(threadId)), largeDataPageSize); - globalFrequency.resize(kmerTable.size()); - fill(globalFrequency.begin(), globalFrequency.end(), 0); + threadGlobalFrequency.resize(kmerTable.size()); + fill(threadGlobalFrequency.begin(), threadGlobalFrequency.end(), 0); // Vector to hold pairs(KmerId, RLE position) for one read. vector< pair<KmerId, uint32_t> > readKmers; @@ -1158,7 +1052,7 @@ void Assembler::selectKmers4ThreadFunction(size_t threadId) for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) { // Access the sequence of this read. - const LongBaseSequenceView read = reads->getRead(readId); + const LongBaseSequenceView read = reads.getRead(readId); // If the read is pathologically short, it has no k-mers. if(read.baseCount < k) { @@ -1178,8 +1072,8 @@ void Assembler::selectKmers4ThreadFunction(size_t threadId) readKmers.push_back(make_pair(kmerId, position)); // Update the frequency of this k-mer. - ++globalFrequency[kmerId]; - ++globalFrequency[kmerTable[kmerId].reverseComplementedKmerId]; + ++threadGlobalFrequency[kmerId]; + ++threadGlobalFrequency[kmerTable[kmerId].reverseComplementedKmerId]; // Check if we reached the end of the read. 
if(position+k == read.baseCount) { @@ -1205,7 +1099,7 @@ void Assembler::selectKmers4ThreadFunction(size_t threadId) } const uint32_t distance = p1.second - p0.second; - pair<std::mutex, uint32_t>& p = selectKmers4Data.minimumDistance[kmerId0]; + pair<std::mutex, uint32_t>& p = minimumDistance[kmerId0]; std::lock_guard<std::mutex> lock(p.first);; p.second = min(p.second, distance); } @@ -1216,9 +1110,63 @@ void Assembler::selectKmers4ThreadFunction(size_t threadId) { std::lock_guard<std::mutex> lock(mutex); for(uint64_t kmerId=0; kmerId!=globalFrequency.size(); kmerId++) { - selectKmers4Data.globalFrequency[kmerId] += globalFrequency[kmerId]; + globalFrequency[kmerId] += threadGlobalFrequency[kmerId]; } } - globalFrequency.remove(); + threadGlobalFrequency.remove(); +} + + + +KmerTable0::KmerTable0( + uint64_t k, + const MappedMemoryOwner& mappedMemoryOwner) : + KmerTable(k, false, mappedMemoryOwner) +{ +} + + + +KmerTable1::KmerTable1( + uint64_t k, + const Reads& reads, + const MappedMemoryOwner& mappedMemoryOwner) : + KmerTable(k, false, mappedMemoryOwner), + MultithreadedObject<KmerTable1>(*this), + reads(reads) +{ +} + + + +KmerTable2::KmerTable2( + uint64_t k, + const Reads& reads, + const MappedMemoryOwner& mappedMemoryOwner) : + KmerTable(k, false, mappedMemoryOwner), + MultithreadedObject<KmerTable2>(*this), + reads(reads) +{ +} + + + +KmerTable3::KmerTable3( + uint64_t k, + const MappedMemoryOwner& mappedMemoryOwner) : + KmerTable(k, false, mappedMemoryOwner) +{ +} + + + +KmerTable4::KmerTable4( + uint64_t k, + const Reads& reads, + const MappedMemoryOwner& mappedMemoryOwner) : + KmerTable(k, false, mappedMemoryOwner), + MultithreadedObject<KmerTable4>(*this), + reads(reads) +{ } diff --git a/src/KmerTable.hpp b/src/KmerTable.hpp new file mode 100644 index 0000000..2b134c1 --- /dev/null +++ b/src/KmerTable.hpp @@ -0,0 +1,215 @@ +#ifndef SHASTA_KMER_TABLE_HPP +#define SHASTA_KMER_TABLE_HPP + +// Shasta. 
+#include "KmerChecker.hpp" +#include "MappedMemoryOwner.hpp" +#include "MemoryMappedVector.hpp" +#include "MultithreadedObject.hpp" + +// Standard library. +#include "utility.hpp" + +namespace shasta { + + class KmerTable; + class KmerTable0; + class KmerTable1; + class KmerTable2; + class KmerTable3; + class KmerTable4; + + class KmersOptions; + class Reads; +} + + + +// Old implementations of KmerChecker are table based. +// There are derived classes to support all 5 marker generation methods +// but they are limited to k-mer lengths k<16. +class shasta::KmerTable : + public KmerChecker, + public MappedMemoryOwner { +public: + + bool isMarker(KmerId kmerId) const + { + return kmerTable[kmerId].isMarker; + } + + KmerTable(uint64_t k, bool createNew, const MappedMemoryOwner&); + +protected: + + class KmerInfo { + public: + + // Frequency of this k-mer in input reads. + // This is only used in some of the derived classes and + // so there is opportunity for some cleanup here. + uint64_t frequency = 0; + + KmerId16 reverseComplementedKmerId; + bool isMarker; + bool isRleKmer; + }; + + uint64_t k; + MemoryMapped::Vector<KmerInfo> kmerTable; + +private: + void createKmerTable(); + void accessKmerTable(); + +}; + + + +// Marker k-mer generation method 0 (used when --Kmers.generationMethod 0). +class shasta::KmerTable0 : public KmerTable { +public: + + // Construct from scratch. + KmerTable0( + uint64_t k, + double probability, + int seed, + const MappedMemoryOwner&); + + // Construct from binary data. + KmerTable0( + uint64_t k, + const MappedMemoryOwner&); + +}; + + + +// Marker k-mer generation method 1 (used when --Kmers.generationMethod 1). +class shasta::KmerTable1 : + public KmerTable, + public MultithreadedObject<KmerTable1> { +public: + + // Construct from scratch. + KmerTable1( + uint64_t k, + double probability, + int seed, + double enrichmentThreshold, + const Reads&, + uint64_t threadCount, + const MappedMemoryOwner&); + + // Construct from binary data. 
+ KmerTable1( + uint64_t k, + const Reads&, + const MappedMemoryOwner&); + +private: + const Reads& reads; + void computeKmerFrequency(size_t threadId); +}; + + + +// Marker k-mer generation method 2 (used when --Kmers.generationMethod 2). +class shasta::KmerTable2 : + public KmerTable, + public MultithreadedObject<KmerTable2> { +public: + + // Construct from scratch. + KmerTable2( + uint64_t k, + double probability, + int seed, + double enrichmentThreshold, + const Reads&, + uint64_t threadCount, + const MappedMemoryOwner&); + + // Construct from binary data. + KmerTable2( + uint64_t k, + const Reads&, + const MappedMemoryOwner&); + +private: + const Reads& reads; + double enrichmentThreshold; + + // The number of times each k-mer appears in an oriented read. + // Indexed by KmerId. + MemoryMapped::Vector<uint64_t> globalFrequency; + + // The number of oriented reads that each k-mer is + // over-enriched in by more than a factor enrichmentThreshold. + // Indexed by KmerId. + MemoryMapped::Vector<ReadId> overenrichedReadCount; + + void threadFunction(size_t threadId); +}; + + + +// Marker k-mer generation method 3 (used when --Kmers.generationMethod 3). +class shasta::KmerTable3: public KmerTable { +public: + + // Construct from scratch. + KmerTable3( + uint64_t k, + uint64_t readRepresentation, + const string& fileName, + const MappedMemoryOwner&); + + // Construct from binary data. + KmerTable3( + uint64_t k, + const MappedMemoryOwner&); + +}; + + + +// Marker k-mer generation method 4 (used when --Kmers.generationMethod 4). +class shasta::KmerTable4 : + public KmerTable, + public MultithreadedObject<KmerTable4> { +public: + + // Construct from scratch. + KmerTable4( + uint64_t k, + double probability, + int seed, + uint64_t distanceThreshold, + const Reads&, + uint64_t threadCount, + const MappedMemoryOwner&); + + // Construct from binary data. 
+ KmerTable4( + uint64_t k, + const Reads&, + const MappedMemoryOwner&); + +public: + const Reads& reads; + + void threadFunction(size_t threadId); + + // The number of times each k-mer appears in an oriented read. + // Indexed by KmerId. + MemoryMapped::Vector<uint64_t> globalFrequency; + + // The minimum distance at which two copies of each k-mer + // appear in any oriented read. + // Indexed by KmerId. + MemoryMapped::Vector< pair<std::mutex, uint32_t> > minimumDistance; +}; + +#endif diff --git a/src/LocalAssemblyGraph.cpp b/src/LocalAssemblyGraph.cpp index 573b4f7..95dc083 100644 --- a/src/LocalAssemblyGraph.cpp +++ b/src/LocalAssemblyGraph.cpp @@ -2,6 +2,7 @@ #include "LocalAssemblyGraph.hpp" #include "approximateTopologicalSort.hpp" using namespace shasta; +using namespace mode0; // Boost libraries. #include <boost/graph/graphviz.hpp> @@ -214,7 +215,7 @@ void LocalAssemblyGraph::Writer::operator()(std::ostream& s, vertex_descriptor v "\""; // URL. - s << " URL=\"exploreMarkerGraph?" + s << " URL=\"exploreMarkerGraph0?" "?vertexId=" << vertex.markerGraphVertexId << "&maxDistance=10" "&timeout=30" diff --git a/src/LocalAssemblyGraph.hpp b/src/LocalAssemblyGraph.hpp index ab0be79..cefd969 100644 --- a/src/LocalAssemblyGraph.hpp +++ b/src/LocalAssemblyGraph.hpp @@ -4,7 +4,7 @@ /******************************************************************************* -The local marker graph created by class LocalMarkerGraph is a subgraph +The local marker graph created by class LocalAssemblyGraph is a subgraph of the global assembly graph, created by starting at a given vertex, and extending out to a specified distance in both directions. Distance is number of edges on the global assembly graph. @@ -40,7 +40,7 @@ public: // The vertex id of the vertex of the global assembly // graph that corresponds to this vertex. 
- AssemblyGraph::VertexId assemblyGraphVertexId; + mode0::AssemblyGraph::VertexId assemblyGraphVertexId; // The vertex id of the vertex of the global marker // graph that corresponds to this vertex. @@ -54,7 +54,7 @@ public: size_t rank = 0; LocalAssemblyGraphVertex( - AssemblyGraph::VertexId assemblyGraphVertexId, + mode0::AssemblyGraph::VertexId assemblyGraphVertexId, MarkerGraph::VertexId markerGraphVertexId, int distance) : assemblyGraphVertexId(assemblyGraphVertexId), @@ -70,7 +70,7 @@ class shasta::LocalAssemblyGraphEdge { public: // The global edge id of the edge of the global assembly // graph that corresponds to this edge. - AssemblyGraph::EdgeId edgeId; + mode0::AssemblyGraph::EdgeId edgeId; // Field used by approximateTopologicalSort. bool isDagEdge = true; @@ -87,11 +87,11 @@ class shasta::LocalAssemblyGraph : public: LocalAssemblyGraph( - AssemblyGraph& + mode0::AssemblyGraph& ); - using VertexId = AssemblyGraph::VertexId; - using EdgeId = AssemblyGraph::EdgeId; + using VertexId = mode0::AssemblyGraph::VertexId; + using EdgeId = mode0::AssemblyGraph::EdgeId; // Add a vertex with the given vertex ids // and return its vertex descriptor. @@ -136,7 +136,7 @@ private: std::map<VertexId, vertex_descriptor> vertexMap; // Reference to the global assembly graph. - AssemblyGraph& globalAssemblyGraph; + mode0::AssemblyGraph& globalAssemblyGraph; // Writer class used for Graphviz output. class Writer { diff --git a/src/LocalMarkerGraph-Write.cpp b/src/LocalMarkerGraph0-Write.cpp index 689f1a2..1bbce65 100644 --- a/src/LocalMarkerGraph-Write.cpp +++ b/src/LocalMarkerGraph0-Write.cpp @@ -1,10 +1,11 @@ // Shasta. -#include "LocalMarkerGraph.hpp" +#include "LocalMarkerGraph0.hpp" #include "ConsensusCaller.hpp" #include "Marker.hpp" #include "MemoryMappedVectorOfVectors.hpp" #include "orderPairs.hpp" using namespace shasta; +using namespace mode0; // Boost libraries. 
#include <boost/graph/graphviz.hpp> @@ -15,9 +16,9 @@ using namespace shasta; // Write the graph in Graphviz format. -void LocalMarkerGraph::write( +void LocalMarkerGraph0::write( const string& fileName, - const LocalMarkerGraphRequestParameters& localMarkerGraphRequestParameters) const + const LocalMarkerGraph0RequestParameters& localMarkerGraphRequestParameters) const { ofstream outputFileStream(fileName); if(!outputFileStream) { @@ -25,19 +26,19 @@ void LocalMarkerGraph::write( } write(outputFileStream, localMarkerGraphRequestParameters); } -void LocalMarkerGraph::write( +void LocalMarkerGraph0::write( ostream& s, - const LocalMarkerGraphRequestParameters& localMarkerGraphRequestParameters) const + const LocalMarkerGraph0RequestParameters& localMarkerGraphRequestParameters) const { Writer writer(*this, localMarkerGraphRequestParameters); boost::write_graphviz(s, *this, writer, writer, writer, - boost::get(&LocalMarkerGraphVertex::vertexId, *this)); + boost::get(&LocalMarkerGraph0Vertex::vertexId, *this)); } -LocalMarkerGraph::Writer::Writer( - const LocalMarkerGraph& graph, - const LocalMarkerGraphRequestParameters& parameters) : - LocalMarkerGraphRequestParameters(parameters), +LocalMarkerGraph0::Writer::Writer( + const LocalMarkerGraph0& graph, + const LocalMarkerGraph0RequestParameters& parameters) : + LocalMarkerGraph0RequestParameters(parameters), graph(graph) { } @@ -45,26 +46,26 @@ LocalMarkerGraph::Writer::Writer( // Vertex and edge colors. 
-const string LocalMarkerGraph::Writer::vertexColorZeroDistance = "#6666ff"; -const string LocalMarkerGraph::Writer::vertexColorIntermediateDistance = "#00ccff"; -const string LocalMarkerGraph::Writer::vertexColorMaxDistance = "#66ffff"; -const string LocalMarkerGraph::Writer::edgeArrowColorRemovedDuringTransitiveReduction = "#ff0000"; -const string LocalMarkerGraph::Writer::edgeArrowColorRemovedDuringPruning = "#ff00ff"; -const string LocalMarkerGraph::Writer::edgeArrowColorRemovedDuringSuperBubbleRemoval = "#009900"; -const string LocalMarkerGraph::Writer::edgeArrowColorRemovedAsLowCoverageCrossEdge = "#c0c000"; -const string LocalMarkerGraph::Writer::edgeArrowColorRemovedWhileSplittingSecondaryEdges = "#ff0000"; -const string LocalMarkerGraph::Writer::edgeArrowColorNotRemovedNotAssembled = "#fcba03"; -const string LocalMarkerGraph::Writer::edgeArrowColorNotRemovedAssembled = "#000000"; -const string LocalMarkerGraph::Writer::edgeLabelColorRemovedDuringTransitiveReduction = "#ff9999"; -const string LocalMarkerGraph::Writer::edgeLabelColorRemovedDuringPruning = "#c03280"; -const string LocalMarkerGraph::Writer::edgeLabelColorRemovedDuringSuperBubbleRemoval = "#99ff99"; -const string LocalMarkerGraph::Writer::edgeLabelColorRemovedAsLowCoverageCrossEdge = "#e0e000"; -const string LocalMarkerGraph::Writer::edgeLabelColorNotRemovedNotAssembled = "#996600"; -const string LocalMarkerGraph::Writer::edgeLabelColorNotRemovedAssembled = "#999999"; - - - -string LocalMarkerGraph::Writer::vertexColor(const LocalMarkerGraphVertex& vertex) const +const string LocalMarkerGraph0::Writer::vertexColorZeroDistance = "#6666ff"; +const string LocalMarkerGraph0::Writer::vertexColorIntermediateDistance = "#00ccff"; +const string LocalMarkerGraph0::Writer::vertexColorMaxDistance = "#66ffff"; +const string LocalMarkerGraph0::Writer::edgeArrowColorRemovedDuringTransitiveReduction = "#ff0000"; +const string LocalMarkerGraph0::Writer::edgeArrowColorRemovedDuringPruning = "#ff00ff"; +const 
string LocalMarkerGraph0::Writer::edgeArrowColorRemovedDuringSuperBubbleRemoval = "#009900"; +const string LocalMarkerGraph0::Writer::edgeArrowColorRemovedAsLowCoverageCrossEdge = "#c0c000"; +const string LocalMarkerGraph0::Writer::edgeArrowColorRemovedWhileSplittingSecondaryEdges = "#ff0000"; +const string LocalMarkerGraph0::Writer::edgeArrowColorNotRemovedNotAssembled = "#fcba03"; +const string LocalMarkerGraph0::Writer::edgeArrowColorNotRemovedAssembled = "#000000"; +const string LocalMarkerGraph0::Writer::edgeLabelColorRemovedDuringTransitiveReduction = "#ff9999"; +const string LocalMarkerGraph0::Writer::edgeLabelColorRemovedDuringPruning = "#c03280"; +const string LocalMarkerGraph0::Writer::edgeLabelColorRemovedDuringSuperBubbleRemoval = "#99ff99"; +const string LocalMarkerGraph0::Writer::edgeLabelColorRemovedAsLowCoverageCrossEdge = "#e0e000"; +const string LocalMarkerGraph0::Writer::edgeLabelColorNotRemovedNotAssembled = "#996600"; +const string LocalMarkerGraph0::Writer::edgeLabelColorNotRemovedAssembled = "#999999"; + + + +string LocalMarkerGraph0::Writer::vertexColor(const LocalMarkerGraph0Vertex& vertex) const { if(vertexColoring == "none") { return "black"; @@ -103,7 +104,7 @@ string LocalMarkerGraph::Writer::vertexColor(const LocalMarkerGraphVertex& verte -string LocalMarkerGraph::Writer::edgeArrowColor(const LocalMarkerGraphEdge& edge) const +string LocalMarkerGraph0::Writer::edgeArrowColor(const LocalMarkerGraph0Edge& edge) const { if(edgeColoring == "none") { @@ -164,7 +165,7 @@ string LocalMarkerGraph::Writer::edgeArrowColor(const LocalMarkerGraphEdge& edge -string LocalMarkerGraph::Writer::edgeLabelColor(const LocalMarkerGraphEdge& edge) const +string LocalMarkerGraph0::Writer::edgeLabelColor(const LocalMarkerGraph0Edge& edge) const { if(edgeColoring == "none") { return "white"; @@ -211,7 +212,7 @@ string LocalMarkerGraph::Writer::edgeLabelColor(const LocalMarkerGraphEdge& edge -void LocalMarkerGraph::writeColorLegendVerticesByDistance(ostream& 
html) +void LocalMarkerGraph0::writeColorLegendVerticesByDistance(ostream& html) { html << "<table>" @@ -226,7 +227,7 @@ void LocalMarkerGraph::writeColorLegendVerticesByDistance(ostream& html) -void LocalMarkerGraph::writeColorLegendEdgeArrowsByFlags(ostream& html) +void LocalMarkerGraph0::writeColorLegendEdgeArrowsByFlags(ostream& html) { if(assemblyMode == 2) { html << @@ -261,7 +262,7 @@ void LocalMarkerGraph::writeColorLegendEdgeArrowsByFlags(ostream& html) -void LocalMarkerGraph::writeColorLegendEdgeLabelsByFlags(ostream& html) +void LocalMarkerGraph0::writeColorLegendEdgeLabelsByFlags(ostream& html) { html << "<table>" @@ -283,7 +284,7 @@ void LocalMarkerGraph::writeColorLegendEdgeLabelsByFlags(ostream& html) -void LocalMarkerGraph::Writer::operator()(std::ostream& s) const +void LocalMarkerGraph0::Writer::operator()(std::ostream& s) const { // This turns off the tooltip on the graph and the edges. s << "tooltip = \" \";\n"; @@ -316,9 +317,9 @@ void LocalMarkerGraph::Writer::operator()(std::ostream& s) const -void LocalMarkerGraph::Writer::operator()(std::ostream& s, vertex_descriptor v) const +void LocalMarkerGraph0::Writer::operator()(std::ostream& s, vertex_descriptor v) const { - const LocalMarkerGraphVertex& vertex = graph[v]; + const LocalMarkerGraph0Vertex& vertex = graph[v]; const auto coverage = vertex.markerInfos.size(); const string color = vertexColor(vertex); SHASTA_ASSERT(coverage > 0); @@ -464,10 +465,10 @@ void LocalMarkerGraph::Writer::operator()(std::ostream& s, vertex_descriptor v) -void LocalMarkerGraph::Writer::operator()(std::ostream& s, edge_descriptor e) const +void LocalMarkerGraph0::Writer::operator()(std::ostream& s, edge_descriptor e) const { - const LocalMarkerGraphEdge& edge = graph[e]; + const LocalMarkerGraph0Edge& edge = graph[e]; const size_t coverage = edge.coverage(); const string arrowColor = edgeArrowColor(edge); const string labelColor = edgeLabelColor(edge); @@ -607,7 +608,7 @@ void 
LocalMarkerGraph::Writer::operator()(std::ostream& s, edge_descriptor e) co // Verbose labels include the detail of all oriented read ids on this edge. if(edgeLabels == 2) { - vector< pair<OrientedReadId, LocalMarkerGraphEdge::Sequence> > table; + vector< pair<OrientedReadId, LocalMarkerGraph0Edge::Sequence> > table; for(const auto& info: edge.infos) { const auto& sequence = info.first; const auto& intervals = info.second; @@ -616,7 +617,7 @@ void LocalMarkerGraph::Writer::operator()(std::ostream& s, edge_descriptor e) co } } sort(table.begin(), table.end(), - OrderPairsByFirstOnly<OrientedReadId, LocalMarkerGraphEdge::Sequence>()); + OrderPairsByFirstOnly<OrientedReadId, LocalMarkerGraph0Edge::Sequence>()); s << "<hr/>"; for(const auto& p: table) { diff --git a/src/LocalMarkerGraph.cpp b/src/LocalMarkerGraph0.cpp index b9c9262..a7ac4b2 100644 --- a/src/LocalMarkerGraph.cpp +++ b/src/LocalMarkerGraph0.cpp @@ -1,5 +1,5 @@ // Shasta. -#include "LocalMarkerGraph.hpp" +#include "LocalMarkerGraph0.hpp" #include "approximateTopologicalSort.hpp" #include "findMarkerId.hpp" #include "orderPairs.hpp" @@ -10,12 +10,13 @@ using namespace shasta; -LocalMarkerGraph::LocalMarkerGraph( +LocalMarkerGraph0::LocalMarkerGraph0( uint64_t readRepresentation, uint32_t k, uint64_t assemblyMode, const Reads& reads, const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, + const MarkerGraph& markerGraph, const MemoryMapped::Vector<MarkerGraph::CompressedVertexId>& globalMarkerGraphVertex, const ConsensusCaller& consensusCaller ) : @@ -24,6 +25,7 @@ LocalMarkerGraph::LocalMarkerGraph( assemblyMode(assemblyMode), reads(reads), markers(markers), + markerGraph(markerGraph), globalMarkerGraphVertex(globalMarkerGraphVertex), consensusCaller(consensusCaller) { @@ -34,8 +36,8 @@ LocalMarkerGraph::LocalMarkerGraph( // Find out if a vertex with the given MarkerGraph::VertexId exists. // If it exists, return make_pair(true, v). 
// Otherwise, return make_pair(false, null_vertex()); -std::pair<bool, LocalMarkerGraph::vertex_descriptor> - LocalMarkerGraph::findVertex(MarkerGraph::VertexId vertexId) const +std::pair<bool, LocalMarkerGraph0::vertex_descriptor> + LocalMarkerGraph0::findVertex(MarkerGraph::VertexId vertexId) const { const auto it = vertexMap.find(vertexId); if(it == vertexMap.end()) { @@ -50,8 +52,8 @@ std::pair<bool, LocalMarkerGraph::vertex_descriptor> // Add a vertex with the given MarkerGraph::VertexId // and return its vertex descriptor. // A vertex with this MarkerGraph::VertexId must not exist. -LocalMarkerGraph::vertex_descriptor - LocalMarkerGraph::addVertex( +LocalMarkerGraph0::vertex_descriptor + LocalMarkerGraph0::addVertex( MarkerGraph::VertexId vertexId, uint64_t distance, span<MarkerId> vertexMarkers) @@ -60,14 +62,14 @@ LocalMarkerGraph::vertex_descriptor SHASTA_ASSERT(vertexMap.find(vertexId) == vertexMap.end()); // Add the vertex and store it in the vertex map. - const vertex_descriptor v = add_vertex(LocalMarkerGraphVertex(vertexId, distance), *this); + const vertex_descriptor v = add_vertex(LocalMarkerGraph0Vertex(vertexId, distance), *this); vertexMap.insert(make_pair(vertexId, v)); // Fill in the marker information for this vertex. - LocalMarkerGraphVertex& vertex = (*this)[v]; + LocalMarkerGraph0Vertex& vertex = (*this)[v]; vertex.markerInfos.reserve(vertexMarkers.size()); for(const MarkerId markerId: vertexMarkers) { - LocalMarkerGraphVertex::MarkerInfo markerInfo; + LocalMarkerGraph0Vertex::MarkerInfo markerInfo; markerInfo.markerId = markerId; tie(markerInfo.orientedReadId, markerInfo.ordinal) = findMarkerId(markerId, markers); @@ -80,29 +82,17 @@ LocalMarkerGraph::vertex_descriptor // Get the KmerId for a vertex. 
-KmerId LocalMarkerGraph::getKmerId(vertex_descriptor v) const +KmerId LocalMarkerGraph0::getKmerId(vertex_descriptor v) const { - const LocalMarkerGraphVertex& vertex = (*this)[v]; - SHASTA_ASSERT(!vertex.markerInfos.empty()); - const MarkerId firstMarkerId = vertex.markerInfos.front().markerId; - const CompressedMarker& firstMarker = markers.begin()[firstMarkerId]; - const KmerId kmerId = firstMarker.kmerId; - - // Sanity check that all markers have the same kmerId. - // At some point this can be removed. - for(const auto& markerInfo: vertex.markerInfos){ - const CompressedMarker& marker = markers.begin()[markerInfo.markerId]; - SHASTA_ASSERT(marker.kmerId == kmerId); - } - - return kmerId; + const LocalMarkerGraph0Vertex& vertex = (*this)[v]; + return markerGraph.getVertexKmerId(vertex.vertexId, k, reads, markers); } // Get the repeat counts for a MarkerInfo of a vertex. -vector<uint8_t> LocalMarkerGraph::getRepeatCounts( - const LocalMarkerGraphVertex::MarkerInfo& markerInfo) const +vector<uint8_t> LocalMarkerGraph0::getRepeatCounts( + const LocalMarkerGraph0Vertex::MarkerInfo& markerInfo) const { if(readRepresentation == 1) { @@ -132,20 +122,20 @@ vector<uint8_t> LocalMarkerGraph::getRepeatCounts( // Fill in the ConsensusInfo's for each vertex. -void LocalMarkerGraph::computeVertexConsensusInfo() +void LocalMarkerGraph0::computeVertexConsensusInfo() { - LocalMarkerGraph& graph = *this; - BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph) { + LocalMarkerGraph0& graph = *this; + BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph0) { computeVertexConsensusInfo(v); } } -void LocalMarkerGraph::computeVertexConsensusInfo( vertex_descriptor v) +void LocalMarkerGraph0::computeVertexConsensusInfo( vertex_descriptor v) { // Short-hands for the graph and the vertex. - LocalMarkerGraph& graph = *this; - LocalMarkerGraphVertex& vertex = graph[v]; + LocalMarkerGraph0& graph = *this; + LocalMarkerGraph0Vertex& vertex = graph[v]; // Get the marker k-mer of this vertex. 
const KmerId kmerId = graph.getKmerId(v); @@ -174,22 +164,22 @@ void LocalMarkerGraph::computeVertexConsensusInfo( vertex_descriptor v) // Store sequence information in the edge. // This version takes as input a vector of the -// LocalMarkerGraphEdge::Info that caused the edge to be created. -void LocalMarkerGraph::storeEdgeInfo( +// LocalMarkerGraph0Edge::Info that caused the edge to be created. +void LocalMarkerGraph0::storeEdgeInfo( edge_descriptor e, const vector<MarkerInterval>& intervals) { - LocalMarkerGraph& graph = *this; - LocalMarkerGraphEdge& edge = graph[e]; + LocalMarkerGraph0& graph = *this; + LocalMarkerGraph0Edge& edge = graph[e]; // Map to store the oriented read ids and ordinals, grouped by sequence. - std::map<LocalMarkerGraphEdge::Sequence, vector<MarkerIntervalWithRepeatCounts> > sequenceTable; + std::map<LocalMarkerGraph0Edge::Sequence, vector<MarkerIntervalWithRepeatCounts> > sequenceTable; for(const MarkerInterval& interval: intervals) { const CompressedMarker& marker0 = markers.begin(interval.orientedReadId.getValue())[interval.ordinals[0]]; const CompressedMarker& marker1 = markers.begin(interval.orientedReadId.getValue())[interval.ordinals[1]]; // Fill in the sequence information and, if necessary, the base repeat counts. - LocalMarkerGraphEdge::Sequence sequence; + LocalMarkerGraph0Edge::Sequence sequence; MarkerIntervalWithRepeatCounts intervalWithRepeatCounts(interval); if(marker1.position <= marker0.position + k) { @@ -265,7 +255,7 @@ void LocalMarkerGraph::storeEdgeInfo( // Sort by decreasing size of the infos vector. sort(edge.infos.begin(), edge.infos.end(), OrderPairsBySizeOfSecondGreater< - LocalMarkerGraphEdge::Sequence, + LocalMarkerGraph0Edge::Sequence, vector<MarkerIntervalWithRepeatCounts> >()); } @@ -276,7 +266,7 @@ void LocalMarkerGraph::storeEdgeInfo( // If found, returns pair(true, ordinal). // Otherwise, returns pair(false, don't care). // If more than an ordinal is found, the first one is returned. 
-pair<bool, uint32_t> LocalMarkerGraphVertex::getOrdinal( +pair<bool, uint32_t> LocalMarkerGraph0Vertex::getOrdinal( OrientedReadId orientedReadId) const { for(const MarkerInfo& markerInfo: markerInfos) { @@ -292,7 +282,7 @@ pair<bool, uint32_t> LocalMarkerGraphVertex::getOrdinal( // Look for the ordinals for a given oriented read id. // If found, returns true. // If more than an ordinal pairs is found, the first one is returned. -bool LocalMarkerGraphEdge::getOrdinals( +bool LocalMarkerGraph0Edge::getOrdinals( OrientedReadId orientedReadId, array<uint32_t, 2>& ordinals) const { @@ -314,12 +304,12 @@ bool LocalMarkerGraphEdge::getOrdinals( // Approximate topological sort, adding edges // in order of decreasing coverage. The topological sort // stored in LocalMarkerGrapg2Vertex::rank. -void LocalMarkerGraph::approximateTopologicalSort() +void LocalMarkerGraph0::approximateTopologicalSort() { - LocalMarkerGraph& graph = *this; + LocalMarkerGraph0& graph = *this; vector<pair<uint32_t, edge_descriptor> > edgeTable; - BGL_FORALL_EDGES(e, graph, LocalMarkerGraph) { + BGL_FORALL_EDGES(e, graph, LocalMarkerGraph0) { edgeTable.push_back(make_pair(graph[e].coverage(), e)); } sort(edgeTable.begin(), edgeTable.end(), @@ -335,7 +325,7 @@ void LocalMarkerGraph::approximateTopologicalSort() // Also store the vertices in topological sort order. 
vector< pair<size_t, vertex_descriptor> > vertexTable; - BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph) { + BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph0) { vertexTable.push_back(make_pair(graph[v].rank, v)); } sort(vertexTable.begin(), vertexTable.end()); diff --git a/src/LocalMarkerGraph.hpp b/src/LocalMarkerGraph0.hpp index f3b055b..fd38654 100644 --- a/src/LocalMarkerGraph.hpp +++ b/src/LocalMarkerGraph0.hpp @@ -1,9 +1,9 @@ -#ifndef SHASTA_LOCAL_MARKER_GRAPH_HPP -#define SHASTA_LOCAL_MARKER_GRAPH_HPP +#ifndef SHASTA_LOCAL_MARKER_GRAPH0_HPP +#define SHASTA_LOCAL_MARKER_GRAPH0_HPP /******************************************************************************* -The local marker graph created by class LocalMarkerGraph is a subgraph +The local marker graph created by class LocalMarkerGraph0 is a subgraph of the global marker graph, created by starting at a given vertex, and extending out to a specified distance in both directions. Distance is number of edges on the global marker graph. @@ -17,7 +17,7 @@ a group of aligned markers. #include "AssemblyGraph.hpp" #include "Coverage.hpp" #include "Kmer.hpp" -#include "LocalMarkerGraphRequestParameters.hpp" +#include "LocalMarkerGraph0RequestParameters.hpp" #include "MarkerGraph.hpp" #include "Reads.hpp" @@ -26,26 +26,25 @@ a group of aligned markers. 
namespace shasta { - class LocalMarkerGraphVertex; - class LocalMarkerGraphEdge; - class LocalMarkerGraph; - using LocalMarkerGraphBaseClass = boost::adjacency_list< + class LocalMarkerGraph0Vertex; + class LocalMarkerGraph0Edge; + class LocalMarkerGraph0; + using LocalMarkerGraph0BaseClass = boost::adjacency_list< boost::listS, // Permit parallel edges created by createMarkerGraphEdgesStrict boost::listS, boost::bidirectionalS, - LocalMarkerGraphVertex, - LocalMarkerGraphEdge + LocalMarkerGraph0Vertex, + LocalMarkerGraph0Edge >; class CompressedMarker; class ConsensusCaller; - class LocalMarkerGraphRequestParameters; class LongBaseSequences; } -class shasta::LocalMarkerGraphVertex { +class shasta::LocalMarkerGraph0Vertex { public: // The global vertex id of the vertex of the global marker @@ -64,7 +63,7 @@ public: }; vector<MarkerInfo> markerInfos; - LocalMarkerGraphVertex( + LocalMarkerGraph0Vertex( MarkerGraph::VertexId vertexId, uint64_t distance) : vertexId(vertexId), @@ -103,7 +102,7 @@ public: -class shasta::LocalMarkerGraphEdge { +class shasta::LocalMarkerGraph0Edge { public: // Class to describe the intervening sequence between @@ -182,7 +181,7 @@ public: // in the assembly graph. However, after detangling a marker // graph edge can correspond to multiple locations in the // assembly graph. - vector< pair<AssemblyGraph::EdgeId, uint32_t> > assemblyGraphLocations; + vector< pair<mode0::AssemblyGraph::EdgeId, uint32_t> > assemblyGraphLocations; // Flag that is set if the edge was removed during // approximate transitive reduction by flagWeakMarkerGraphEdges. 
@@ -219,16 +218,17 @@ public: -class shasta::LocalMarkerGraph : - public LocalMarkerGraphBaseClass { +class shasta::LocalMarkerGraph0 : + public LocalMarkerGraph0BaseClass { public: - LocalMarkerGraph( + LocalMarkerGraph0( uint64_t readRepresentation, uint32_t k, uint64_t assemblyMode, const Reads& reads, const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, + const MarkerGraph&, const MemoryMapped::Vector<MarkerGraph::CompressedVertexId>& globalMarkerGraphVertex, const ConsensusCaller& ); @@ -254,7 +254,7 @@ public: KmerId getKmerId(vertex_descriptor) const; // Get the repeat counts for a MarkerInfo of a vertex. - vector<uint8_t> getRepeatCounts(const LocalMarkerGraphVertex::MarkerInfo&) const; + vector<uint8_t> getRepeatCounts(const LocalMarkerGraph0Vertex::MarkerInfo&) const; // Fill in the ConsensusInfo's for each vertex. void computeVertexConsensusInfo(); @@ -262,17 +262,17 @@ public: // Store sequence information in the edge. // Takes as input a vector of the - // LocalMarkerGraphEdge::Info that caused the edge to be created. + // LocalMarkerGraph0Edge::Info that caused the edge to be created. void storeEdgeInfo(edge_descriptor, const vector<MarkerInterval>&); // Write in Graphviz format. void write( ostream&, - const LocalMarkerGraphRequestParameters&) const; + const LocalMarkerGraph0RequestParameters&) const; void write( const string& fileName, - const LocalMarkerGraphRequestParameters&) const; + const LocalMarkerGraph0RequestParameters&) const; // Approximate topological sort, adding edges @@ -301,6 +301,7 @@ private: // (not just those in this local marker graph). const Reads& reads; const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers; + const MarkerGraph& markerGraph; // A reference to the vector containing the global marker graph vertex id // corresponding to each marker. @@ -313,16 +314,16 @@ private: // Class used for graphviz output. 
- class Writer : public LocalMarkerGraphRequestParameters { + class Writer : public LocalMarkerGraph0RequestParameters { public: Writer( - const LocalMarkerGraph&, - const LocalMarkerGraphRequestParameters&); + const LocalMarkerGraph0&, + const LocalMarkerGraph0RequestParameters&); void operator()(ostream&) const; void operator()(ostream&, vertex_descriptor) const; void operator()(ostream&, edge_descriptor) const; - const LocalMarkerGraph& graph; + const LocalMarkerGraph0& graph; // Vertex and edge colors. static const string vertexColorZeroDistance; @@ -341,9 +342,9 @@ private: static const string edgeLabelColorRemovedAsLowCoverageCrossEdge; static const string edgeLabelColorNotRemovedNotAssembled; static const string edgeLabelColorNotRemovedAssembled; - string vertexColor(const LocalMarkerGraphVertex&) const; - string edgeArrowColor(const LocalMarkerGraphEdge&) const; - string edgeLabelColor(const LocalMarkerGraphEdge&) const; + string vertexColor(const LocalMarkerGraph0Vertex&) const; + string edgeArrowColor(const LocalMarkerGraph0Edge&) const; + string edgeLabelColor(const LocalMarkerGraph0Edge&) const; }; friend class Writer; diff --git a/src/LocalMarkerGraphRequestParameters.hpp b/src/LocalMarkerGraph0RequestParameters.hpp index 899f612..bd63457 100644 --- a/src/LocalMarkerGraphRequestParameters.hpp +++ b/src/LocalMarkerGraph0RequestParameters.hpp @@ -5,13 +5,13 @@ #include <map> namespace shasta { - class LocalMarkerGraphRequestParameters; + class LocalMarkerGraph0RequestParameters; } // Class describing the parameters in the form // in the local marker graph page. -class shasta::LocalMarkerGraphRequestParameters { +class shasta::LocalMarkerGraph0RequestParameters { public: uint64_t vertexId; diff --git a/src/LocalMarkerGraph1.cpp b/src/LocalMarkerGraph1.cpp new file mode 100644 index 0000000..1c8b9cf --- /dev/null +++ b/src/LocalMarkerGraph1.cpp @@ -0,0 +1,1067 @@ +// Shasta. 
+#include "LocalMarkerGraph1.hpp" +#include "Base.hpp" +#include "computeLayout.hpp" +#include "findLinearChains.hpp" +#include "html.hpp" +#include "invalid.hpp" +#include "Marker.hpp" +#include "MarkerGraph.hpp" +#include "MurmurHash2.hpp" +#include "orderPairs.hpp" +#include "platformDependent.hpp" +#include "runCommandWithTimeout.hpp" +using namespace shasta; + +// Boost libraries. +#include "boost/graph/filtered_graph.hpp" +#include "boost/graph/iteration_macros.hpp" +#include <boost/uuid/uuid.hpp> +#include <boost/uuid/uuid_generators.hpp> +#include <boost/uuid/uuid_io.hpp> + +// Standard library. +#include "chrono.hpp" +#include "fstream.hpp" +#include <queue> +#include <stack> + + + +LocalMarkerGraph1::LocalMarkerGraph1( + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, + const MarkerGraph& markerGraph, + MarkerGraphVertexId startVertexId, + uint64_t maxDistance, + uint64_t minVertexCoverage, + uint64_t minEdgeCoverage) : + markers(markers), + markerGraph(markerGraph), + maxDistance(maxDistance) +{ + LocalMarkerGraph1& graph = *this; + + // Do a BFS to generate the vertices. + // Edges will be created later. + const vertex_descriptor vStart = addVertex(startVertexId, 0); + std::queue<vertex_descriptor> q; + q.push(vStart); + while(!q.empty()) { + + // Dequeue a vertex. + const vertex_descriptor v0 = q.front(); + q.pop(); + const LocalMarkerGraph1Vertex& vertex0 = graph[v0]; + const MarkerGraphVertexId vertexId0 = vertex0.vertexId; + const uint64_t distance0 = vertex0.distance; + const uint64_t distance1 = distance0 + 1; + + // Loop over outgoing edges. + for(uint64_t edgeId: markerGraph.edgesBySource[vertexId0]) { + const auto& edge = markerGraph.edges[edgeId]; + + // If coverage is too low, skip it. + if(markerGraph.edgeCoverage(edgeId) < minEdgeCoverage) { + continue; + } + + // Get the target vertex. 
+ const MarkerGraph::VertexId vertexId1 = edge.target; + SHASTA_ASSERT(edge.source == vertexId0); + SHASTA_ASSERT(vertexId1 < markerGraph.vertexCount()); + + // If vertex coverage is too low, skip it. + if(markerGraph.vertexCoverage(vertexId1) < minVertexCoverage) { + continue; + } + + // Add this vertex, if we don't already have it. + if(not vertexMap.contains(vertexId1)) { + const vertex_descriptor v1 = graph.addVertex(vertexId1, distance1); + + // Also enqueue it, unless it is at maximum distance. + if(distance1 < maxDistance) { + q.push(v1); + } + } + } + + // Loop over incoming edges. + for(uint64_t edgeId: markerGraph.edgesByTarget[vertexId0]) { + const auto& edge = markerGraph.edges[edgeId]; + + // If coverage is too low, skip it. + if(markerGraph.edgeCoverage(edgeId) < minEdgeCoverage) { + continue; + } + + // Get the source vertex. + const MarkerGraph::VertexId vertexId1 = edge.source; + SHASTA_ASSERT(edge.target == vertexId0); + SHASTA_ASSERT(vertexId1 < markerGraph.vertexCount()); + + // If vertex coverage is too low, skip it. + if(markerGraph.vertexCoverage(vertexId1) < minVertexCoverage) { + continue; + } + + // Add this vertex, if we don't already have it. + if(not vertexMap.contains(vertexId1)) { + const vertex_descriptor v1 = graph.addVertex(vertexId1, distance1); + + // Also enqueue it, unless it is at maximum distance. + if(distance1 < maxDistance) { + q.push(v1); + } + } + } + } + + + + // Create edges. + BGL_FORALL_VERTICES(v0, graph, LocalMarkerGraph1) { + const LocalMarkerGraph1Vertex& vertex0 = graph[v0]; + const MarkerGraphVertexId vertexId0 = vertex0.vertexId; + + for(uint64_t edgeId: markerGraph.edgesBySource[vertexId0]) { + + // If coverage is too low, skip it. 
+ if(markerGraph.edgeCoverage(edgeId) < minEdgeCoverage) { + continue; + } + const auto& edge = markerGraph.edges[edgeId]; + + const MarkerGraph::VertexId vertexId1 = edge.target; + SHASTA_ASSERT(edge.source == vertexId0); + SHASTA_ASSERT(vertexId1 < markerGraph.vertexCount()); + + // If vertexId1 is in the local marker graph, add this edge. + auto it = vertexMap.find(vertexId1); + if(it != vertexMap.end()) { + const vertex_descriptor v1 = it->second; + edge_descriptor e; + tie(e, ignore) = add_edge(v0, v1, LocalMarkerGraph1Edge(edgeId), graph); + edgeMap.insert({edgeId, e}); + } + } + } +} + + + +LocalMarkerGraph1::vertex_descriptor LocalMarkerGraph1::addVertex( + MarkerGraphVertexId vertexId, + uint64_t distance) +{ + LocalMarkerGraph1& graph = *this; + + SHASTA_ASSERT(not vertexMap.contains(vertexId)); + const vertex_descriptor v = add_vertex(LocalMarkerGraph1Vertex(vertexId, distance), graph); + vertexMap.insert(make_pair(vertexId, v)); + + return v; +} + + + +void LocalMarkerGraph1::writeGfa(const string& fileName) const +{ + const LocalMarkerGraph1& graph = *this; + ofstream gfa(fileName); + + // Write the header. + gfa << "H\tVN:Z:1.0\n"; + + // Write one segment for each edge. + BGL_FORALL_EDGES(e, graph, LocalMarkerGraph1) { + const MarkerGraphEdgeId edgeId = graph[e].edgeId; + gfa << + "S\t" << edgeId << "\t"; + + auto sequence = markerGraph.edgeSequence[edgeId]; + copy(sequence.begin(), sequence.end(), ostream_iterator<shasta::Base>(gfa)); + + // RC is multiplied by sequence length so reports the number of reads + // (edge coverage) as depth. + gfa << + "\tLN:i:" << sequence.size() << + "\tRC:i:" << sequence.size() * markerGraph.edgeCoverage(edgeId) << + "\n"; + } + + + + // Write the links. + // For each vertex, we write links between all pairs of incomint/outgoing edges. 
+ BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph1) { + BGL_FORALL_INEDGES(v, e0, graph, LocalMarkerGraph1) { + const MarkerGraphEdgeId edgeId0 = graph[e0].edgeId; + BGL_FORALL_OUTEDGES(v, e1, graph, LocalMarkerGraph1) { + const MarkerGraphEdgeId edgeId1 = graph[e1].edgeId; + gfa << "L\t" << + edgeId0 << "\t+\t" << + edgeId1 << "\t+\t0M\n"; + } + } + } +} + + + +void LocalMarkerGraph1::writeHtml0( + ostream& html, + uint64_t sizePixels, + uint64_t quality, + double timeout, + bool useSvg) const +{ + const LocalMarkerGraph1& graph = *this; + + // Compute the layout. + std::map<edge_descriptor, double> edgeLengthMap; + BGL_FORALL_EDGES(e, graph, LocalMarkerGraph1) { + edgeLengthMap.insert(make_pair(e, 1.)); + } + std::map<vertex_descriptor, array<double, 2> > positionMap; + // const auto t0 = steady_clock::now(); + const ComputeLayoutReturnCode returnCode = computeLayoutCustom( + graph, edgeLengthMap, positionMap, quality, timeout); + // const auto t1 = steady_clock::now(); + // html << "<br>Graph layout computation took " << seconds(t1 - t0) << "s."; + if(returnCode == ComputeLayoutReturnCode::Timeout) { + throw runtime_error("Graph layout took too long. " + "Increase the timeout or decrease the maximum distance."); + } + if(returnCode != ComputeLayoutReturnCode::Success) { + throw runtime_error("Graph layout failed."); + } + + // Find minimum and maximum of x and y. + double xMin = std::numeric_limits<double>::max(); + double xMax = std::numeric_limits<double>::min(); + double yMin = xMin; + double yMax = xMax; + for(const auto& p: positionMap) { + const auto& xy = p.second; + const double x = xy[0]; + const double y = xy[1]; + xMin = min(xMin, x); + xMax = max(xMax, x); + yMin = min(yMin, y); + yMax = max(yMax, y); + } + const double range = max(xMax - xMin, yMax - yMin); + const double factor = double(sizePixels) / range; + + + + // Gather positions, discretized to integers. + // Each of these will generate a pixel. 
+ class PixelInfo { + public: + uint64_t maxCoverage; + MarkerGraphVertexId vertexId; + }; + std::map< pair<int64_t, int64_t>, PixelInfo> pixels; + for(const auto& p: positionMap) { + const vertex_descriptor v = p.first; + const auto& xy = p.second; + const MarkerGraphVertexId vertexId = graph[v].vertexId; + const uint64_t coverage = markerGraph.vertexCoverage(vertexId); + const double x = xy[0]; + const double y = xy[1]; + const uint64_t ix = int64_t(x * factor); + const uint64_t iy = int64_t(y * factor); + auto it = pixels.find({ix, iy}); + if(it == pixels.end()) { + pixels.insert(make_pair(make_pair(ix, iy), PixelInfo({coverage, vertexId}))); + } else { + if(coverage > it->second.maxCoverage) { + it->second.maxCoverage = coverage; + it->second.vertexId = vertexId; + } + } + } + + + + // Find minimum and maximum ix, iy. + int64_t ixMin = std::numeric_limits<int64_t>::max(); + int64_t ixMax = std::numeric_limits<int64_t>::min(); + int64_t iyMin = ixMin; + int64_t iyMax = ixMax; + for(const auto& pixel :pixels) { + const auto& ixy = pixel.first; + ixMin = min(ixMin, ixy.first); + ixMax = max(ixMax, ixy.first); + iyMin = min(iyMin, ixy.second); + iyMax = max(iyMax, ixy.second); + } + + const int64_t width = ixMax - ixMin + 1; + const int64_t height = iyMax - iyMin + 1; + + + + if(useSvg) { + + // Display using svg. 
+ html << "\n<br><svg width=" << width << " height=" << height << ">"; + const string coverage1Color = "red"; + const string coverage2Color = "yellow"; + const string highCoverageColor = "black"; + + for(const auto& pixel :pixels) { + const auto& ixy = pixel.first; + const uint64_t coverage = pixel.second.maxCoverage; + const MarkerGraphVertexId vertexId = pixel.second.vertexId; + const int64_t ix = ixy.first - ixMin; + SHASTA_ASSERT(ix >= 0); + SHASTA_ASSERT(ix < width); + const int64_t iy = ixy.second - iyMin; + SHASTA_ASSERT(iy >= 0); + SHASTA_ASSERT(iy < height); + + string color; + if(coverage == 1) { + color = coverage1Color; + } else if(coverage == 2) { + color = coverage2Color; + } else { + color = highCoverageColor; + } + + html << + "\n<a href='" + "exploreMarkerGraph1?vertexId=" << vertexId << "&outputType=createAndOpenGfa" + "'>" + "<line x1=" << ix << " y1=" << iy << " x2=" << ix << " y2=" << iy << + " stroke=" << color << " stroke-width=1px stroke-linecap=square />" + "</a>"; + + } + + + html << "</svg>"; + + + + } else { + + // Display using canvas + const array<uint8_t, 3> coverage1Color = {255, 0, 0}; + const array<uint8_t, 3> coverage2Color = {255, 255, 0}; + const array<uint8_t, 3> highCoverageColor = {0, 0, 0}; + html << + "\n<br><canvas id=canvas width=" << width << " height=" << height << + ">" + "\n <script>" + "\n var canvas = document.getElementById('canvas');" + "\n var ctx = canvas.getContext('2d');" + "\n var i = ctx.createImageData(" << width << "," << height << ");\n"; + for(const auto& pixel :pixels) { + const auto& ixy = pixel.first; + const uint64_t coverage = pixel.second.maxCoverage; + const int64_t ix = ixy.first - ixMin; + SHASTA_ASSERT(ix >= 0); + SHASTA_ASSERT(ix < width); + const int64_t iy = ixy.second - iyMin; + SHASTA_ASSERT(iy >= 0); + SHASTA_ASSERT(iy < height); + const uint64_t index = (4 * width) * iy + 4 * ix; + if(coverage == 1) { + for(uint64_t k=0; k<3; k++) { + html << "i.data[" << index+k << "]=" << 
int(coverage1Color[k]) << ";"; + } + } else if(coverage == 2) { + for(uint64_t k=0; k<3; k++) { + html << "i.data[" << index+k << "]=" << int(coverage2Color[k]) << ";"; + } + } else { + for(uint64_t k=0; k<3; k++) { + html << "i.data[" << index+k << "]=" << int(highCoverageColor[k]) << ";"; + } + } + html << "i.data[" << index+3 << "]=255;"; + } + html << + "\n ctx.putImageData(i, 0, 0);" + "\n </script>"; + } + +} + + + +void LocalMarkerGraph1::writeHtml1( + ostream& html, + uint64_t sizePixels, + double thicknessScaling, + uint64_t quality, + double edgeResolution, + const string& coloring, + uint64_t redCoverage, + uint64_t greenCoverage, + MarkerGraphEdgeId readFollowingStartEdgeId, + int64_t firstMarkerOffset, + int64_t lastMarkerOffset, + bool showLabels, + double timeout) const +{ + const LocalMarkerGraph1& graph = *this; + + + + // To compute the layout, use an auxiliary graph with a vertex + // for each vertex of the LocalMarkerGraph1 plus zero or more vertices + // for each edge of the LocalMarkerGraph1. + // In this initial implementation we divide each LocalMarkerGraph1 edge into a number + // of AuxiliaryGraph edges equal to the number of bases in its sequence. + using AuxiliaryGraph = boost::adjacency_list<boost::vecS, boost::vecS, boost::undirectedS>; + AuxiliaryGraph auxiliaryGraph; + + // The auxiliary graph vertex corresponding to each vertex of the LocalMarkerGraph1. + std::map<vertex_descriptor, AuxiliaryGraph::vertex_descriptor> auxiliaryVertexMap; + + // The auxiliary graph vertices corresponding to each edge of the LocalMarkerGraph1. + std::map<edge_descriptor, vector<AuxiliaryGraph::vertex_descriptor> > auxiliaryEdgeMap; + + // The desired length of each edge of the auxiliary graph. + std::map<AuxiliaryGraph::edge_descriptor, double> auxiliaryEdgeLengthMap; + + // Create vertices and edges of the AuxiliaryGraph. 
+ BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph1) { + auxiliaryVertexMap.insert({v, add_vertex(auxiliaryGraph)}); + } + vector<AuxiliaryGraph::vertex_descriptor> auxiliaryVertices; + BGL_FORALL_EDGES(e, graph, LocalMarkerGraph1) { + const vertex_descriptor v0 = source(e, graph); + const vertex_descriptor v1 = target(e, graph); + const MarkerGraphEdgeId edgeId = graph[e].edgeId; + const uint64_t sequenceLength = markerGraph.edgeSequence[edgeId].size(); + const uint64_t auxiliaryVertexCount = max(1UL, uint64_t(edgeResolution * double(sequenceLength))); + const double edgeLength = double(sequenceLength) / double(auxiliaryVertexCount + 1); + auxiliaryVertices.clear(); + for(uint64_t i=0; i<auxiliaryVertexCount; i++) { + auxiliaryVertices.push_back(add_vertex(auxiliaryGraph)); + } + auxiliaryEdgeMap.insert({e, auxiliaryVertices}); + + // Add the necessary auxiliary graph edges. + AuxiliaryGraph::edge_descriptor ae; + if(auxiliaryVertexCount == 0) { + tie(ae, ignore) = add_edge(auxiliaryVertexMap[v0], auxiliaryVertexMap[v1], auxiliaryGraph); + auxiliaryEdgeLengthMap.insert({ae, edgeLength}); + } else { + tie(ae, ignore) = add_edge(auxiliaryVertexMap[v0], auxiliaryVertices.front(), auxiliaryGraph); + auxiliaryEdgeLengthMap.insert({ae, edgeLength}); + for(uint64_t i=1; i<auxiliaryVertexCount; i++) { + tie(ae, ignore) = add_edge(auxiliaryVertices[i-1], auxiliaryVertices[i], auxiliaryGraph); + auxiliaryEdgeLengthMap.insert({ae, edgeLength}); + } + tie(ae, ignore) = add_edge(auxiliaryVertices.back(), auxiliaryVertexMap[v1], auxiliaryGraph); + auxiliaryEdgeLengthMap.insert({ae, edgeLength}); + } + } + + // Compute the layout of the auxiliary graph. + std::map<AuxiliaryGraph::vertex_descriptor, array<double, 2> > positionMap; + computeLayoutCustom(auxiliaryGraph, auxiliaryEdgeLengthMap, positionMap, quality, timeout); + + + + // If we are doing read following, we need to compute + // followed read coverage for each edge. 
+ std::map<edge_descriptor, uint64_t> readFollowingCoverageMap; + uint64_t readFollowingStartEdgeCoverage = 0; + if(coloring == "readFollowing") { + readFollowingStartEdgeCoverage = markerGraph.edgeCoverage(readFollowingStartEdgeId); + + // Loop over the MarkerIntervals of the start edge for read following. + for(const MarkerInterval& startMarkerInterval: + markerGraph.edgeMarkerIntervals[readFollowingStartEdgeId]) { + const OrientedReadId orientedReadId = startMarkerInterval.orientedReadId; + const int64_t startOrdinal0 = int64_t(startMarkerInterval.ordinals[0]); + + // The number of markers in this oriented read. + const int64_t orientedReadMarkerCount = int64_t(markers.size(orientedReadId.getValue())); + + // Get the MarkerId of the first marker of this oriented read. + // We can use this later to easily get the MarkerId corresponding to any + // marker in the same oriented read. + const MarkerId firstOrientedReadMarkerId = + markers.begin(orientedReadId.getValue()) - markers.begin(); + + // Loop over the requested range of offsets. + for(int64_t offset=firstMarkerOffset; offset<=lastMarkerOffset; offset++) { + const int64_t ordinal0 = startOrdinal0 + offset; + if(ordinal0 < 0) { + // This offset takes us before the beginning of this oriented read. + continue; + } + const int64_t ordinal1 = ordinal0 + 1; + if(ordinal1 > orientedReadMarkerCount-1) { + // This offset takes us past the end of this oriented read. + continue; + } + + // Find the MarkerIds corresponding to these two ordinals. + const MarkerId markerId0 = firstOrientedReadMarkerId + ordinal0; + const MarkerId markerId1 = firstOrientedReadMarkerId + ordinal1; + + // Find the corresponding marker graph vertices. + // We are using the complete marker graph, so the vertices must exist. 
+ const MarkerGraph::CompressedVertexId compressedVertexId0 = markerGraph.vertexTable[markerId0]; + const MarkerGraph::CompressedVertexId compressedVertexId1 = markerGraph.vertexTable[markerId1]; + SHASTA_ASSERT(compressedVertexId0 != MarkerGraph::invalidCompressedVertexId); + SHASTA_ASSERT(compressedVertexId1 != MarkerGraph::invalidCompressedVertexId); + const MarkerGraphVertexId vertexId0 = compressedVertexId0; + // const MarkerGraphVertexId vertexId1 = compressedVertexId1; + + // Find the edge vertexId0->vertexId1 that contains the MarkerInterval + // with these oriented read and ordinals. + MarkerInterval targetMarkerInterval(orientedReadId, uint32_t(ordinal0), uint32_t(ordinal1)); + MarkerGraphEdgeId edgeId = invalid<MarkerGraphEdgeId>; + for(const MarkerGraphEdgeId candidateEdgeId: markerGraph.edgesBySource[vertexId0]) { + const auto edgeMarkerIntervals = markerGraph.edgeMarkerIntervals[candidateEdgeId]; + if(find(edgeMarkerIntervals.begin(), edgeMarkerIntervals.end(), targetMarkerInterval) + != edgeMarkerIntervals.end()) { + edgeId = candidateEdgeId; + break; + } + } + SHASTA_ASSERT(edgeId != invalid<MarkerGraphEdgeId>); + + // cout << orientedReadId << " at offset " << offset << endl; + + // If this edge is in the LocalMarkerGraph1, increment its read following coverage. + auto it = edgeMap.find(edgeId); + if(it != edgeMap.end()) { + const edge_descriptor e = it->second; + auto jt = readFollowingCoverageMap.find(e); + if(jt == readFollowingCoverageMap.end()){ + readFollowingCoverageMap.insert({e, 1}); + // cout << "Added a new entry in the readFollowingCoverageMap." << endl; + } else { + ++jt->second; + // cout << "Incremented readFollowingCoverageMap to " << jt->second << endl; + } + } else { + // cout << "Not found in the edge map." 
<< endl; + } + } + } + + /* + for(const auto& p: readFollowingCoverageMap) { + const edge_descriptor e = p.first; + const uint64_t coverage = p.second; + cout << graph[e].edgeId << " " << coverage << endl; + } + */ + } + + + + // Compute the view box. + double xMin = std::numeric_limits<double>::max(); + double xMax = std::numeric_limits<double>::min(); + double yMin = xMin; + double yMax = xMax; + for(const auto& p: positionMap) { + const array<double, 2>& xy = p.second; + const double x = xy[0]; + const double y = xy[1]; + xMin = min(xMin, x); + xMax = max(xMax, x); + yMin = min(yMin, y); + yMax = max(yMax, y); + } + const double extend = thicknessScaling; + xMin -= extend; + xMax += extend; + yMin -= extend; + yMax += extend; + const double fontSize = 16. * max(xMax-xMin, yMax-yMin) / double(sizePixels); + + // Make the "arrow" length equal to the desired length of 1 base. + const double arrowLength = 1.; + + // Begin the svg. + const string svgId = "LocalMarkerGraph1"; + html << "\n<div style='display: inline-block; vertical-align:top'>" + "<br><svg id='" << svgId << + "' width='" << sizePixels << + "' height='" << sizePixels << + "' viewbox='" << xMin << " " << yMin << " " << + xMax - xMin << " " << + yMax - yMin << "'" + " font-size='" << fontSize << "' style='border-style:solid;border-color:Black;stroke-linecap:round'" + " font-family=monospace" + ">\n"; + + + + // Create a vector to contain edges in the order in which we write them out. + // Edges written last are less likely to be superimposed by other edges. 
+ vector< pair<edge_descriptor, uint64_t> > allEdges; + if(coloring == "readFollowing") { + BGL_FORALL_EDGES(e, graph, LocalMarkerGraph1) { + uint64_t readFollowingCoverage = 0; + auto it = readFollowingCoverageMap.find(e); + if(it != readFollowingCoverageMap.end()) { + readFollowingCoverage = it->second; + } + allEdges.push_back({e, readFollowingCoverage}); + } + } else { + BGL_FORALL_EDGES(e, graph, LocalMarkerGraph1) { + const MarkerGraphEdgeId edgeId = graph[e].edgeId; + const uint64_t coverage = markerGraph.edgeCoverage(edgeId); + allEdges.push_back({e, coverage}); + } + } + sort(allEdges.begin(), allEdges.end(), + OrderPairsBySecondOnly<edge_descriptor, uint64_t>()); + + + + // Write the edges. + html << "\n<g id=edges stroke-width='" << thicknessScaling << "'>"; + for(const auto& p: allEdges) { + const edge_descriptor e = p.first; + const MarkerGraphEdgeId edgeId = graph[e].edgeId; + const uint64_t coverage = markerGraph.edgeCoverage(edgeId); + const vertex_descriptor v0 = source(e, graph); + const vertex_descriptor v1 = target(e, graph); + const auto& p0 = positionMap[auxiliaryVertexMap[v0]]; + const auto& p1 = positionMap[auxiliaryVertexMap[v1]]; + const vector<AuxiliaryGraph::vertex_descriptor>& auxiliaryVertices = auxiliaryEdgeMap[e]; + + string color; + uint64_t readFollowingCoverage = 0; + if(coloring == "random") { + const uint32_t hue = MurmurHash2(&edgeId, sizeof(edgeId), 231) % 360; + color = "hsl(" + to_string(hue) + ",50%,50%)"; + } else if(coloring == "byCoverage") { + if(coverage <= redCoverage) { + color = "Red"; + } else if(coverage >= greenCoverage) { + color = "Green"; + } else { + const uint32_t hue = uint32_t(120. 
* + (double(coverage) - double(redCoverage)) / (double(greenCoverage) - double(redCoverage))); + color = "hsl(" + to_string(hue) + ",50%,50%)"; + } + } else if(coloring == "readFollowing") { + auto it = readFollowingCoverageMap.find(e); + if(it == readFollowingCoverageMap.end()) { + color = "LightGrey"; + } else { + const uint64_t coverage = it->second; + readFollowingCoverage = coverage; + if(coverage <= redCoverage) { + color = "Red"; + } else if(coverage >= greenCoverage) { + color = "Green"; + } else { + const uint32_t hue = uint32_t(120. * + (double(coverage) - double(redCoverage)) / (double(greenCoverage) - double(redCoverage))); + color = "hsl(" + to_string(hue) + ",50%,50%)"; + } + } + } else { + SHASTA_ASSERT(0); + } + const string properties = "stroke='" + color + "'"; + + SHASTA_ASSERT(not auxiliaryVertices.empty()); + + // Create a group for this edge. + const auto sequence = markerGraph.edgeSequence[edgeId]; + html << "<g>"; + + // Add a title. + html << + "<title>Edge " << edgeId << ", coverage " << coverage << + ", " << sequence.size() << " bases: "; + copy(sequence.begin(), sequence.end(), ostream_iterator<shasta::Base>(html)); + if(coloring == "readFollowing") { + html << ", read following coverage " << readFollowingCoverage << "/" << + readFollowingStartEdgeCoverage; + } + html << "</title>"; + + // Add a hyperlink. + html << "<a href='exploreMarkerGraphEdge?edgeId=" << edgeId << "'>"; + html << "<g id='Edge-"<< edgeId << "' " << properties << " >"; + + // Line from p0 to the first auxiliary vertex. + const auto& xyFirst = positionMap[auxiliaryVertices.front()]; + html << "\n<line x1=" << p0[0] << " y1=" << p0[1] << + " x2=" << xyFirst[0] << " y2=" << xyFirst[1] << " />"; + + // Lines between auxiliary vertices. 
+ for(uint64_t i=1; i<auxiliaryVertices.size(); i++) { + const auto& xyA = positionMap[auxiliaryVertices[i-1]]; + const auto& xyB = positionMap[auxiliaryVertices[i]]; + html << "\n<line x1=" << xyA[0] << " y1=" << xyA[1] << + " x2=" << xyB[0] << " y2=" << xyB[1] << " />"; + } + + // Line from the last auxiliary vertex to p1. + const auto& xyLast = positionMap[auxiliaryVertices.back()]; + html << "\n<line x1=" << xyLast[0] << " y1=" << xyLast[1] << + " x2=" << p1[0] << " y2=" << p1[1] << " />"; + html << "</g></a>"; + + // Label. + if(showLabels) { + double x, y; + if((auxiliaryVertices.size() %2) == 0) { + const auto positionA = positionMap[auxiliaryVertices[auxiliaryVertices.size()/2 -1]]; + const auto positionB = positionMap[auxiliaryVertices[auxiliaryVertices.size()/2]]; + x = (positionA[0] + positionB[0]) / 2; + y = (positionA[1] + positionB[1]) / 2; + } else { + const auto position = positionMap[auxiliaryVertices[auxiliaryVertices.size()/2]]; + x = position[0]; + y = position[1]; + } + html << "<text x='" << x << "' << y='" << y << "' dominant-baseline=middle text-anchor=middle>"; + copy(sequence.begin(), sequence.end(), ostream_iterator<shasta::Base>(html)); + html << "</text>"; + } + + // End the group for this edge. + html << "</g>"; + } + html << "\n</g>"; + + + + // Write the "arrows". + html << "\n<g id=arrows stroke-width='" << thicknessScaling/3. << "'>"; + BGL_FORALL_EDGES(e, graph, LocalMarkerGraph1) { + const vertex_descriptor v1 = target(e, graph); + const auto& p1 = positionMap[auxiliaryVertexMap[v1]]; + const vector<AuxiliaryGraph::vertex_descriptor>& auxiliaryVertices = auxiliaryEdgeMap[e]; + SHASTA_ASSERT(not auxiliaryVertices.empty()); + + // Position of the last auxiliary vertex. + const auto& xyLast = positionMap[auxiliaryVertices.back()]; + + // Draw the "arrow". + // We need to compute a unit vector in the direction (p1, xyLast). 
+ const double vx = xyLast[0] - p1[0]; + const double vy = xyLast[1] - p1[1]; + const double v = sqrt(vx*vx + vy * vy); + if(v < 1.e-3) { + // Trouble. This can happen if two vertices are very close. Skip the arrow. + continue; + } + const double ux = vx / v; + const double uy = vy / v; + const double xArrow = p1[0] + ux * arrowLength; + const double yArrow = p1[1] + uy * arrowLength; + html << "\n<line x1=" << xArrow << " y1=" << yArrow << + " x2=" << p1[0] << " y2=" << p1[1] << " stroke=Black />"; + } + html << "\n</g>"; + + + // Write the vertices. + // They can obscure coverage coloring. + if(true /*coloring == "random"*/) { + html << "\n<g id=vertices stroke-width='" << thicknessScaling << "'>"; + BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph1) { + const auto& p = positionMap[auxiliaryVertexMap[v]]; + const double x = p[0]; + const double y = p[1]; + const string color = (graph[v].distance == maxDistance ? "Grey" : "Black"); + + // Create a group for this edge. + const MarkerGraphVertexId vertexId = graph[v].vertexId; + const uint64_t coverage = markerGraph.vertexCoverage(vertexId); + html << "<g><title>Vertex " << vertexId << ", coverage " << coverage; + html << "</title>"; + html << "<a href='exploreMarkerGraphVertex?vertexId=" << vertexId << "'>"; + + // Write the vertex. + html << "\n<line x1=" << x << " y1=" << y << + " x2=" << x << " y2=" << y << " stroke=" << color << " />"; + + // End the group. + html << "</a></g>"; + } + html << "\n</g>"; + } + + // Finish the svg. + html << "\n</svg></div>"; + + // Add drag and zoom. + addSvgDragAndZoom(html); + + // Side panel. 
+ html << "<div style='display: inline-block'>"; + + // Change thickness + html << R"stringDelimiter( + <p><table> + <tr><th class=left>Thickness<td> + <button type='button' onClick='changeThickness(0.1)' style='width:3em'>---</button> + <button type='button' onClick='changeThickness(0.5)' style='width:3em'>--</button> + <button type='button' onClick='changeThickness(0.8)' style='width:3em'>-</button> + <button type='button' onClick='changeThickness(1.25)' style='width:3em'>+</button> + <button type='button' onClick='changeThickness(2.)' style='width:3em'>++</button> + <button type='button' onClick='changeThickness(10.)' style='width:3em'>+++</button> + <script> + function changeThickness(factor) + { + edges = document.getElementById('edges'); + edges.setAttribute('stroke-width', factor * edges.getAttribute('stroke-width')); + vertices = document.getElementById('vertices'); + vertices.setAttribute('stroke-width', factor * vertices.getAttribute('stroke-width')); + arrows = document.getElementById('arrows'); + arrows.setAttribute('stroke-width', factor * arrows.getAttribute('stroke-width')); + } + </script> + )stringDelimiter"; + + + + // Zoom buttons. + html << R"stringDelimiter( + <tr title='Or use the mouse wheel.'><th class=left>Zoom<td> + <button type='button' onClick='zoomSvg(0.1)' style='width:3em'>---</button> + <button type='button' onClick='zoomSvg(0.5)' style='width:3em'>--</button> + <button type='button' onClick='zoomSvg(0.8)' style='width:3em'>-</button> + <button type='button' onClick='zoomSvg(1.25)' style='width:3em'>+</button> + <button type='button' onClick='zoomSvg(2.)' style='width:3em'>++</button> + <button type='button' onClick='zoomSvg(10.)' style='width:3em'>+++</button> + )stringDelimiter"; + + + + // Buttons to highlight an edge and zoom to an edge. 
+ html << R"stringDelimiter( + <tr><td colspan=2> + <button onClick='highlightEdge()'>Highlight</button> + <button onClick='zoomToEdge()'>Zoom to</button>edge + <input id=selectedEdgeId type=text size=10 style='text-align:center'> + <script> + function zoomToEdge() + { + // Get the edge id from the input field. + var edgeId = document.getElementById("selectedEdgeId").value; + zoomToGivenEdge(edgeId); + } + function zoomToGivenEdge(edgeId) + { + var element = document.getElementById("Edge-" + edgeId); + + // Find the bounding box and its center. + var box = element.getBBox(); + var xCenter = box.x + 0.5 * box.width; + var yCenter = box.y + 0.5 * box.height; + + // Change the viewbox of the svg to be a bit larger than a square + // containing the bounding box. + var enlargeFactor = 2.; + var size = enlargeFactor * Math.max(box.width, box.height); + var factor = size / width; + width = size; + height = size; + x = xCenter - 0.5 * size; + y = yCenter - 0.5 * size; + var svg = document.querySelector('svg'); + svg.setAttribute('viewBox', `${x} ${y} ${size} ${size}`); + ratio = size / svg.getBoundingClientRect().width; + svg.setAttribute('font-size', svg.getAttribute('font-size') * factor); + + } + function highlightEdge() + { + // Get the edge id from the input field. + var edgeId = document.getElementById("selectedEdgeId").value; + var element = document.getElementById("Edge-" + edgeId); + + element.style.stroke = "Magenta"; + } + </script> + )stringDelimiter"; + + + + // End the side panel. + html << "</table></div>"; +} + + + +void LocalMarkerGraph1::pruneLowCoverageLeaves(uint64_t maxPruneEdgeCoverage) +{ + if(maxPruneEdgeCoverage == 0) { + return; + } + + pruneLowCoverageForwardLeaves(maxPruneEdgeCoverage); + pruneLowCoverageBackwardLeaves(maxPruneEdgeCoverage); + +} + + + +void LocalMarkerGraph1::pruneLowCoverageForwardLeaves(uint64_t maxPruneCoverage) +{ + LocalMarkerGraph1& graph = *this; + + // Start will all vertices with out-degree 0 and low coverage. 
+ std::stack<vertex_descriptor> leaves; + BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph1) { + const MarkerGraphVertexId vertexId = graph[v].vertexId; + const uint64_t coverage = markerGraph.vertexCoverage(vertexId); + if(coverage > maxPruneCoverage) { + continue; + } + if(out_degree(v, graph) == 0) { + leaves.push(v); + } + } + + // Main loop. At each iteration we remove a leaf, and add others as required. + while(not leaves.empty()) { + const vertex_descriptor leaf = leaves.top(); + leaves.pop(); + + // If any parent has out-degree 1 and low coverage, + // it becomes a leaf to be removed when we remove this one. + BGL_FORALL_INEDGES(leaf, e, graph, LocalMarkerGraph1) { + const vertex_descriptor parent = source(e, graph); + if(parent == leaf) { + continue; + } + const MarkerGraphVertexId vertexId = graph[parent].vertexId; + const uint64_t coverage = markerGraph.vertexCoverage(vertexId); + if(coverage > maxPruneCoverage) { + continue; + } + if(out_degree(parent, graph) == 1) { + leaves.push(parent); + } + } + + clear_vertex(leaf, graph); + remove_vertex(leaf, graph); + } +} + + + +void LocalMarkerGraph1::pruneLowCoverageBackwardLeaves(uint64_t maxPruneCoverage) +{ + LocalMarkerGraph1& graph = *this; + + // Start will all vertices with in-degree 0 and low coverage. + std::stack<vertex_descriptor> leaves; + BGL_FORALL_VERTICES(v, graph, LocalMarkerGraph1) { + const MarkerGraphVertexId vertexId = graph[v].vertexId; + const uint64_t coverage = markerGraph.vertexCoverage(vertexId); + if(coverage > maxPruneCoverage) { + continue; + } + if(in_degree(v, graph)==0) { + leaves.push(v); + } + } + + // Main loop. At each iteration we remove a leaf, and add others as required. + while(not leaves.empty()) { + const vertex_descriptor leaf = leaves.top(); + leaves.pop(); + + // If any child has in-degree 1 and low coverage, + // it becomes a leaf to be removed when we remove this one. 
+ BGL_FORALL_OUTEDGES(leaf, e, graph, LocalMarkerGraph1) { + const vertex_descriptor child = target(e, graph); + if(child == leaf) { + continue; + } + const MarkerGraphVertexId vertexId = graph[child].vertexId; + const uint64_t coverage = markerGraph.vertexCoverage(vertexId); + if(coverage > maxPruneCoverage) { + continue; + } + if(in_degree(child, graph)==1) { + leaves.push(child); + } + } + + clear_vertex(leaf, graph); + remove_vertex(leaf, graph); + } + +} + + + +void LocalMarkerGraph1::findLowCoverageChains( + uint64_t maxChainCoverage, + vector< vector<vertex_descriptor> >& chains + ) const +{ + const LocalMarkerGraph1& graph = *this; + + // Create a filtered graph containing only the vertices + // with coverage up to maxChainCoverage. + class VertexPredicate { + public: + VertexPredicate() : graph(0), maxChainCoverage(invalid<uint64_t>) {} + VertexPredicate( + const LocalMarkerGraph1& graph, + uint64_t maxChainCoverage) : + graph(&graph), + maxChainCoverage(maxChainCoverage) + {} + const LocalMarkerGraph1* graph; + uint64_t maxChainCoverage; + bool operator()(const vertex_descriptor v) const + { + const MarkerGraphVertexId vertexId = (*graph)[v].vertexId; + const uint64_t coverage = graph->markerGraph.vertexCoverage(vertexId); + return coverage <= maxChainCoverage; + } + }; + boost::filtered_graph<LocalMarkerGraph1, boost::keep_all, VertexPredicate> + filteredGraph(graph, boost::keep_all(), VertexPredicate(graph, maxChainCoverage)); + + // Find linear chains in this filtered graph. + findLinearVertexChains(filteredGraph, chains); +} + + + +void LocalMarkerGraph1::removeLongLowCoverageChains( + uint64_t maxChainCoverage, + uint64_t minLength) +{ + LocalMarkerGraph1& graph = *this; + + // Find low coverage chains. + vector< vector<LocalMarkerGraph1::vertex_descriptor> > lowCoverageChains; + findLowCoverageChains(1, lowCoverageChains); + + // Remove the long ones. 
+ for(const auto& chain: lowCoverageChains) { + if(chain.size() >= minLength) { + for(const vertex_descriptor v: chain) { + clear_vertex(v, graph); + remove_vertex(v, graph); + } + } + } + +} + diff --git a/src/LocalMarkerGraph1.hpp b/src/LocalMarkerGraph1.hpp new file mode 100644 index 0000000..0991a3c --- /dev/null +++ b/src/LocalMarkerGraph1.hpp @@ -0,0 +1,134 @@ +#ifndef SHASTA_LOCAL_MARKER_GRAPH1_HPP +#define SHASTA_LOCAL_MARKER_GRAPH1_HPP + +// Shasta. +#include "shastaTypes.hpp" + +// Boost libraries. +#include <boost/graph/adjacency_list.hpp> + +// Standard library. +#include "iosfwd.hpp" +#include <map> +#include "string.hpp" +#include "vector.hpp" + +namespace shasta { + + class LocalMarkerGraph1Vertex; + class LocalMarkerGraph1Edge; + class LocalMarkerGraph1; + using LocalMarkerGraph1BaseClass = boost::adjacency_list< + boost::listS, + boost::listS, + boost::bidirectionalS, + LocalMarkerGraph1Vertex, + LocalMarkerGraph1Edge + >; + + class CompressedMarker; + class MarkerGraph; + namespace MemoryMapped { + template<class T, class Int> class VectorOfVectors; + } +} + + +class shasta::LocalMarkerGraph1Vertex { +public: + + // The id of the corresponding marker graph vertex. + MarkerGraphVertexId vertexId; + + // The distance from the start vertex. + uint64_t distance; + + LocalMarkerGraph1Vertex( + MarkerGraphVertexId vertexId, + uint64_t distance) : + vertexId(vertexId), + distance(distance) + { + } + +}; + + + +class shasta::LocalMarkerGraph1Edge { +public: + + // The id of the corresponding marker graph edge. 
+ MarkerGraphEdgeId edgeId; + + LocalMarkerGraph1Edge(MarkerGraphEdgeId edgeId) : + edgeId(edgeId) + { + } + +}; + + + +class shasta::LocalMarkerGraph1 : + public LocalMarkerGraph1BaseClass { +public: + + LocalMarkerGraph1( + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, + const MarkerGraph&, + MarkerGraphVertexId, + uint64_t maxDistance, + uint64_t minVertexCoverage, + uint64_t minEdgeCoverage + ); + + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers; + const MarkerGraph& markerGraph; + uint64_t maxDistance; + + std::map<MarkerGraphVertexId, vertex_descriptor> vertexMap; + std::map<MarkerGraphEdgeId, edge_descriptor> edgeMap; + vertex_descriptor addVertex(MarkerGraphVertexId, uint64_t distance); + + void writeGfa(const string& fileName) const; + void writeHtml0( + ostream&, + uint64_t sizePixels, + uint64_t quality, + double timeout, + bool useSvg) const; + void writeHtml1( + ostream&, + uint64_t sizePixels, + double thicknessScaling, + uint64_t quality, + double edgeResolution, + const string& coloring, + uint64_t redCoverage, + uint64_t greenCoverage, + MarkerGraphEdgeId readFollowingStartEdgeId, + int64_t firstMarkerOffset, + int64_t lastMarkerOffset, + bool showLabels, + double timeout) const; + + void pruneLowCoverageLeaves(uint64_t maxPruneCoverage); +private: + void pruneLowCoverageForwardLeaves(uint64_t maxPruneCoverage); + void pruneLowCoverageBackwardLeaves(uint64_t maxPruneCoverage); + +public: + + void removeLongLowCoverageChains( + uint64_t maxChainCoverage, + uint64_t minLength); +private: + void findLowCoverageChains( + uint64_t maxChainCoverage, + vector< vector<vertex_descriptor> >& + ) const; + +}; + +#endif diff --git a/src/LocalReadGraph.cpp b/src/LocalReadGraph.cpp index 196edd3..e54195d 100644 --- a/src/LocalReadGraph.cpp +++ b/src/LocalReadGraph.cpp @@ -1,6 +1,7 @@ // Shasta. 
#include "LocalReadGraph.hpp" #include "Alignment.hpp" +#include "Assembler.hpp" #include "writeGraph.hpp" using namespace shasta; @@ -177,9 +178,9 @@ void LocalReadGraph::Writer::operator()(std::ostream& s, edge_descriptor e) cons // Edge thickness is determined by the number of aligned markers. s << " penwidth=\"" << edgeThicknessScalingFactor * (1.e-4 * edge.markerCount) << "\""; - // An edge that crosses strands is drawn dashed. + // An edge that crosses strands is drawn purple. if(edge.crossesStrands) { - s << " style=dashed"; + s << " color=purple"; } s << "]"; @@ -224,6 +225,7 @@ void LocalReadGraph::writeSvg( double vertexScalingFactor, double edgeThicknessScalingFactor, uint64_t maxDistance, + const Assembler& assembler, ostream& svg) const { using Graph = LocalReadGraph; @@ -275,15 +277,42 @@ void LocalReadGraph::writeSvg( EdgeAttributes attributes; attributes.thickness = edgeThicknessScalingFactor * 1.e-6 * double(edge.markerCount); - if(edge.color.empty()) { - attributes.color = "midnightblue"; + + // Extract the uniqueness metric. It is only valid for alignment method 5. + // In all other cases it is a signaling Nan. + // If the uniqueness metric is available, use it to color the edge. + const uint64_t globalEdgeId = edge.globalEdgeId; + const ReadGraphEdge& globalEdge = assembler.readGraph.edges[globalEdgeId]; + const uint64_t alignmentId = globalEdge.alignmentId; + const AlignmentData& alignmentData = assembler.alignmentData[alignmentId]; + const AlignmentInfo& alignmentInfo = alignmentData.info; + + // Set the edge color. + if(false /* not std::isnan(alignmentInfo.uniquenessMetric) */) { + const float red = 1.; + const float green = 5.; + if(alignmentInfo.uniquenessMetric <= red) { + attributes.color = "red"; + } else if(alignmentInfo.uniquenessMetric >= green) { + attributes.color = "green"; + } else { + const uint64_t h = uint64_t(std::round(alignmentInfo.uniquenessMetric - red) * 120. 
/ (green - red)); + attributes.color = "hsl(" + to_string(h) + ",100%,50%)"; + } } else { - attributes.color = edge.color; + if(edge.color.empty()) { + attributes.color = "midnightblue"; + } else { + attributes.color = edge.color; + } } attributes.tooltip = vertex0.orientedReadId.getString() + " " + vertex1.orientedReadId.getString() + ", " + to_string(edge.markerCount) + " aligned markers"; + if(not std::isnan(alignmentInfo.uniquenessMetric)) { + attributes.tooltip += ", uniqueness metric " + to_string(alignmentInfo.uniquenessMetric); + } edgeAttributes.insert(make_pair(e, attributes)); } diff --git a/src/LocalReadGraph.hpp b/src/LocalReadGraph.hpp index 50f9cdc..5898152 100644 --- a/src/LocalReadGraph.hpp +++ b/src/LocalReadGraph.hpp @@ -44,6 +44,8 @@ namespace shasta { {}; enum class AlignmentType; + + class Assembler; } @@ -158,6 +160,7 @@ public: double vertexScalingFactor, double edgeThicknessScalingFactor, uint64_t maxDistance, + const Assembler&, ostream& svg) const; // Write in Graphviz format. diff --git a/src/LowHash0.cpp b/src/LowHash0.cpp index a09ed36..b5e6a5c 100644 --- a/src/LowHash0.cpp +++ b/src/LowHash0.cpp @@ -33,9 +33,8 @@ LowHash0::LowHash0( size_t maxBucketSize, // The maximum size for a bucket to be used. size_t minFrequency, // Minimum number of minHash hits for a pair to be considered a candidate. 
size_t threadCountArgument, - const MemoryMapped::Vector<KmerInfo>& kmerTable, const Reads& reads, - const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, + const MemoryMapped::VectorOfVectors<KmerId, uint64_t>& kmerIds, MemoryMapped::Vector<OrientedReadPair>& candidateAlignments, MemoryMapped::Vector< array<uint64_t, 3> >& readLowHashStatistics, const string& largeDataFileNamePrefix, @@ -48,9 +47,8 @@ LowHash0::LowHash0( maxBucketSize(maxBucketSize), minFrequency(minFrequency), threadCount(threadCountArgument), - kmerTable(kmerTable), reads(reads), - markers(markers), + kmerIds(kmerIds), readLowHashStatistics(readLowHashStatistics), largeDataFileNamePrefix(largeDataFileNamePrefix), largeDataPageSize(largeDataPageSize), @@ -71,7 +69,7 @@ LowHash0::LowHash0( // and each feature generates a low hash with probability hashFraction. // So an estimate of the total number of hashes is: const uint64_t totalLowHashCountEstimate = - uint64_t(hashFraction * double(markers.totalSize())); + uint64_t(hashFraction * double(kmerIds.totalSize())); const uint32_t leadingZeroBitCount = uint32_t(__builtin_clzl(totalLowHashCountEstimate)); const uint32_t log2TotalLowHashCountEstimate = 64 - leadingZeroBitCount; @@ -98,18 +96,11 @@ LowHash0::LowHash0( cout << " = " << bucketCount << " buckets. "<< endl; - - - // Create vectors containing only the k-mer ids of all markers. - // This is used to speed up the computation of hash functions. - performanceLog << timestamp << "Creating kmer ids for oriented reads." << endl; - createKmerIds(); - // Compute the threshold for a hash value to be considered low. hashThreshold = uint64_t(double(hashFraction) * double(std::numeric_limits<uint64_t>::max())); // The number of oriented reads, each with its own vector of markers. 
- const OrientedReadId::Int orientedReadCount = OrientedReadId::Int(markers.size()); + const OrientedReadId::Int orientedReadCount = OrientedReadId::Int(kmerIds.size()); const ReadId readCount = orientedReadCount / 2; @@ -127,6 +118,9 @@ LowHash0::LowHash0( // Write the header of the histogram file. histogramCsv << "Iteration,BucketSize,BucketCount,FeatureCount\n"; + // If minBucketSize and maxBucketSize are both zero, + // they are chosen automatically for each iteration. + const bool dynamicMinMaxBucketSizes = ((minBucketSize == 0) and (maxBucketSize == 0)); // LowHash0 iteration loop. @@ -174,7 +168,16 @@ LowHash0::LowHash0( setupLoadBalancing(readCount, batchSize); runThreads(&LowHash0::pass2ThreadFunction, threadCount); buckets.endPass2(false, false); - computeBucketHistogram(); + + // Compute a histogram of bucket size. + vector<uint64_t> bucketHistogram; + computeBucketHistogram(bucketHistogram); + + // If dynamic adjustment of min/max bucket size was requested, + // do it now for this iteration, based on the current bucket size histogram. + if(dynamicMinMaxBucketSizes) { + adjustMinMaxBucketSizes(bucketHistogram); + } // Pass 3: inspect the buckets to find candidates. batchSize = 10000; @@ -223,7 +226,7 @@ LowHash0::LowHash0( for(ReadId readId=0; readId<readCount; readId++) { const array<uint64_t, 3>& counters = readLowHashStatistics[readId]; const uint64_t total = std::accumulate(counters.begin(), counters.end(), 0); - const uint64_t featureCount = markers.size(OrientedReadId(readId, 0).getValue()) - (m-1); + const uint64_t featureCount = kmerIds.size(OrientedReadId(readId, 0).getValue()) - (m-1); const double featureSampling = double(total) / double(featureCount); csv << readId << ","; csv << (reads.getFlags(readId).isPalindromic ? "Yes," : "No,"); @@ -242,13 +245,8 @@ LowHash0::LowHash0( } } - - - // Clean up work areas. + // Clean up . buckets.remove(); - kmerIds.remove(); - - // Done. 
const auto tEnd = steady_clock::now(); @@ -258,57 +256,6 @@ LowHash0::LowHash0( -void LowHash0::createKmerIds() -{ - kmerIds.createNew( - largeDataFileNamePrefix.empty() ? "" : (largeDataFileNamePrefix + "tmp-LowHash0-Markers"), - largeDataPageSize); - const ReadId orientedReadCount = ReadId(markers.size()); - const ReadId readCount = orientedReadCount / 2; - kmerIds.beginPass1(orientedReadCount); - for(ReadId readId=0; readId!=readCount; readId++) { - for(Strand strand=0; strand<2; strand++) { - const OrientedReadId orientedReadId(readId, strand); - const auto markerCount = markers.size(orientedReadId.getValue()); - kmerIds.incrementCount(orientedReadId.getValue(), markerCount); - } - } - kmerIds.beginPass2(); - kmerIds.endPass2(false); - const size_t batchSize = 10000; - setupLoadBalancing(readCount, batchSize); - runThreads(&LowHash0::createKmerIds, threadCount); -} - - - -// Thread function for createKmerIds. -void LowHash0::createKmerIds(size_t threadId) -{ - - // Loop over batches assigned to this thread. - uint64_t begin, end; - while(getNextBatch(begin, end)) { - - // Loop over reads assigned to this batch. - for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) { - for(Strand strand=0; strand<2; strand++) { - const OrientedReadId orientedReadId(readId, strand); - const auto orientedReadMarkers = markers[orientedReadId.getValue()]; - - SHASTA_ASSERT(kmerIds.size(orientedReadId.getValue()) == orientedReadMarkers.size()); - - auto pointer = kmerIds.begin(orientedReadId.getValue()); - for(const CompressedMarker& marker: orientedReadMarkers) { - *pointer++ = marker.kmerId; - } - } - } - } -} - - - // Pass1: compute the low hashes for each oriented read // and prepare the buckets for filling. void LowHash0::pass1ThreadFunction(size_t threadId) @@ -322,7 +269,11 @@ void LowHash0::pass1ThreadFunction(size_t threadId) // Loop over oriented reads assigned to this batch. 
for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) { - if(reads.getFlags(readId).isPalindromic) { + const ReadFlags& flags = reads.getFlags(readId); + if(flags.discardDueToDuplicates) { + continue; + } + if(flags.isPalindromic) { continue; } for(Strand strand=0; strand<2; strand++) { @@ -340,7 +291,7 @@ void LowHash0::pass1ThreadFunction(size_t threadId) // Get the markers for this oriented read. - KmerId* kmerIdsPointer = kmerIds.begin(orientedReadId.getValue()); + const KmerId* kmerIdsPointer = kmerIds.begin(orientedReadId.getValue()); const size_t featureCount = markerCount - m + 1; // Loop over features of this oriented read. @@ -371,7 +322,11 @@ void LowHash0::pass2ThreadFunction(size_t threadId) // Loop over oriented reads assigned to this batch. for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) { - if(reads.getFlags(readId).isPalindromic) { + const ReadFlags& flags = reads.getFlags(readId); + if(flags.discardDueToDuplicates) { + continue; + } + if(flags.isPalindromic) { continue; } for(Strand strand=0; strand<2; strand++) { @@ -563,7 +518,7 @@ void LowHash0::merge( -void LowHash0::computeBucketHistogram() +void LowHash0::computeBucketHistogram(vector<uint64_t>& bucketHistogram) { threadBucketHistogram.clear(); threadBucketHistogram.resize(threadCount); @@ -576,7 +531,8 @@ void LowHash0::computeBucketHistogram() for(const vector<uint64_t>& histogram: threadBucketHistogram) { largestBucketSize = max(largestBucketSize, uint64_t(histogram.size())); } - vector<uint64_t> bucketHistogram(largestBucketSize, 0); + bucketHistogram.clear(); + bucketHistogram.resize(largestBucketSize, 0); for(const vector<uint64_t>& histogram: threadBucketHistogram) { for(uint64_t bucketSize=0; bucketSize<histogram.size(); bucketSize++) { bucketHistogram[bucketSize] += histogram[bucketSize]; @@ -611,3 +567,37 @@ void LowHash0::computeBucketHistogramThreadFunction(size_t threadId) } } } + + + +// Adjust minBucketSize and maxBucketSize based on the current +// 
bucket size histogram. +void LowHash0::adjustMinMaxBucketSizes(const vector<uint64_t>& histogram) +{ + // Set minBucketSize to the lowest bucket size B0 + // such that histogram[B0] > histogram[B0-1]. + bool done = false; + for(uint64_t B0=1; B0<histogram.size(); B0++) { + if(histogram[B0] > histogram[B0 - 1]) { + minBucketSize = B0; + done = true; + break; + } + } + SHASTA_ASSERT(done); + + // Set maxBucketSize to the largest bucket size B1 such that histogram[B1] >= histogram[B0] = histogram[minBucketSize] + done = false; + for(uint64_t B1=histogram.size()-1; B1>=minBucketSize; B1--) { + if(histogram[B1] >= histogram[minBucketSize]) { + maxBucketSize = B1; + done = true; + break; + } + } + SHASTA_ASSERT(done); + + cout << "Automatic settings for this iteration: minBucketSize " << minBucketSize << + ", maxBucketSize " << maxBucketSize << endl; +} + diff --git a/src/LowHash0.hpp b/src/LowHash0.hpp index 6bd28ce..3527e93 100644 --- a/src/LowHash0.hpp +++ b/src/LowHash0.hpp @@ -42,9 +42,8 @@ public: size_t maxBucketSize, // The maximum size for a bucket to be used. size_t minFrequency, // Minimum number of minHash hits for a pair to be considered a candidate. size_t threadCount, - const MemoryMapped::Vector<KmerInfo>& kmerTable, const Reads& reads, - const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>&, + const MemoryMapped::VectorOfVectors<KmerId, uint64_t>& kmerIds, MemoryMapped::Vector<OrientedReadPair>&, MemoryMapped::Vector< array<uint64_t, 3> >& readLowHashStatistics, const string& largeDataFileNamePrefix, @@ -60,21 +59,12 @@ private: size_t maxBucketSize; // The maximum size for a bucket to be used. size_t minFrequency; // Minimum number of minHash hits for a pair to be considered a candidate. 
size_t threadCount; - const MemoryMapped::Vector<KmerInfo>& kmerTable; const Reads& reads; - const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers; + const MemoryMapped::VectorOfVectors<KmerId, uint64_t>& kmerIds; MemoryMapped::Vector< array<uint64_t, 3> > &readLowHashStatistics; const string& largeDataFileNamePrefix; size_t largeDataPageSize; - // Vectors containing only the k-mer ids of all markers - // for all oriented reads. - // Indexed by OrientedReadId.getValue(). - // This is used to speed up the computation of hash functions. - MemoryMapped::VectorOfVectors<KmerId, uint64_t> kmerIds; - void createKmerIds(); - void createKmerIds(size_t threadId); - // The current MinHash iteration. // This is used to compute a different MurmurHash function // at each iteration. @@ -181,12 +171,14 @@ private: // Compute a histogram of the number of entries in each histogram. - void computeBucketHistogram(); + void computeBucketHistogram(vector<uint64_t>& bucketHistogram); void computeBucketHistogramThreadFunction(size_t threadId); vector< vector<uint64_t> > threadBucketHistogram; ofstream histogramCsv; - + // Adjust minBucketSize and maxBucketSize based on the current + // bucket size histogram. + void adjustMinMaxBucketSizes(const vector<uint64_t>& bucketHistogram); // Thread functions. diff --git a/src/LowHash1.cpp b/src/LowHash1.cpp deleted file mode 100644 index 15ecd27..0000000 --- a/src/LowHash1.cpp +++ /dev/null @@ -1,685 +0,0 @@ -// Shasta. -#include "LowHash1.hpp" -#include "AlignmentCandidates.hpp" -#include "Marker.hpp" -#include "MurmurHash2.hpp" -using namespace shasta; - -// Standad library. -#include "algorithm.hpp" -#include "chrono.hpp" - -#include "MultithreadedObject.tpp" -template class MultithreadedObject<LowHash1>; - - -LowHash1::LowHash1( - size_t m, // Number of consecutive markers that define a feature. - double hashFraction, - size_t minHashIterationCount, // Number of minHash iterations. 
- size_t log2MinHashBucketCount, // Base 2 log of number of buckets for minHash. - size_t minBucketSize, // The minimum size for a bucket to be used. - size_t maxBucketSize, // The maximum size for a bucket to be used. - size_t minFrequency, // Minimum number of minHash hits for a pair to be considered a candidate. - size_t threadCountArgument, - const MemoryMapped::Vector<KmerInfo>& kmerTable, - const Reads& reads, - const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, - AlignmentCandidates& candidates, - const string& largeDataFileNamePrefix, - size_t largeDataPageSize - ) : - MultithreadedObject(*this), - m(m), - hashFraction(hashFraction), - minBucketSize(minBucketSize), - maxBucketSize(maxBucketSize), - minFrequency(minFrequency), - threadCount(threadCountArgument), - kmerTable(kmerTable), - reads(reads), - markers(markers), - candidates(candidates), - largeDataFileNamePrefix(largeDataFileNamePrefix), - largeDataPageSize(largeDataPageSize), - histogramCsv("LowHashBucketHistogram.csv") - -{ - cout << timestamp << "LowHash1 begins." << endl; - const auto tBegin = steady_clock::now(); - - // Adjust the numbers of threads, if necessary. - if(threadCount == 0) { - threadCount = std::thread::hardware_concurrency(); - } - - // Estimate the total number of low hashes and its base 2 log. - // Except for very short reads, each marker generates a feature, - // and each feature generates a low hash with probability hashFraction. - // So an estimate of the total number of hashes is: - const uint64_t totalLowHashCountEstimate = - uint64_t(hashFraction * double(markers.totalSize())); - const uint64_t leadingZeroBitCount = __builtin_clzl(totalLowHashCountEstimate); - const uint64_t log2TotalLowHashCountEstimate = 64 - leadingZeroBitCount; - - // If log2MinHashBucketCount is 0, choose a reasonable value - // for the current number of reads. - // Otherwise, check that log2MinHashBucketCount is not unreasonably small. 
- if(log2MinHashBucketCount == 0) { - log2MinHashBucketCount = 5 + log2TotalLowHashCountEstimate; - } else { - if(log2MinHashBucketCount < log2TotalLowHashCountEstimate) { - throw runtime_error("LowHash1: log2MinHashBucketCount is unreasonably small."); - } - } - - // Set the number of buckets and the corresponding mask. - const uint64_t bucketCount = 1ULL << log2MinHashBucketCount; - mask = bucketCount - 1; - cout << "LowHash1 algorithm will use 2^" << log2MinHashBucketCount; - cout << " = " << bucketCount << " buckets. "<< endl; - cout << "Estimated number of low hashes per iteration " << totalLowHashCountEstimate << endl; - cout << "Estimated load factor " << double(totalLowHashCountEstimate)/double(bucketCount) << endl; - - // Create vectors containing only the k-mer ids of all markers. - // This is used to speed up the computation of hash functions. - cout << timestamp << "Creating kmer ids for oriented reads." << endl; - createKmerIds(); - - // Compute the threshold for a hash value to be considered low. - hashThreshold = uint64_t(hashFraction * double(std::numeric_limits<uint64_t>::max())); - - // The number of oriented reads, each with its own vector of markers. - const OrientedReadId::Int orientedReadCount = OrientedReadId::Int(markers.size()); - const ReadId readCount = orientedReadCount / 2; - SHASTA_ASSERT(orientedReadCount == 2*readCount); - - // Set up work areas. - buckets.createNew( - largeDataFileNamePrefix.empty() ? "" : (largeDataFileNamePrefix + "tmp-LowHash-Buckets"), - largeDataPageSize); - lowHashes.resize(orientedReadCount); - threadCommonFeatures.resize(threadCount); - for(size_t threadId=0; threadId!=threadCount; threadId++) { - threadCommonFeatures[threadId] = make_shared<MemoryMapped::Vector<CommonFeature> >(); - threadCommonFeatures[threadId]->createNew( - largeDataFileNamePrefix.empty() ? 
"" : - (largeDataFileNamePrefix + "tmp-LowHash-ThreadCommonFeatures-" + to_string(threadId)), - largeDataPageSize); - } - - // Write the header of the histogram file. - histogramCsv << "Iteration,BucketSize,BucketCount,FeatureCount\n"; - - // LowHash iteration loop. - for(iteration=0; iteration<minHashIterationCount; iteration++) { - cout << timestamp << "LowHash iteration " << iteration << " begins." << endl; - - // Compute the low hashes for each oriented read - // and count the number of low hash features in each bucket. - buckets.clear(); - buckets.beginPass1(bucketCount); - size_t batchSize = 10000; - setupLoadBalancing(readCount, batchSize); - runThreads(&LowHash1::computeHashesThreadFunction, threadCount); - - // Fill the buckets. - buckets.beginPass2(); - setupLoadBalancing(readCount, batchSize); - runThreads(&LowHash1::fillBucketsThreadFunction, threadCount); - buckets.endPass2(false, false); - cout << "Load factor at this iteration " << - double(buckets.totalSize()) / double(buckets.size()) << endl; - computeBucketHistogram(); - - // Scan the buckets to find common features. - // Each thread stores the common features it finds in its own vector. - const uint64_t oldCommonFeatureCount = countTotalThreadCommonFeatures(); - batchSize = 10000; - setupLoadBalancing(bucketCount, batchSize); - runThreads(&LowHash1::scanBucketsThreadFunction, threadCount); - const uint64_t newCommonFeatureCount = countTotalThreadCommonFeatures(); - cout << "Stored " << newCommonFeatureCount-oldCommonFeatureCount << - " common features at this iteration." << endl; - } - - // Gather together all the common features found by all threads. - cout << timestamp << "Gathering common features found by all threads." << endl; - gatherCommonFeatures(); - cout << timestamp << "Total number of common features including duplicates is " << - commonFeatures.totalSize() << endl; - - // We no longer need the common features by thread. 
- for(size_t threadId=0; threadId!=threadCount; threadId++) { - threadCommonFeatures[threadId]->remove(); - threadCommonFeatures[threadId] = 0; - } - threadCommonFeatures.clear(); - - // Process the common features. - // For each orientedReadId0, we look at all the CommonFeatureInfo we have - // and sort them by orientedReadId1, then by ordinals, and remove duplicates. - // We then find groups of at least minFrequency common features involving the - // same pair(orientedReadId0, orientedReadId1) - cout << timestamp << "Processing the common features we found." << endl; - processCommonFeatures(); - - // Clean up. - buckets.remove(); - kmerIds.remove(); - lowHashes.clear(); - commonFeatures.remove(); - - // Done. - const auto tEnd = steady_clock::now(); - const double tTotal = seconds(tEnd - tBegin); - cout << timestamp << "LowHash1 completed in " << tTotal << " s." << endl; -} - - - -void LowHash1::createKmerIds() -{ - kmerIds.createNew( - largeDataFileNamePrefix.empty() ? "" : (largeDataFileNamePrefix + "tmp-LowHash-Markers"), - largeDataPageSize); - const ReadId orientedReadCount = ReadId(markers.size()); - const ReadId readCount = orientedReadCount / 2; - kmerIds.beginPass1(orientedReadCount); - for(ReadId readId=0; readId!=readCount; readId++) { - for(Strand strand=0; strand<2; strand++) { - const OrientedReadId orientedReadId(readId, strand); - const auto markerCount = markers.size(orientedReadId.getValue()); - kmerIds.incrementCount(orientedReadId.getValue(), markerCount); - } - } - kmerIds.beginPass2(); - kmerIds.endPass2(false); - const size_t batchSize = 10000; - setupLoadBalancing(readCount, batchSize); - runThreads(&LowHash1::createKmerIds, threadCount); -} - - - -// Thread function for createKmerIds. -void LowHash1::createKmerIds(size_t threadId) -{ - - // Loop over batches assigned to this thread. - uint64_t begin, end; - while(getNextBatch(begin, end)) { - - // Loop over reads assigned to this batch. 
- for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) { - for(Strand strand=0; strand<2; strand++) { - const OrientedReadId orientedReadId(readId, strand); - const auto orientedReadMarkers = markers[orientedReadId.getValue()]; - - SHASTA_ASSERT(kmerIds.size(orientedReadId.getValue()) == orientedReadMarkers.size()); - - auto pointer = kmerIds.begin(orientedReadId.getValue()); - for(const CompressedMarker& marker: orientedReadMarkers) { - *pointer++ = marker.kmerId; - } - } - } - } -} - - - -// Thread function to compute the low hashes for each oriented read -// and count the number of entries in each bucket. -void LowHash1::computeHashesThreadFunction(size_t threadId) -{ - const int featureByteCount = int(m * sizeof(KmerId)); - const uint64_t seed = iteration * 37; - - // Loop over batches assigned to this thread. - uint64_t begin, end; - while(getNextBatch(begin, end)) { - - // Loop over oriented reads assigned to this batch. - for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) { - if(reads.getFlags(readId).isPalindromic) { - continue; - } - for(Strand strand=0; strand<2; strand++) { - const OrientedReadId orientedReadId(readId, strand); - - vector< pair<uint64_t, uint32_t> >& orientedReadLowHashes = lowHashes[orientedReadId.getValue()]; - orientedReadLowHashes.clear(); - const size_t markerCount = kmerIds.size(orientedReadId.getValue()); - - // Handle the pathological case where there are fewer than m markers. - // This oriented read ends up in no bucket. - if(markerCount < m) { - continue; - } - - // Get the markers for this oriented read. - KmerId* kmerIdsPointer = kmerIds.begin(orientedReadId.getValue()); - const size_t featureCount = markerCount - m + 1; - - // Loop over features of this oriented read. - // Features are sequences of m consecutive markers. 
- for(size_t j=0; j<featureCount; j++, kmerIdsPointer++) { - const uint64_t hash = MurmurHash64A(kmerIdsPointer, featureByteCount, seed); - if(hash < hashThreshold) { - orientedReadLowHashes.push_back(make_pair(hash, j)); - const uint64_t bucketId = hash & mask; - buckets.incrementCountMultithreaded(bucketId); - } - } - } - } - } - -} - - - -// Thread function to fill the buckets. -void LowHash1::fillBucketsThreadFunction(size_t threadId) -{ - - // Loop over batches assigned to this thread. - uint64_t begin, end; - while(getNextBatch(begin, end)) { - - // Loop over oriented reads assigned to this batch. - for(ReadId readId=ReadId(begin); readId!=ReadId(end); readId++) { - if(reads.getFlags(readId).isPalindromic) { - continue; - } - for(Strand strand=0; strand<2; strand++) { - const OrientedReadId orientedReadId(readId, strand); - const vector< pair<uint64_t, uint32_t> > & orientedReadLowHashes = lowHashes[orientedReadId.getValue()]; - - for(const auto& p: orientedReadLowHashes) { - const uint64_t hash = p.first; - const uint64_t bucketId = hash & mask; - const uint32_t ordinal = p.second; - buckets.storeMultithreaded(bucketId, BucketEntry(orientedReadId, ordinal)); - } - } - } - } -} - - - -void LowHash1::computeBucketHistogram() -{ - threadBucketHistogram.clear(); - threadBucketHistogram.resize(threadCount); - const uint64_t batchSize = 10000; - setupLoadBalancing(buckets.size(), batchSize); - runThreads(&LowHash1::computeBucketHistogramThreadFunction, threadCount); - - // Combine the histograms found by each thread. 
- uint64_t largestBucketSize = 0; - for(const vector<uint64_t>& histogram: threadBucketHistogram) { - largestBucketSize = max(largestBucketSize, uint64_t(histogram.size())); - } - vector<uint64_t> bucketHistogram(largestBucketSize, 0); - for(const vector<uint64_t>& histogram: threadBucketHistogram) { - for(uint64_t bucketSize=0; bucketSize<histogram.size(); bucketSize++) { - bucketHistogram[bucketSize] += histogram[bucketSize]; - } - } - - for(uint64_t bucketSize=0; bucketSize<bucketHistogram.size(); bucketSize++) { - const uint64_t frequency = bucketHistogram[bucketSize]; - if(frequency) { - histogramCsv << - iteration << "," << - bucketSize << "," << - frequency << "," << - bucketSize*frequency << "\n"; - } - } - - -} -void LowHash1::computeBucketHistogramThreadFunction(size_t threadId) -{ - vector<uint64_t>& histogram = threadBucketHistogram[threadId]; - histogram.clear(); - uint64_t begin, end; - while(getNextBatch(begin, end)) { - for(uint64_t bucketId=begin; bucketId!=end; bucketId++) { - const uint64_t bucketSize = buckets.size(bucketId); - if(bucketSize >= histogram.size()) { - histogram.resize(bucketSize + 1, 0); - } - ++histogram[bucketSize]; - } - } -} - - - -// Thread function to scan the buckets to find common features. -void LowHash1::scanBucketsThreadFunction(size_t threadId) -{ - // Access the vector where this thread will store - // the common features it finds. - MemoryMapped::Vector<CommonFeature>& commonFeatures = *threadCommonFeatures[threadId]; - - const uint64_t mLocal = uint64_t(m); - - // Loop over batches assigned to this thread. - uint64_t begin, end; - while(getNextBatch(begin, end)) { - - // Loop over buckets in this batch. - for(uint64_t bucketId=begin; bucketId!=end; bucketId++) { - - // Access this bucket. - const span<BucketEntry> bucket = buckets[bucketId]; - if(bucket.size() < max(size_t(2), minBucketSize)) { - continue; - } - if(bucket.size() > maxBucketSize) { - continue; - } - - // Loop over pairs of bucket entries. 
- for(const BucketEntry& feature0: bucket) { - const OrientedReadId orientedReadId0 = feature0.orientedReadId; - const ReadId readId0 = orientedReadId0.getReadId(); - const Strand strand0 = orientedReadId0.getStrand(); - const uint32_t ordinal0 = feature0.ordinal; - const auto allKmerIds0 = kmerIds[orientedReadId0.getValue()]; - const auto featureKmerIds0 = allKmerIds0.begin() + ordinal0; - const uint32_t markerCount0 = uint32_t(allKmerIds0.size()); - - for(const BucketEntry& feature1: bucket) { - const OrientedReadId orientedReadId1 = feature1.orientedReadId; - const ReadId readId1 = orientedReadId1.getReadId(); - - // Only consider the ones where readId0 < readId1. - if(readId0 >= readId1) { - continue; - } - - const Strand strand1 = orientedReadId1.getStrand(); - const uint32_t ordinal1 = feature1.ordinal; - const auto allKmerIds1 = kmerIds[orientedReadId1.getValue()]; - const auto featureKmerIds1 = allKmerIds1.begin() + ordinal1; - const uint32_t markerCount1 = uint32_t(allKmerIds1.size()); - - // If the k-mers are not the same, this is a collision. Discard. - if(not std::equal(featureKmerIds0, featureKmerIds0+mLocal, featureKmerIds1)) { - continue; - } - - // We found a common feature. Store it. - // If read0 is on strand 1, we have to reverse the ordinals. - if(strand0 == 0) { - commonFeatures.push_back(CommonFeature( - readId0, - readId1, - strand0==strand1, - ordinal0, - ordinal1)); - } else { - commonFeatures.push_back(CommonFeature( - readId0, - readId1, - strand0==strand1, - markerCount0-1-ordinal0, - markerCount1-1-ordinal1)); - } - } - } - } - } -} - - -// Add up the number of common feature found by all threads. -uint64_t LowHash1::countTotalThreadCommonFeatures() const -{ - uint64_t n = 0; - for(const auto& v: threadCommonFeatures) { - n += v->size(); - } - return n; -} - - - -void LowHash1::gatherCommonFeatures() -{ - commonFeatures.createNew( - largeDataFileNamePrefix.empty() ? 
"" : (largeDataFileNamePrefix + "tmp-CommonFeatures"), - largeDataPageSize); - commonFeatures.beginPass1(kmerIds.size()/2); - runThreads(&LowHash1::gatherCommonFeaturesPass1, threadCount); - commonFeatures.beginPass2(); - runThreads(&LowHash1::gatherCommonFeaturesPass2, threadCount); - commonFeatures.endPass2(false); -} -void LowHash1::gatherCommonFeaturesPass1(size_t threadId) -{ - const MemoryMapped::Vector<CommonFeature>& v = *threadCommonFeatures[threadId]; - for(const CommonFeature& commonFeature: v) { - commonFeatures.incrementCountMultithreaded(commonFeature.orientedReadPair.readIds[0]); - } -} -void LowHash1::gatherCommonFeaturesPass2(size_t threadId) -{ - const MemoryMapped::Vector<CommonFeature>& v = *threadCommonFeatures[threadId]; - for(const CommonFeature& commonFeature: v) { - commonFeatures.storeMultithreaded( - commonFeature.orientedReadPair.readIds[0], - CommonFeatureInfo(commonFeature)); - } -} - - - -// Process the common features. -// For each readId0, we look at all the CommonFeatureInfo we have -// and sort them by readId1, then by ordinals, and remove duplicates. -// We then find groups of at least minFrequency common features involving the -// same pair(orientedReadId0, orientedReadId1) -// Each group generates an alignment candidate and the -// corresponding common features. -// Each thread stores the alignment candidates it finds in its own vector. -void LowHash1::processCommonFeatures() -{ - const uint64_t readCount = kmerIds.size() / 2; - const uint64_t batchSize = 1000; - - // Prepare areas where each thread will store what it finds. - threadCandidateTable.resize(readCount); - threadAlignmentCandidates.resize(threadCount); - threadCandidateHistogram.resize(threadCount); - - // Extract the candidates and features. - setupLoadBalancing(readCount, batchSize); - runThreads(&LowHash1::processCommonFeaturesThreadFunction, threadCount); - - - - // Gather the candidates and the features. 
- for(ReadId readId0=0; readId0<readCount; readId0++) { - - // Figure out where the candidates are stored. - const auto& info = threadCandidateTable[readId0]; - const uint64_t threadId = info[0]; - const uint64_t begin = info[1]; - const uint64_t end = info[2]; - - // Loop over all these candidates. - for(uint64_t i=begin; i!=end; ++i) { - const OrientedReadPair& orientedReadPair = - threadAlignmentCandidates[threadId]->candidates[i]; - SHASTA_ASSERT(orientedReadPair.readIds[0] == readId0); - candidates.candidates.push_back(orientedReadPair); - const auto features = threadAlignmentCandidates[threadId]->featureOrdinals[i]; - candidates.featureOrdinals.appendVector(features.begin(), features.end()); - } - } - SHASTA_ASSERT(candidates.candidates.size() == candidates.featureOrdinals.size()); - cout << timestamp << "Found " << candidates.candidates.size() << - " alignment candidates with a total " << - candidates.featureOrdinals.totalSize() << - " features." << endl; - - - - // Combine the histograms found by each thread. - for(size_t threadId=0; threadId!=threadCount; threadId++) { - const vector<uint64_t>& v = threadCandidateHistogram[threadId]; - for(uint64_t i=0; i<v.size(); i++){ - const uint64_t n = v[i]; - if(n > 0) { - if(candidateHistogram.size() <= n){ - candidateHistogram.resize(n+1, 0); - } - candidateHistogram[i] += n; - } - } - } - ofstream csv("LowHashCandidateHistogram.csv"); - csv << "CommonFeatureCount,Frequency\n"; - for(uint64_t i=0; i<candidateHistogram.size(); i++) { - const uint64_t n = candidateHistogram[i]; - if(n > 0) { - csv << i << "," << n << "\n"; - } - } - - - - // Clean up. 
- threadCandidateTable.clear(); - for(size_t threadId=0; threadId<threadCount; threadId++) { - threadAlignmentCandidates[threadId]->candidates.remove(); - threadAlignmentCandidates[threadId]->featureOrdinals.remove(); - } - threadAlignmentCandidates.clear(); -} - - - -void LowHash1::processCommonFeaturesThreadFunction(size_t threadId) -{ - // Access the vector where this thread will store - // the alignment candidates it finds. - threadAlignmentCandidates[threadId] = make_shared<AlignmentCandidates>(); - AlignmentCandidates& alignmentCandidates = *threadAlignmentCandidates[threadId]; - alignmentCandidates.candidates.createNew( - largeDataFileNamePrefix.empty() ? "" : - (largeDataFileNamePrefix + "tmp-ThreadAlignmentCandidates-" + to_string(threadId)), - largeDataPageSize); - alignmentCandidates.featureOrdinals.createNew( - largeDataFileNamePrefix.empty() ? "" : - (largeDataFileNamePrefix + "tmp-ThreadAlignmentCandidatesOrdinals-" + to_string(threadId)), - largeDataPageSize); - vector<uint64_t>& histogram = threadCandidateHistogram[threadId]; - - // Loop over all batches assigned to this thread. - uint64_t begin, end; - while(getNextBatch(begin, end)) { - - // Loop over ReadId's in this batch. - for(ReadId readId0=ReadId(begin); readId0!=ReadId(end); readId0++) { - // std::lock_guard<std::mutex> lock(mutex); // ************************** TAKE OUT! - // cout << "Working on readId0 " << readId0 << endl; - const span<CommonFeatureInfo> features = commonFeatures[readId0]; - threadCandidateTable[readId0][0] = uint64_t(threadId); - threadCandidateTable[readId0][1] = alignmentCandidates.candidates.size();; - - /* - cout << features.size() << " features before deduplication:" << endl; - for(auto it=features.begin(); it!=features.end(); ++it) { - const CommonFeatureInfo& feature = *it; - cout << - feature.readId1 << " " << - (feature.isSameStrand ? 
"same strand " : " opposite strands ") << - feature.ordinals[0] << " " << - feature.ordinals[1] << " " << - int32_t(feature.ordinals[1]) - int32_t(feature.ordinals[0]) << "\n"; - } - */ - - // Deduplicate. - const auto uniqueBegin = features.begin(); - auto uniqueEnd = features.end(); - sort(uniqueBegin, uniqueEnd); - uniqueEnd = unique(uniqueBegin, uniqueEnd); - - /* - cout << uniqueEnd-uniqueBegin << " features after deduplication:" << endl; - for(auto it=uniqueBegin; it!=uniqueEnd; ++it) { - const CommonFeatureInfo& feature = *it; - cout << - feature.readId1 << " " << - (feature.isSameStrand ? "same strand " : " opposite strands ") << - feature.ordinals[0] << " " << - feature.ordinals[1] << " " << - int32_t(feature.ordinals[1]) - int32_t(feature.ordinals[0]) << "\n"; - } - */ - - // Loop over streaks of features with the same readId1 and isSameStrand. - for(auto it=uniqueBegin; it!=uniqueEnd;) { - auto streakBegin = it; - auto streakEnd = streakBegin; - const ReadId readId1 = streakBegin->readId1; - const bool isSameStrand = streakBegin->isSameStrand; - while(streakEnd!=uniqueEnd and streakEnd->readId1==readId1 and streakEnd->isSameStrand==isSameStrand) { - ++streakEnd; - } - - // Increment the histogram. - const int64_t streakLength = streakEnd - streakBegin; - if(histogram.size() <= uint64_t(streakLength)) { - histogram.resize(streakLength + 1, 0); - } - ++histogram[streakLength]; - - // If too few, skip. - if(streakLength < int64_t(minFrequency)) { - it = streakEnd; - continue; - } - - /* - cout << "Common features of reads " << - readId0 << " " << - readId1 << (isSameStrand ? 
" same strand" : " opposite strands") << ":\n"; - for(auto it=streakBegin; it!=streakEnd; ++it) { - const CommonFeatureInfo& feature = *it; - cout << - feature.ordinals[0] << " " << - feature.ordinals[1] << " " << - int32_t(feature.ordinals[1]) - int32_t(feature.ordinals[0]) << "\n"; - } - cout << "Marker count " << - kmerIds[OrientedReadId(readId0, 0).getValue()].size() << " " << - kmerIds[OrientedReadId(readId1, 0).getValue()].size() << ":\n"; - */ - - // This streak generates an alignment candidate - // and the corresponding common features. - alignmentCandidates.candidates.push_back(OrientedReadPair(readId0, readId1, isSameStrand)); - alignmentCandidates.featureOrdinals.appendVector(); - for(auto it=streakBegin; it!=streakEnd; ++it) { - const CommonFeatureInfo& feature = *it; - alignmentCandidates.featureOrdinals.append(feature.ordinals); - } - - // Prepare for the next streak. - it = streakEnd; - } - threadCandidateTable[readId0][2] = alignmentCandidates.candidates.size();; - } - } -} diff --git a/src/LowHash1.hpp b/src/LowHash1.hpp deleted file mode 100644 index 88548d6..0000000 --- a/src/LowHash1.hpp +++ /dev/null @@ -1,224 +0,0 @@ -#ifndef SHASTA_LOW_HASH1_HPP -#define SHASTA_LOW_HASH1_HPP - -// Shasta -#include "Kmer.hpp" -#include "MemoryMappedVectorOfVectors.hpp" -#include "MultithreadedObject.hpp" -#include "OrientedReadPair.hpp" -#include "Reads.hpp" - -// Standard library. -#include "fstream.hpp" -#include "memory.hpp" - -namespace shasta { - class AlignmentCandidates; - class LowHash1; - class CompressedMarker; - class OrientedReadPair; - - extern template class MultithreadedObject<LowHash1>; -} - - -// This class uses the LowHash algorithm to find candidate pairs of aligned reads. -// It uses as features sequences of m consecutive markers. -// This is the new version that also stores alignmentCandidates.featureOrdinals -class shasta::LowHash1 : - public MultithreadedObject<LowHash1> { -public: - - // The constructor does all the work. 
- LowHash1( - size_t m, // Number of consecutive markers that define a feature. - double hashFraction, - size_t minHashIterationCount, // Number of minHash iterations. - size_t log2MinHashBucketCount, // Base 2 log of number of buckets for minHash. - size_t minBucketSize, // The minimum size for a bucket to be used. - size_t maxBucketSize, // The maximum size for a bucket to be used. - size_t minFrequency, // Minimum number of minHash hits for a pair to be considered a candidate. - size_t threadCount, - const MemoryMapped::Vector<KmerInfo>& kmerTable, - const Reads& reads, - const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>&, - AlignmentCandidates& candidates, - const string& largeDataFileNamePrefix, - size_t largeDataPageSize - ); - -private: - - // Store some of the arguments passed to the constructor. - size_t m; // Number of consecutive markers that define a feature. - double hashFraction; - size_t minBucketSize; // The minimum size for a bucket to be used. - size_t maxBucketSize; // The maximum size for a bucket to be used. - size_t minFrequency; // Minimum number of minHash hits for a pair to be considered a candidate. - size_t threadCount; - const MemoryMapped::Vector<KmerInfo>& kmerTable; - const Reads& reads; - const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers; - AlignmentCandidates& candidates; - const string& largeDataFileNamePrefix; - size_t largeDataPageSize; - - // Vectors containing only the k-mer ids of all markers - // for all oriented reads. - // Indexed by OrientedReadId.getValue(). - // This is used to speed up the computation of hash functions. - MemoryMapped::VectorOfVectors<KmerId, uint64_t> kmerIds; - void createKmerIds(); - void createKmerIds(size_t threadId); - - // The mask used to compute to compute the bucket - // corresponding to a hash value. - uint64_t mask; - - // The threshold for a hash value to be considered low. - uint64_t hashThreshold; - - // The current MinHash iteration. 
- // This is used to compute a different MurmurHash function - // at each iteration. - size_t iteration; - - // The low hashes of each oriented read and the ordinals at - // which the corresponding feature occurs. - // This is recomputed at each iteration. - // Indexed by OrientedReadId::getValue(). - vector< vector< pair<uint64_t, uint32_t> > > lowHashes; - void computeLowHashes(size_t threadId); - - // Each bucket entry describes a low hash feature. - // It consists of an oriented read id and - // the ordinal where the low hash feature appears. - class BucketEntry { - public: - OrientedReadId orientedReadId; - uint32_t ordinal; - BucketEntry( - OrientedReadId orientedReadId, - uint32_t ordinal) : - orientedReadId(orientedReadId), - ordinal(ordinal) {} - BucketEntry() {} - }; - MemoryMapped::VectorOfVectors<BucketEntry, uint64_t> buckets; - - - // Compute a histogram of the number of entries in each histogram. - void computeBucketHistogram(); - void computeBucketHistogramThreadFunction(size_t threadId); - vector< vector<uint64_t> > threadBucketHistogram; - ofstream histogramCsv; - - - - // When two oriented reads appear in the same bucket, we - // check if that happens by chance or because we found a - // common feature between the two oriented reads. - // In the latter case, we store a new CommonFeature - // containing the OrientedReadIdPair and - // the ordinals where the feature appears. - // Note that the OrientedReadIdPair is interpreted - // with readId0 on strand 0 and readId1 on the strand - // implied by isSameStrand. - // This means that if we encounter the common feature - // with readId0 on strand 1 we have to reverse the - // strands and adjust the ordinals. - // Each thread stores into its own vector of common features. - // We only store common features with readId0<readId1. 
- class CommonFeature { - public: - OrientedReadPair orientedReadPair; - array<uint32_t, 2> ordinals; - CommonFeature() {} - CommonFeature( - ReadId readId0, - ReadId readId1, - bool isSameStrand, - uint32_t ordinal0, - uint32_t ordinal1 - ) : - orientedReadPair(readId0, readId1, isSameStrand), - ordinals({ordinal0, ordinal1}) - {} - }; - vector< shared_ptr<MemoryMapped::Vector<CommonFeature> > > threadCommonFeatures; - uint64_t countTotalThreadCommonFeatures() const; - - - - // The common features found by each thread are stored together, - // segregated by the first ReadId, readId0. - // This vector of vectors is indexed by readId0. - // That is, commonFeatures[readId0] - // is a vector contaiOrientedReadId is readId0. - class CommonFeatureInfo { - public: - ReadId readId1; - array<uint32_t, 2> ordinals; - bool isSameStrand; - CommonFeatureInfo() {} - CommonFeatureInfo(const CommonFeature& commonFeature) : - readId1(commonFeature.orientedReadPair.readIds[1]), - ordinals(commonFeature.ordinals), - isSameStrand(commonFeature.orientedReadPair.isSameStrand) - {} - bool operator<(const CommonFeatureInfo& that) const { - return tie(readId1, isSameStrand, ordinals) < tie(that.readId1, that.isSameStrand, that.ordinals); - } - bool operator==(const CommonFeatureInfo& that) const { - return tie(readId1, isSameStrand, ordinals) == tie(that.readId1, that.isSameStrand, that.ordinals); - } - }; - MemoryMapped::VectorOfVectors<CommonFeatureInfo, uint64_t> commonFeatures; - void gatherCommonFeatures(); - void gatherCommonFeaturesPass1(size_t threadId); - void gatherCommonFeaturesPass2(size_t threadId); - - - - // Process the common features. - // For each readId0, we look at all the CommonFeatureInfo we have - // and sort them by readId1, then by ordinals, and remove duplicates. - // We then find groups of at least minFrequency common features involving the - // same pair(orientedReadId0, orientedReadId1). 
- // Each group generates an alignment candidate and the - // corresponding common features. - // Each thread stores the alignment candidates it finds in its own vector. - void processCommonFeatures(); - void processCommonFeaturesThreadFunction(size_t threadId); - - // Alignment candidates found by each thread. - vector< shared_ptr<AlignmentCandidates> > threadAlignmentCandidates; - - // A table used to gather threadAlignmentCandidates in order - // of increasing readId0. Indexed by readId0, gives - // (thread, begin, end) for the candidates for which the first read is readId0. - vector< array<uint64_t, 3> > threadCandidateTable; - - - // During processCommonFeatures, we also create a histogram that tells us - // how many (readId0, readId1) pairs with exactly n common features were found. - // Only the pairs with n>=minFrquency generate an alignment candidate. - vector<uint64_t> candidateHistogram; - vector< vector<uint64_t> > threadCandidateHistogram; - - - - // Thread functions. - - // Thread function to compute the low hashes for each oriented read - // and count the number of entries in each bucket. - void computeHashesThreadFunction(size_t threadId); - - // Thread function to fill the buckets. - void fillBucketsThreadFunction(size_t threadId); - - // Thread function to scan the buckets to find common features. - void scanBucketsThreadFunction(size_t threadId); -}; - -#endif diff --git a/src/MappedMemoryOwner.hpp b/src/MappedMemoryOwner.hpp new file mode 100644 index 0000000..7caee88 --- /dev/null +++ b/src/MappedMemoryOwner.hpp @@ -0,0 +1,45 @@ +#ifndef SHASTA_MAPPED_MEMORY_OWNER_HPP +#define SHASTA_MAPPED_MEMORY_OWNER_HPP + +#include "cstdint.hpp" +#include "string.hpp" + +namespace shasta { + class MappedMemoryOwner; +} + + + +class shasta::MappedMemoryOwner { +public: + + string largeDataFileNamePrefix; + uint64_t largeDataPageSize; + + // Function to construct names for binary objects. 
+ // The output can be passed to createNew or accessExisting + // member functions of MemoryMapped objects. + string largeDataName(const string& name) const + { + if(largeDataFileNamePrefix.empty()) { + return ""; // Anonymous; + } else { + return largeDataFileNamePrefix + name; + } + } + + MappedMemoryOwner() {} + MappedMemoryOwner(const MappedMemoryOwner&) = default; + + template<class T> void createNew(T& t, const string& name) + { + t.createNew(largeDataName(name), largeDataPageSize); + } + template<class T> void accessExistingReadOnly(T& t, const string& name) + { + t.accessExistingReadOnly(largeDataName(name)); + } +}; + + +#endif diff --git a/src/Marker.hpp b/src/Marker.hpp index 3a18dc6..2b9fd5c 100644 --- a/src/Marker.hpp +++ b/src/Marker.hpp @@ -10,12 +10,6 @@ and never changed, and selected in such a way that, if (and only if) a k-mer is a marker, its reverse complement is also a marker. -The k-mer table is a vector of 4^k KmerInfo object, -indexed by k-mer id as computed using Kmer::id(k). -Because of the way markers are selected, the following is -true for all permitted values of i, 0 <= i < 4^k: -kmerTable[i].isMarker == kmerTable[kmerTable[i].reverseComplementKmerId].isMarker - *******************************************************************************/ #include "Kmer.hpp" @@ -38,35 +32,14 @@ namespace shasta { -// Markers in shared memory are stored using class CompressedMarker -// which requires only 5 bytes per marker. - -// For a run with 120 Gb of coverage and 10% of k-mers -// used as markers, storing all the 24 G markers requires -// 120 GB (we store markers for each read on both strands). -// This compares with 30 GB to store the reads -// (we store reads on one strand only). - -// This layout results in unaligned memory accesses. 
-// This is not a problem as modern processors (beginning with Nehalem) -// have a much lower performance penalty for unaligned memory access -// than older processors did: -// http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.455.4198&rep=rep1&type=pdf - +// Markers in shared memory are stored using class CompressedMarker. class shasta::CompressedMarker { public: - // The id of the k-mer for this marker. - KmerId kmerId __attribute__ ((packed)); - // The position of this marker in the oriented read. // This limits the length of a read to 2^24=16Mib bases. Uint24 position; - }; -static_assert(sizeof(shasta::CompressedMarker) == - sizeof(shasta::KmerId) + sizeof(shasta::Uint24), - "Unexpected size of class CompressedMarker."); @@ -81,11 +54,8 @@ public: // The position of this marker in the oriented read. uint32_t position; - // Constructor from a CompressedMarker. - Marker(const CompressedMarker& compressedMarker) : - kmerId(compressedMarker.kmerId), - position(compressedMarker.position) - {} + Marker(KmerId kmerId, uint32_t position) : + kmerId(kmerId), position(position) {} // Default constructor. Marker() {} @@ -100,11 +70,8 @@ class shasta::MarkerWithOrdinal : public Marker { public: uint32_t ordinal; - // Constructor from a marker and an ordinal. - MarkerWithOrdinal(const Marker& marker, uint32_t ordinal) : - Marker(marker), - ordinal(ordinal) - {} + MarkerWithOrdinal(KmerId kmerId, uint32_t position, uint32_t ordinal) : + Marker(kmerId, position), ordinal(ordinal) {} // Default constructor. MarkerWithOrdinal() {} diff --git a/src/MarkerFinder.cpp b/src/MarkerFinder.cpp index f919e07..d46bad0 100644 --- a/src/MarkerFinder.cpp +++ b/src/MarkerFinder.cpp @@ -1,6 +1,7 @@ // shasta. 
#include "MarkerFinder.hpp" #include "LongBaseSequence.hpp" +#include "KmerChecker.hpp" #include "performanceLog.hpp" #include "ReadId.hpp" #include "timestamp.hpp" @@ -15,13 +16,13 @@ template class MultithreadedObject<MarkerFinder>; MarkerFinder::MarkerFinder( size_t k, - const MemoryMapped::Vector<KmerInfo>& kmerTable, + const KmerChecker& kmerChecker, const Reads& reads, MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, size_t threadCountArgument) : MultithreadedObject(*this), k(k), - kmerTable(kmerTable), + kmerChecker(kmerChecker), reads(reads), markers(markers), threadCount(threadCountArgument) @@ -83,19 +84,17 @@ void MarkerFinder::threadFunction(size_t threadId) } for(uint32_t position=0; /*The check is done later */; position++) { const KmerId kmerId = KmerId(kmer.id(k)); - if(kmerTable[kmerId].isMarker) { + if(kmerChecker.isMarker(kmerId)) { // This k-mer is a marker. if(pass == 1) { ++markerCount; } else { // Strand 0. - markerPointerStrand0->kmerId = kmerId; markerPointerStrand0->position = position; ++markerPointerStrand0; // Strand 1. - markerPointerStrand1->kmerId = kmerTable[kmerId].reverseComplementedKmerId; markerPointerStrand1->position = uint32_t(read.baseCount - k - position); --markerPointerStrand1; diff --git a/src/MarkerFinder.hpp b/src/MarkerFinder.hpp index fedda4c..dcdbff3 100644 --- a/src/MarkerFinder.hpp +++ b/src/MarkerFinder.hpp @@ -8,6 +8,7 @@ namespace shasta { class MarkerFinder; class LongBaseSequences; + class KmerChecker; namespace MemoryMapped { template<class T> class Vector; @@ -26,7 +27,7 @@ public: // The constructor does all the work. MarkerFinder( size_t k, - const MemoryMapped::Vector<KmerInfo>& kmerTable, + const KmerChecker&, const Reads& reads, MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, size_t threadCount); @@ -35,7 +36,7 @@ private: // The arguments passed to the constructor. 
size_t k; - const MemoryMapped::Vector<KmerInfo>& kmerTable; + const KmerChecker& kmerChecker; const Reads& reads; MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers; size_t threadCount; diff --git a/src/MarkerGraph.cpp b/src/MarkerGraph.cpp index 8d4577b..28c1b12 100644 --- a/src/MarkerGraph.cpp +++ b/src/MarkerGraph.cpp @@ -1,6 +1,10 @@ // Shasta. #include "MarkerGraph.hpp" #include "Coverage.hpp" +#include "deduplicate.hpp" +#include "findMarkerId.hpp" +#include "invalid.hpp" +#include "markerAccessFunctions.hpp" using namespace shasta; // Standard library. @@ -113,6 +117,20 @@ uint64_t MarkerGraph::outDegree(VertexId vertexId) const +void MarkerGraph::Edge::writeFlags(ostream& s) const +{ + s << "wasRemovedByTransitiveReduction " << int(wasRemovedByTransitiveReduction) << "\n"; + s << "wasPruned " << int(wasPruned) << "\n"; + s << "isSuperBubbleEdge " << int(isSuperBubbleEdge) << "\n"; + s << "isLowCoverageCrossEdge " << int(isLowCoverageCrossEdge) << "\n"; + s << "wasAssembled " << int(wasAssembled) << "\n"; + s << "isSecondary " << int(isSecondary) << "\n"; + s << "wasRemovedWhileSplittingSecondaryEdges " << int(wasRemovedWhileSplittingSecondaryEdges) << "\n"; + s << flush; +} + + + MarkerGraph::EdgeId MarkerGraph::getFirstNonRemovedOutEdge( MarkerGraph::VertexId vertexId) const { @@ -596,3 +614,499 @@ void MarkerGraph::createVerticesFromVertexTableThreadFunction4(size_t threadId) } + +// Find the common KmerId for all the markers of a marker graph vertex. +KmerId MarkerGraph::getVertexKmerId( + MarkerGraphVertexId vertexId, + uint64_t k, + const Reads& reads, + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers + ) const +{ + // Get it from the first marker on this vertex. + const MarkerId markerId = getVertexMarkerIds(vertexId)[0]; + + // Find the OrientedReadId. + // This is slow as it requires a binary search in the markers toc. 
+ OrientedReadId orientedReadId; + uint32_t ordinal; + tie(orientedReadId, ordinal) = findMarkerId(markerId, markers); + + return getOrientedReadMarkerKmerId( + orientedReadId, + ordinal, + k, + reads, + markers + ); +} + + + +// Find the edge that contains a given MarkerInterval. +MarkerGraphEdgeId MarkerGraph::locateMarkerInterval( + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, + const MarkerInterval& markerInterval) const +{ + const OrientedReadId orientedReadId = markerInterval.orientedReadId; + const uint64_t firstOrientedReadMarkerId = + markers.begin(orientedReadId.getValue()) - markers.begin(); + + // Now locate this marker interval. + const uint64_t markerId0 = firstOrientedReadMarkerId + markerInterval.ordinals[0]; + const uint64_t markerId1 = firstOrientedReadMarkerId + markerInterval.ordinals[1]; + const MarkerGraphVertexId vertexId0 = vertexTable[markerId0]; + const MarkerGraphVertexId vertexId1 = vertexTable[markerId1]; + + for(const auto edgeId: edgesBySource[vertexId0]) { + if(edges[edgeId].target != vertexId1) { + continue; + } + const auto markerIntervals = edgeMarkerIntervals[edgeId]; + if(find(markerIntervals.begin(), markerIntervals.end(), markerInterval) != + markerIntervals.end()) { + return edgeId; + } + } + + return invalid<MarkerGraphEdgeId>; +} + + +// Apply an ordinal offset in the specified direction to a given MarkerInterval +// and find the edge that contains the offset MarkerInterval. +// This assumes that we have the complete marker graph. +MarkerGraphEdgeId MarkerGraph::locateMarkerIntervalWithOffset( + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, + MarkerInterval markerInterval, + uint32_t ordinalOffset, + uint64_t direction // 0=forward, 1=backward. 
+ ) const +{ + const OrientedReadId orientedReadId = markerInterval.orientedReadId; + const uint64_t firstOrientedReadMarkerId = + markers.begin(orientedReadId.getValue()) - markers.begin(); + + // Construct the offset MarkerInterval. + // If we end up outside the oriented read, return invalid<MarkerGraphEdgeId>. + if(direction == 0) { + markerInterval.ordinals[0] += ordinalOffset; + markerInterval.ordinals[1] += ordinalOffset; + if(markerInterval.ordinals[1] >= markers.size(orientedReadId.getValue())) { + return invalid<MarkerGraphEdgeId>; + } + } else { + if(ordinalOffset > markerInterval.ordinals[0]) { + return invalid<MarkerGraphEdgeId>; + } + markerInterval.ordinals[0] -= ordinalOffset; + markerInterval.ordinals[1] -= ordinalOffset; + } + SHASTA_ASSERT(markerInterval.ordinals[1] == markerInterval.ordinals[0] + 1); + + + // Now locate this marker interval. + const uint64_t markerId0 = firstOrientedReadMarkerId + markerInterval.ordinals[0]; + const uint64_t markerId1 = firstOrientedReadMarkerId + markerInterval.ordinals[1]; + const MarkerGraphVertexId vertexId0 = vertexTable[markerId0]; + const MarkerGraphVertexId vertexId1 = vertexTable[markerId1]; + + for(const auto edgeId: edgesBySource[vertexId0]) { + if(edges[edgeId].target != vertexId1) { + continue; + } + const auto markerIntervals = edgeMarkerIntervals[edgeId]; + if(find(markerIntervals.begin(), markerIntervals.end(), markerInterval) != + markerIntervals.end()) { + return edgeId; + } + } + + // If this happens, we don't have a complete marker graph. + SHASTA_ASSERT(0); +} + + + +// Find out if an edge has duplicate oriented reads +// in its MarkerIntervals. 
+bool MarkerGraph::edgeHasDuplicateOrientedReadIds(EdgeId edgeId) const +{ + const auto markerIntervals = edgeMarkerIntervals[edgeId]; + if(markerIntervals.size() < 2) { + return false; + } + for(uint64_t i=1; i<markerIntervals.size(); i++) { + if(markerIntervals[i-1].orientedReadId == markerIntervals[i].orientedReadId) { + return true; + } + } + + return false; +} + + + +// Find out if a vertex has more than one marker on the same oriented read. +bool MarkerGraph::vertexHasDuplicateOrientedReadIds( + VertexId vertexId, + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers) const +{ + const span<const MarkerId> vertexMarkerIds = vertices()[vertexId]; + if(vertexMarkerIds.size() < 2) { + return false; + } + + // The markers are sorted, so we only have to check each marker + // against the previous one. + // This could be done faster but is not performance critical. + for(uint64_t i=1; i<vertexMarkerIds.size(); i++) { + const MarkerId markerId0 = vertexMarkerIds[i-1]; + const MarkerId markerId1 = vertexMarkerIds[i]; + OrientedReadId orientedReadId0; + OrientedReadId orientedReadId1; + tie(orientedReadId0, ignore) = findMarkerId(markerId0, markers); + tie(orientedReadId1, ignore) = findMarkerId(markerId1, markers); + if(orientedReadId0 == orientedReadId1) { + return true; + } + } + + return false; +} + + + +// Flag primary edges (only used for Mode 3 assembly). +void MarkerGraph::flagPrimaryEdges( + uint64_t minPrimaryCoverage, + uint64_t maxPrimaryCoverage, + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, + uint64_t threadCount) +{ + SHASTA_ASSERT(disjointSetsHistogram.isOpen); + + // If minPrimaryCoverage and maxPrimaryCoverage are both 0, + // use the disjoint sets histogram and simple heuristics to choose + // appropriate values. + if((minPrimaryCoverage == 0) and (maxPrimaryCoverage == 0)) { + + // Set minPrimaryCoverage to the first value where the + // disjointSetsHistogram starts increasing. 
+ bool done = false; + uint64_t frequencyAtMinPrimaryCoverage = 0; + for(uint64_t i=1; i<disjointSetsHistogram.size(); i++) { + const uint64_t coverage = disjointSetsHistogram[i].first; + const uint64_t frequency = disjointSetsHistogram[i].second; + const uint64_t previousCoverage = disjointSetsHistogram[i-1].first; + const uint64_t previousFrequency = disjointSetsHistogram[i-1].second; + if( + (coverage != previousCoverage+1) // Frequency at coverage-1 is zero, so the histogram went up. + or + frequency > previousFrequency // The histogram went up. + ) { + minPrimaryCoverage = coverage; + frequencyAtMinPrimaryCoverage = frequency; + done = true; + break; + } + } + SHASTA_ASSERT(done); + + // Set maxPrimaryCoverage to the last coverage with frequency + // at least equal to frequencyAtMinPrimaryCoverage. + done = false; + for(uint64_t i=disjointSetsHistogram.size()-1; i>0; i--) { + const uint64_t coverage = disjointSetsHistogram[i].first; + const uint64_t frequency = disjointSetsHistogram[i].second; + if(frequency >= frequencyAtMinPrimaryCoverage) { + maxPrimaryCoverage = coverage; + done= true; + break; + } + } + SHASTA_ASSERT(done); + + cout << "Automatically set: minPrimaryCoverage = " << minPrimaryCoverage << + ", maxPrimaryCoverage = " << maxPrimaryCoverage << endl; + } + + + + // Store the arguments so the threads can see them. + flagPrimaryEdgesData.minPrimaryCoverage = minPrimaryCoverage; + flagPrimaryEdgesData.maxPrimaryCoverage = maxPrimaryCoverage; + flagPrimaryEdgesData.markersPointer = &markers; + + // Adjust the numbers of threads, if necessary. + if(threadCount == 0) { + threadCount = std::thread::hardware_concurrency(); + } + + // Clear the flags on all edges. + for(Edge& edge: edges) { + edge.isPrimary = 0; + } + + // Multithreaded code to flag primary edges. 
+ const uint64_t batchCount = 10000; + setupLoadBalancing(edges.size(), batchCount); + runThreads(&MarkerGraph::flagPrimaryEdgesThreadFunction, threadCount); + + uint64_t primaryEdgeCount = 0; + for(Edge& edge: edges) { + if(edge.isPrimary == 1) { + ++primaryEdgeCount; + } + } + cout << "Found " << primaryEdgeCount << + " primary marker graph edges out of " << edges.size() << " total." << endl; +} + + + +void MarkerGraph::flagPrimaryEdgesThreadFunction(uint64_t threadId) +{ + const uint64_t minPrimaryCoverage = flagPrimaryEdgesData.minPrimaryCoverage; + const uint64_t maxPrimaryCoverage = flagPrimaryEdgesData.maxPrimaryCoverage; + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers = + *flagPrimaryEdgesData.markersPointer; + + uint64_t begin, end; + while(getNextBatch(begin, end)) { + for(EdgeId edgeId=begin; edgeId!=end; ++edgeId) { + if(isPrimaryEdge(edgeId, minPrimaryCoverage, maxPrimaryCoverage, markers)) { + edges[edgeId].isPrimary = 1; + } + } + } +} + + + +// Find out if a marker graph edge is a primary edge. +// Only used for Mode 3 assembly. +bool MarkerGraph::isPrimaryEdge( + MarkerGraphEdgeId edgeId, + uint64_t minPrimaryCoverage, + uint64_t maxPrimaryCoverage, + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers) const +{ + // Check coverage. + const uint64_t coverage = edgeCoverage(edgeId); + if(coverage < minPrimaryCoverage) { + return false; + } + if(coverage > maxPrimaryCoverage) { + return false; + } + + // Check for duplicate oriented reads on the edge. + if(edgeHasDuplicateOrientedReadIds(edgeId)) { + return false; + } + + // Check for duplicate oriented reads on its vertices. + const MarkerGraph::Edge& edge = edges[edgeId]; + if( + vertexHasDuplicateOrientedReadIds(edge.source, markers) or + vertexHasDuplicateOrientedReadIds(edge.target, markers)) { + return false; + } + + // If all above checks passed, this is a primary edge. 
+ return true; +} + + + +#if 0 +void MarkerGraph::createPrimaryJourneys( + uint64_t orientedReadCount, + uint64_t threadCount) +{ + // Adjust the numbers of threads, if necessary. + if(threadCount == 0) { + threadCount = std::thread::hardware_concurrency(); + } + + primaryJourneys.clear(); + + const uint64_t batchCount = 100; + + primaryJourneys.beginPass1(orientedReadCount); + setupLoadBalancing(edges.size(), batchCount); + runThreads(&MarkerGraph::createPrimaryJourneysThreadFunction1, threadCount); + primaryJourneys.beginPass2(); + setupLoadBalancing(edges.size(), batchCount); + runThreads(&MarkerGraph::createPrimaryJourneysThreadFunction2, threadCount); + primaryJourneys.endPass2(false, true); + setupLoadBalancing(orientedReadCount, 1); + runThreads(&MarkerGraph::createPrimaryJourneysThreadFunction3, threadCount); + + cout << "Found " << primaryJourneys.totalSize() << + " marker graph primary journey entries for " << orientedReadCount << + " oriented reads." << endl; + cout << "Average number of marker graph primary journey entries per oriented read is " << + double(primaryJourneys.totalSize()) / double(orientedReadCount) << endl; + + writePrimaryJourneys(); +} + + + +void MarkerGraph::writePrimaryJourneys() +{ + const uint64_t orientedReadCount = primaryJourneys.size(); + + ofstream csv("MarkerGraphPrimaryJourneys.csv"); + + for(ReadId readId=0; readId<orientedReadCount/2; readId++) { + for(Strand strand=0; strand<2; strand++) { + const OrientedReadId orientedReadId(readId, strand); + csv << orientedReadId << ","; + for(const auto& primaryJourneyEntry: primaryJourneys[orientedReadId.getValue()]) { + csv << primaryJourneyEntry.edgeId << ","; + } + csv << "\n"; + } + } +} + + + +void MarkerGraph::createPrimaryJourneysThreadFunction1(uint64_t threadId) +{ + createPrimaryJourneysThreadFunction12(1); +} + + + +void MarkerGraph::createPrimaryJourneysThreadFunction2(uint64_t threadId) +{ + createPrimaryJourneysThreadFunction12(2); +} + + + +void 
MarkerGraph::createPrimaryJourneysThreadFunction12(uint64_t pass) +{ + // Loop over batches assigned to this thread. + uint64_t begin, end; + while(getNextBatch(begin, end)) { + + // Loop over marker graph edges assigned to this batch. + for(EdgeId edgeId=begin; edgeId!=end; ++edgeId) { + const Edge& edge = edges[edgeId]; + + // If this is not a primary edge, skip it. + if(edge.isPrimary == 0) { + continue; + } + + PrimaryJourneyEntry primaryJourneyEntry; + primaryJourneyEntry.edgeId = edgeId; + + // Loop over the MarkerIntervals of this edge. + span<MarkerInterval> markerIntervals = edgeMarkerIntervals[edgeId]; + for(const MarkerInterval& markerInterval: markerIntervals) { + const uint64_t orientedReadIdValue = markerInterval.orientedReadId.getValue(); + + if(pass == 1) { + primaryJourneys.incrementCountMultithreaded(orientedReadIdValue); + } else { + primaryJourneyEntry.ordinals = markerInterval.ordinals; + primaryJourneys.storeMultithreaded(orientedReadIdValue, primaryJourneyEntry); + } + + } + + } + } +} + + + +void MarkerGraph::createPrimaryJourneysThreadFunction3(uint64_t threadId) +{ + // Loop over batches assigned to this thread. + uint64_t begin, end; + while(getNextBatch(begin, end)) { + + // Loop over oriented reads assigned to this batch. + for(uint64_t orientedReadIdValue=begin; orientedReadIdValue!=end; orientedReadIdValue++) { + auto journey = primaryJourneys[orientedReadIdValue]; + sort(journey.begin(), journey.end()); + } + } +} + + + +// Starting from a primary marker graph edge, follow the primary journeys +// of all oriented reads on the edge, moving forward. +// Find the set of MarkerGraphEdgeIds that were encountered in this way, +// and for each the number of times it was encountered. +void MarkerGraph::followPrimaryJourneysForward( + MarkerGraphEdgeId edgeId0, + vector<MarkerGraphEdgeId>& edgeIds, + vector<uint64_t>& count) const +{ + edgeIds.clear(); + count.clear(); + + // Loop over the oriented reads in edgeId0. 
+ for(const MarkerInterval& markerInterval: edgeMarkerIntervals[edgeId0]) { + const OrientedReadId orientedReadId = markerInterval.orientedReadId; + const auto primaryJourney = primaryJourneys[orientedReadId.getValue()]; + + // Loop over the primary journey backward, stopping when we encounter edgeId0. + for(uint64_t j=primaryJourney.size(); /* Check later */; --j) { + const auto& primaryJourneyEntry = primaryJourney[j]; + const MarkerGraphEdgeId edgeId1 = primaryJourneyEntry.edgeId; + if(edgeId1 == edgeId0) { + break; + } + edgeIds.push_back(edgeId1); + if(j == 0) { + break; + } + } + } + + deduplicateAndCount(edgeIds, count); + SHASTA_ASSERT(edgeIds.size() == count.size()); + +} + + + +// Same, but moving backward. +void MarkerGraph::followPrimaryJourneysBackward( + MarkerGraphEdgeId edgeId0, + vector<MarkerGraphEdgeId>& edgeIds, + vector<uint64_t>& count) const +{ + edgeIds.clear(); + count.clear(); + + // Loop over the oriented reads in edgeId0. + for(const MarkerInterval& markerInterval: edgeMarkerIntervals[edgeId0]) { + const OrientedReadId orientedReadId = markerInterval.orientedReadId; + const auto primaryJourney = primaryJourneys[orientedReadId.getValue()]; + + // Loop over the primary journey, stopping when we encounter edgeId0. 
+ for(const auto& primaryJourneyEntry: primaryJourney) { + const MarkerGraphEdgeId edgeId1 = primaryJourneyEntry.edgeId; + if(edgeId1 == edgeId0) { + break; + } + edgeIds.push_back(edgeId1); + } + } + + deduplicateAndCount(edgeIds, count); + SHASTA_ASSERT(edgeIds.size() == count.size()); + +} +#endif diff --git a/src/MarkerGraph.hpp b/src/MarkerGraph.hpp index d7e7856..c43dec1 100644 --- a/src/MarkerGraph.hpp +++ b/src/MarkerGraph.hpp @@ -13,8 +13,10 @@ namespace shasta { class Base; - class MarkerGraph; class CompressedCoverageData; + class CompressedMarker; + class MarkerGraph; + class Reads; extern template class MultithreadedObject<MarkerGraph>; } @@ -80,6 +82,11 @@ public: return vertices()[vertexId]; } + // Find out if a vertex has more than one marker on the same oriented read. + bool vertexHasDuplicateOrientedReadIds( + VertexId, + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers) const; + void remove(); // The global marker graph vertex corresponding to each marker. @@ -147,7 +154,11 @@ private: CreateVerticesFromVertexTableData createVerticesFromVertexTableData; public: - + // The disjoint sets histogram in a MemoryMapped::Vector. + // This is used when flagging primary marker graph edges for Mode 3 assembly. + // This stored pairs(coverage, frequency). + // Only pairs where the frequency is not zero are stored. + MemoryMapped::Vector< pair<uint64_t, uint64_t> > disjointSetsHistogram; // Remove marker graph vertices and update vertices and vertexTable. // After this is called, the only @@ -170,6 +181,15 @@ private: public: + // Find the common KmerId for all the markers of a marker graph vertex. + KmerId getVertexKmerId( + MarkerGraphVertexId vertexId, + uint64_t k, + const Reads&, + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers + ) const; + + // The reverse complement of each vertex. // Indexed by VertexId. 
@@ -180,7 +200,6 @@ public: public: Uint40 source; // The source vertex (index into globalMarkerGraphVertices). Uint40 target; // The target vertex (index into globalMarkerGraphVertices). - uint8_t coverage; // (255 indicates 255 or more). // Flags used to mark the edge as removed from the marker graph. bool wasRemoved() const @@ -220,8 +239,8 @@ public: // Assembly mode 2 only. uint8_t wasRemovedWhileSplittingSecondaryEdges : 1; - // Unused. - uint8_t flag6 : 1; + // This is set for primary edges (Mode 3 assembly only). + uint8_t isPrimary : 1; void clearFlags() { @@ -232,15 +251,16 @@ public: wasAssembled = 0; isSecondary = 0; wasRemovedWhileSplittingSecondaryEdges = 0; - flag6 = 0; + isPrimary = 0; } Edge() : source(MarkerGraph::invalidCompressedVertexId), - target(MarkerGraph::invalidCompressedVertexId), - coverage(0) + target(MarkerGraph::invalidCompressedVertexId) { clearFlags(); } + + void writeFlags(ostream&) const; }; MemoryMapped::Vector<Edge> edges; const Edge* findEdge(Uint40 source, Uint40 target) const; @@ -264,6 +284,25 @@ public: EdgeId getFirstNonRemovedOutEdge(VertexId) const; EdgeId getFirstNonRemovedInEdge(VertexId) const; + // Find the edge that contains a given MarkerInterval. + EdgeId locateMarkerInterval( + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, + const MarkerInterval&) const; + + // Apply an ordinal offset in the specified direction to a given MarkerInterval + // and find the edge that contains the offset MarkerInterval. + // This assumes that we have the complete marker graph. + EdgeId locateMarkerIntervalWithOffset( + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, + MarkerInterval, + uint32_t ordinalOffset, + uint64_t direction // 0=forward, 1=backward. + ) const; + + // Find out if an edge has duplicate oriented reads + // in its MarkerIntervals. + bool edgeHasDuplicateOrientedReadIds(EdgeId) const; + // The reverse complement of each edge. // Indexed by EdgeId. 
MemoryMapped::Vector<EdgeId> reverseComplementEdge; @@ -327,6 +366,90 @@ public: // ordered by position. MemoryMapped::VectorOfVectors<pair<uint32_t, CompressedCoverageData>, uint64_t> edgeCoverageData; + + + + // Edge sequence for each edge, for Mode 3 assembly. + // There are several differences compared to the consensus sequences stored above, + // which are not used in Mode 3 assembly: + // - Mode 3 assembly assumes we are not using RLE, so we don't need to store repeat counts. + // - Mode 3 assembly uses createMarkerGraphedgesStrict, which guarantees that + // all marker intervals on a marker graph edge have exactly the same sequence. + // This dramatically simplifies edge sequence assembly because we can just + // obtain the sequence from the first marker interval, and multiple sequence + // alignment is not needed. + // - For Mode 3 assembly we assume that marker length k is even, and + // the stored edge sequence includes the last k/2 bases from the marker + // of the source vertex and the first k/2 bases from the marker of + // the target vertex. As a result, every edge has at least one base of sequence, + // even when adjacent markers overlap. And the sequence of a path can + // be obtained by just concatenating the edge sequences. + MemoryMapped::VectorOfVectors<Base, uint64_t> edgeSequence; + + // Flag primary edges (only used for Mode 3 assembly). 
+ void flagPrimaryEdges( + uint64_t minPrimaryCoverage, + uint64_t maxPrimaryCoverage, + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, + uint64_t threadCount); +private: + void flagPrimaryEdgesThreadFunction(uint64_t threadId); + bool isPrimaryEdge( + EdgeId, + uint64_t minPrimaryCoverage, + uint64_t maxPrimaryCoverage, + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers) const; + class FlagPrimaryEdgesData { + public: + uint64_t minPrimaryCoverage; + uint64_t maxPrimaryCoverage; + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>* markersPointer; + }; + FlagPrimaryEdgesData flagPrimaryEdgesData; + + +#if 0 + // PRIMARY JOURNEYS ARE NOW COMPUTED LOCALLY BY CLASS Mode3Assembler. + // The primary journey of an oriented read is the sequence of primary + // marker graph edges encountered by the oriented read. + // Indexed by OrientedReadId::getValue(). + // Only used for mode 3 assembly. +public: + class PrimaryJourneyEntry { + public: + array<uint32_t, 2> ordinals; + EdgeId edgeId; + bool operator<(const PrimaryJourneyEntry& that) const { + return ordinals[0] < that.ordinals[0]; + } + }; + MemoryMapped::VectorOfVectors<PrimaryJourneyEntry, uint64_t> primaryJourneys; + void createPrimaryJourneys(uint64_t orientedReadCount, uint64_t threadCount); + void writePrimaryJourneys(); +private: + void createPrimaryJourneysThreadFunction1(uint64_t threadId); + void createPrimaryJourneysThreadFunction2(uint64_t threadId); + void createPrimaryJourneysThreadFunction12(uint64_t pass); + void createPrimaryJourneysThreadFunction3(uint64_t threadId); + +public: + + // Starting from a primary marker graph edge, follow the primary journeys + // of all oriented reads on the edge, moving forward. + // Find the set of MarkerGraphEdgeIds that were encountered in this way, + // and for each the number of times it was encountered. 
+ void followPrimaryJourneysForward( + MarkerGraphEdgeId, + vector<MarkerGraphEdgeId>&, + vector<uint64_t>& count + ) const; + // Same, but moving backward. + void followPrimaryJourneysBackward( + MarkerGraphEdgeId, + vector<MarkerGraphEdgeId>&, + vector<uint64_t>& count + ) const; +#endif }; #endif diff --git a/src/MarkerGraphEdgePairInfo.hpp b/src/MarkerGraphEdgePairInfo.hpp new file mode 100644 index 0000000..c067c14 --- /dev/null +++ b/src/MarkerGraphEdgePairInfo.hpp @@ -0,0 +1,91 @@ +#ifndef SHASTA_MARKER_GRAPH_EDGE_PAIR_INFO_HPP +#define SHASTA_MARKER_GRAPH_EDGE_PAIR_INFO_HPP + +// Shasta. +#include "invalid.hpp" + +// Standard library. +#include "algorithm.hpp" +#include "cstdint.hpp" + +namespace shasta { + class MarkerGraphEdgePairInfo; +} + + + +// Information about the read similarity composition of two marker graph edges. +class shasta::MarkerGraphEdgePairInfo { +public: + + // The total number of OrientedReadIds in each of the edges A and B. + uint64_t totalA = 0; + uint64_t totalB = 0; + + // The number of common oriented reads. + uint64_t common = 0; + + // The number of oriented reads present in A but not in B. + uint64_t onlyA = 0; + + // The number of oriented reads present in B but not in A. + uint64_t onlyB = 0; + + // The rest of the statistics are only valid if the number + // of common oriented reads is not 0. + + // The estimated offset between the two edges. + // The estimate is done using the common oriented reads. + int64_t offsetInMarkers = invalid<int64_t>; + int64_t offsetInBases = invalid<int64_t>; + + // The number of onlyA reads which are too short to be on edge B, + // based on the above estimated offset. + uint64_t onlyAShort = invalid<uint64_t>; + + // The number of onlyB reads which are too short to be on edge A, + // based on the above estimated offset. 
+ uint64_t onlyBShort = invalid<uint64_t>; + + uint64_t intersectionCount() const + { + return common; + } + uint64_t unionCount() const { + return totalA + totalB - common; + } + uint64_t correctedUnionCount() const + { + return unionCount() - onlyAShort - onlyBShort; + } + double jaccard() const + { + return double(intersectionCount()) / double(unionCount()); + } + double correctedJaccard() const + { + return double(intersectionCount()) / double(correctedUnionCount()); + } + + // Order them by corrected Jaccard similarity. + bool operator<(const MarkerGraphEdgePairInfo& that) const + { + return correctedJaccard() < that.correctedJaccard(); + } + bool operator>(const MarkerGraphEdgePairInfo& that) const + { + return correctedJaccard() > that.correctedJaccard(); + } + + void reverse() + { + swap(totalA, totalB); + swap(onlyA, onlyB); + swap(onlyAShort, onlyBShort); + offsetInMarkers = - offsetInMarkers; + offsetInBases = - offsetInBases; + } + +}; + +#endif diff --git a/src/Mode3Assembler.cpp b/src/Mode3Assembler.cpp new file mode 100644 index 0000000..812d5a6 --- /dev/null +++ b/src/Mode3Assembler.cpp @@ -0,0 +1,477 @@ +// Shasta. +#include "Mode3Assembler.hpp" +#include "Assembler.hpp" +#include "AssemblerOptions.hpp" +#include "deduplicate.hpp" +#include "dset64-gccAtomic.hpp" +#include "mode3-AssemblyGraph.hpp" +#include "mode3-PrimaryGraph.hpp" +#include "orderPairs.hpp" +#include "performanceLog.hpp" +#include "timestamp.hpp" +using namespace shasta; +using namespace mode3; + +// Standard library. +#include "iostream.hpp" + +// Explicit instantiation. 
+#include "MultithreadedObject.tpp" +template class MultithreadedObject<Mode3Assembler>; + + + +Mode3Assembler::Mode3Assembler( + const Assembler& assembler, + uint64_t threadCount, + const Mode3AssemblyOptions& options, + bool debug) : + MultithreadedObject<Mode3Assembler>(*this), + MappedMemoryOwner(assembler), + assembler(assembler), + debug(debug) +{ + performanceLog << timestamp << "Mode 3 assembly begins." << endl; + + gatherPrimaryMarkerGraphEdgeIds(); + computeConnectedComponents(); + assembleConnectedComponents(threadCount, options, debug); + + performanceLog << timestamp << "Mode 3 assembly ends." << endl; +} + + + +void Mode3Assembler::gatherPrimaryMarkerGraphEdgeIds() +{ + const auto& markerGraphEdges = assembler.markerGraph.edges; + + primaryMarkerGraphEdgeIds.clear(); + for(MarkerGraphEdgeId edgeId=0; edgeId<markerGraphEdges.size(); edgeId++) { + if(markerGraphEdges[edgeId].isPrimary) { + primaryMarkerGraphEdgeIds.push_back(edgeId); + } + } + cout << "Of " << markerGraphEdges.size() << " marker graph edges, " << + primaryMarkerGraphEdgeIds.size() << " are primary." << endl; +} + + + +// The oriented reads present in each primary marker graph edge +// define a bipartite graph. We want to compute connected components +// of this bipartite graph and process them one at a time. +void Mode3Assembler::computeConnectedComponents() +{ + performanceLog << timestamp << "Mode3Assembler::computeConnectedComponents begins." << endl; + + // Compute connected components of the oriented reads portion + // of the bipartite graph. + // Here oriented reads are indexed by OrientedReadId::getValue(). + const uint64_t orientedReadCount = assembler.markers.size(); + vector<DisjointSets::Aint> disjointSetsData(orientedReadCount); + DisjointSets disjointSets(&disjointSetsData[0], orientedReadCount); + + // Loop over all primary marker graph edges. + // This could be multithreaded but runs at decent speed as is. 
+ for(const MarkerGraphEdgeId edgeId: primaryMarkerGraphEdgeIds) { + const auto markerIntervals = assembler.markerGraph.edgeMarkerIntervals[edgeId]; + SHASTA_ASSERT(not markerIntervals.empty()); + const OrientedReadId orientedReadId0 = markerIntervals.front().orientedReadId; + for(const MarkerInterval& markerInterval: markerIntervals) { + const OrientedReadId orientedReadId1 = markerInterval.orientedReadId; + disjointSets.unite(orientedReadId0.getValue(), orientedReadId1.getValue()); + } + } + + // Gather the oriented reads in each connected component. + vector< vector<OrientedReadId> > componentsOrientedReads(orientedReadCount); + for(uint64_t i=0; i<orientedReadCount; i++) { + const uint64_t componentId = disjointSets.find(i); + componentsOrientedReads[componentId].push_back(OrientedReadId::fromValue(ReadId(i))); + } + + // Gather the primary marker graph edges in each connected component. + // This stores PrimaryIds, not MarkerGraphEdgeIds. + vector< vector<uint64_t> > componentsPrimaryIds(orientedReadCount); + for(uint64_t primaryId=0; primaryId<primaryMarkerGraphEdgeIds.size(); primaryId++) { + const MarkerGraphEdgeId edgeId = primaryMarkerGraphEdgeIds[primaryId]; + const auto markerIntervals = assembler.markerGraph.edgeMarkerIntervals[edgeId]; + SHASTA_ASSERT(not markerIntervals.empty()); + const OrientedReadId orientedReadId0 = markerIntervals.front().orientedReadId; + const uint64_t componentId = disjointSets.find(orientedReadId0.getValue()); + + // Check that all MarkerIntervals are in the same component. + // THIS CHECK CAN BE REMOVED FOR PERFORMANCE. + for(const MarkerInterval& markerInterval: markerIntervals) { + const OrientedReadId orientedReadId1 = markerInterval.orientedReadId; + SHASTA_ASSERT(disjointSets.find(orientedReadId1.getValue()) == componentId); + } + componentsPrimaryIds[componentId].push_back(primaryId); + } + + + + disjointSetsData.clear(); + + + + // Gather the components with more than one read and their sizes. 
+ // The connected components cannot be self-complementary because + // we are using read strand separation method 2. + // This means that the ReadIds must be all distinct (and increasing). + // For each complementary pair, only keep the one + // that has the first oriented read on strand 0. + vector< pair<uint64_t, uint64_t> > componentTable; + for(uint64_t componentId=0; componentId<orientedReadCount; componentId++) { + const vector<OrientedReadId>& component = componentsOrientedReads[componentId]; + const uint64_t componentSize = component.size(); + if(componentSize < 2) { + continue; + } + if(component.front().getStrand() != 0) { + continue; + } + + // Verify that the ReadIds are all distinct. + // THIS CHECK CAN BE REMOVED FOR PERFORMANCE. + for(uint64_t i1=1; i1<component.size(); i1++) { + const uint64_t i0 = i1 - 1; + SHASTA_ASSERT(component[i0].getReadId() < component[i1].getReadId()); + } + + // Store this component in the componentTable. + componentTable.push_back({componentId, componentSize}); + } + + // Sort the component table by decreasing size. + sort(componentTable.begin(), componentTable.end(), + OrderPairsBySecondOnlyGreater<uint64_t, uint64_t>()); + + // Store the connected components we kept. + connectedComponents.resize(componentTable.size()); + for(uint64_t i=0; i<connectedComponents.size(); i++) { + const uint64_t componentId = componentTable[i].first; + connectedComponents[i].orientedReadIds.swap(componentsOrientedReads[componentId]); + connectedComponents[i].primaryIds.swap(componentsPrimaryIds[componentId]); + } + + // Fill in the orientedReadIdTable. 
+ orientedReadIdTable.clear(); + orientedReadIdTable.resize(orientedReadCount, {invalid<uint64_t>, invalid<uint64_t>}); + for(uint64_t componentId=0; componentId<connectedComponents.size(); componentId++) { + const vector<OrientedReadId>& orientedReadIds = connectedComponents[componentId].orientedReadIds; + for(uint64_t position=0; position<orientedReadIds.size(); position++) { + const OrientedReadId orientedReadId = orientedReadIds[position]; + orientedReadIdTable[orientedReadId.getValue()] = {componentId, position}; + } + } + + performanceLog << timestamp << "Mode3Assembler::computeConnectedComponents ends." << endl; +} + + + +void Mode3Assembler::assembleConnectedComponents( + uint64_t threadCount, + const Mode3AssemblyOptions& options, + bool debug) +{ + performanceLog << timestamp << "Mode3Assembler::assembleConnectedComponents begins." << endl; + + vector< vector<uint64_t> > assemblyChainLengthsByPValue; + vector<uint64_t> assemblyBubbleChainLengths; + + ofstream summaryCsv("Components.csv"); + summaryCsv << "Component,Reads,Segments,Sequence,N50,Total Bubble chain length,Bubble chain N50\n"; + + vector< shared_ptr<mode3::AssemblyGraph> > assemblyGraphs; + for(uint64_t componentId=0; componentId<connectedComponents.size(); componentId++) { + const shared_ptr<AssemblyGraph> assemblyGraph = + assembleConnectedComponent(componentId, threadCount, options, true, debug); + assemblyGraphs.push_back(assemblyGraph); + + // Chain length statistics. + vector< vector<uint64_t> > chainLengths; + assemblyGraph->getChainLengthsByPValue(chainLengths); + + // Assembly statistics by P-value. 
+ cout << "Assembly statistics by P-Value for component " << componentId << ":" << endl; + for(uint64_t pValue=0; pValue<chainLengths.size(); pValue++) { + uint64_t totalLength, n50; + tie(totalLength, n50) = AssemblyGraph::n50(chainLengths[pValue]); + cout << "P-value " << pValue << ": total assembled length " << totalLength << + ", N50 " << n50 << endl; + } + + // Combined chain length statistics for this component. + vector<uint64_t> allChainLengths; + for(const auto& v: chainLengths) { + copy(v.begin(), v.end(), back_inserter(allChainLengths)); + } + sort(allChainLengths.begin(), allChainLengths.end(), std::greater<uint64_t>()); + uint64_t totalLength, n50; + tie(totalLength, n50) = AssemblyGraph::n50(allChainLengths); + cout << "Combined for this component: total assembled length " << totalLength << + ", N50 " << n50 << endl; + + // Bubble chain length statistics (non-trivial bubble chains only). + vector<uint64_t> bubbleChainLengths; + assemblyGraph->getBubbleChainLengths(bubbleChainLengths); + uint64_t totalBubbleChainLength, bubbleChainN50; + tie(totalBubbleChainLength, bubbleChainN50) = AssemblyGraph::n50(bubbleChainLengths); + copy(bubbleChainLengths.begin(), bubbleChainLengths.end(), + back_inserter(assemblyBubbleChainLengths)); + cout << "Total non-trivial bubble chain length for this component " << totalBubbleChainLength << + ", N50 " << bubbleChainN50 << endl; + + // Write a line to the summaryCsv. + summaryCsv << componentId << ","; + summaryCsv << connectedComponents[componentId].orientedReadIds.size() << ","; + summaryCsv << allChainLengths.size() << ","; + summaryCsv << totalLength << ","; + summaryCsv << n50 << ","; + summaryCsv << totalBubbleChainLength << ","; + summaryCsv << bubbleChainN50 << "\n"; + + // Store the chain lengths. 
+ if(assemblyChainLengthsByPValue.size() < chainLengths.size()) { + assemblyChainLengthsByPValue.resize(chainLengths.size()); + } + for(uint64_t pValue=0; pValue<chainLengths.size(); pValue++) { + copy(chainLengths[pValue].begin(), chainLengths[pValue].end(), + back_inserter(assemblyChainLengthsByPValue[pValue])); + } + } + + cout << "Global assembly statistics by P-Value:" << endl; + for(uint64_t pValue=0; pValue<assemblyChainLengthsByPValue.size(); pValue++) { + sort(assemblyChainLengthsByPValue[pValue].begin(), assemblyChainLengthsByPValue[pValue].end(), + std::greater<uint64_t>()); + uint64_t totalLength, n50; + tie(totalLength, n50) = AssemblyGraph::n50(assemblyChainLengthsByPValue[pValue]); + cout << "P-value " << pValue << ": total assembled length " << totalLength << + ", N50 " << n50 << endl; + } + vector<uint64_t> allChainLengths; + for(const auto& v: assemblyChainLengthsByPValue) { + copy(v.begin(), v.end(), back_inserter(allChainLengths)); + } + sort(allChainLengths.begin(), allChainLengths.end(), std::greater<uint64_t>()); + uint64_t totalLength, n50; + tie(totalLength, n50) = AssemblyGraph::n50(allChainLengths); + cout << "Global assembly statistics, combined for all P-values: total assembled length " << totalLength << + ", N50 " << n50 << endl; + + sort(assemblyBubbleChainLengths.begin(), assemblyBubbleChainLengths.end(), std::greater<uint64_t>()); + uint64_t totalBubbleChainLength, bubbleChainN50; + tie(totalBubbleChainLength, bubbleChainN50) = AssemblyGraph::n50(assemblyBubbleChainLengths); + cout << "Total non-trivial bubble chain length " << totalBubbleChainLength << + ", N50 " << bubbleChainN50 << endl; + + + // Create a csv file with one line for each assembled segment. + // This can also be loaded in Bandage. 
+ { + ofstream csv("Assembly.csv"); + csv << "Chain,Connectivity,Component,Bubble chain,Position in bubble chain,Index in bubble," + "Sequence length,Primary coverage,P value,Color," + "Preceded by,Followed by," + "\n"; + for(const shared_ptr<mode3::AssemblyGraph>& assemblyGraph: assemblyGraphs) {; + assemblyGraph->writeCsvSummary(csv); + } + } + + // Create a global FASTA file with output from all the connected components. + { + ofstream fasta("Assembly.fasta"); + for(const shared_ptr<mode3::AssemblyGraph>& assemblyGraph: assemblyGraphs) { + assemblyGraph->writeFastaExpanded(fasta); + } + } + + // Create a global GFA file with output from all the connected components. + { + ofstream gfa("Assembly.gfa"); + AssemblyGraph::writeGfaHeader(gfa); + for(const shared_ptr<mode3::AssemblyGraph>& assemblyGraph: assemblyGraphs) { + assemblyGraph->writeGfaSegmentsExpanded(gfa, true, true); + } + for(const shared_ptr<mode3::AssemblyGraph>& assemblyGraph: assemblyGraphs) { + assemblyGraph->writeGfaLinksExpanded(gfa); + } + } + + // Also create a global GFA file without sequence. + ofstream gfa("Assembly-NoSequence.gfa"); + { + AssemblyGraph::writeGfaHeader(gfa); + for(const shared_ptr<mode3::AssemblyGraph>& assemblyGraph: assemblyGraphs) { + assemblyGraph->writeGfaSegmentsExpanded(gfa, false, true); + } + for(const shared_ptr<mode3::AssemblyGraph>& assemblyGraph: assemblyGraphs) { + assemblyGraph->writeGfaLinksExpanded(gfa); + } + } + + performanceLog << timestamp << "Mode3Assembler::assembleConnectedComponents ends." 
<< endl; +} + + + +shared_ptr<AssemblyGraph> Mode3Assembler::assembleConnectedComponent( + uint64_t componentId, + uint64_t threadCount, + const Mode3AssemblyOptions& options, + bool assembleSequence, + bool debug) +{ + performanceLog << timestamp << "Assembling connected component " << + componentId << " of " << connectedComponents.size() << endl; + cout << timestamp << "Assembling connected component " << + componentId << " of " << connectedComponents.size() << endl; + + const ConnectedComponent& connectedComponent = connectedComponents[componentId]; + const vector<OrientedReadId>& orientedReadIds = connectedComponent.orientedReadIds; + const vector<uint64_t>& primaryIds = connectedComponent.primaryIds; + + cout << "This connected component has " << orientedReadIds.size() << + " reads and " << primaryIds.size() << " primary marker graph edges." << endl; + + + + // We need to compute the primary journey of each oriented read, + // that is, the sequence of primary edges encountered by each read. + // We store each journey as a vector of pairs of + // (ordinal0, localPrimaryId), where localPrimaryId is an index into primaryIds + // for this connected component. + vector< vector< pair<uint32_t, uint64_t> > > journeys(orientedReadIds.size()); + + performanceLog << timestamp << "Journey computation begins." 
<< endl; + for(uint64_t localPrimaryId=0; localPrimaryId<primaryIds.size(); localPrimaryId++) { + const uint64_t primaryId = primaryIds[localPrimaryId]; + const MarkerGraphEdgeId edgeId = primaryMarkerGraphEdgeIds[primaryId]; + const auto markerIntervals = assembler.markerGraph.edgeMarkerIntervals[edgeId]; + for(const MarkerInterval& markerInterval: markerIntervals) { + const OrientedReadId orientedReadId = markerInterval.orientedReadId; + const uint32_t ordinal0 = markerInterval.ordinals[0]; + const auto& p = orientedReadIdTable[orientedReadId.getValue()]; + SHASTA_ASSERT(p.first == componentId); + journeys[p.second].push_back({ordinal0, localPrimaryId}); + } + } + for(vector< pair<uint32_t, uint64_t> >& journey: journeys) { + sort(journey.begin(), journey.end(), OrderPairsByFirstOnly<uint32_t, uint64_t>()); + } + performanceLog << timestamp << "Journey computation ends." << endl; + +#if 0 + // Check that the journeys computed in this way are identical to the ones stored in the MarkerGraph. + // The ones stored in the MarkerGraph will eventually go away. + for(uint64_t i=0; i<orientedReadIds.size(); i++) { + const OrientedReadId orientedReadId = orientedReadIds[i]; + const auto journey = journeys[i]; + const auto storedJourney = assembler.markerGraph.primaryJourneys[orientedReadId.getValue()]; + SHASTA_ASSERT(journey.size() == storedJourney.size()); + + for(uint64_t j=0; j<journey.size(); j++) { + const auto& p = journey[j]; + const uint64_t localPrimaryId = p.second; + const uint64_t primaryId = primaryIds[localPrimaryId]; + const MarkerGraphEdgeId edgeId = primaryMarkerGraphEdgeIds[primaryId]; + // cout << orientedReadId << " " << storedJourney[j].edgeId << " " << edgeId << endl; + SHASTA_ASSERT(edgeId == storedJourney[j].edgeId); + } + } +#endif + + + // Now we can create the PrimaryGraph for this connected component. + PrimaryGraph primaryGraph; + + // Create the vertices first. 
+ vector<PrimaryGraph::vertex_descriptor> vertexDescriptors; + for(uint64_t localPrimaryId=0; localPrimaryId<primaryIds.size(); localPrimaryId++) { + const uint64_t primaryId = primaryIds[localPrimaryId]; + const MarkerGraphEdgeId edgeId = primaryMarkerGraphEdgeIds[primaryId]; + vertexDescriptors.push_back(primaryGraph.addVertex(edgeId)); + } + + + + // To generate edges of the PrimaryGraph, we need to gather pairs of consecutive + // journey entries. Each pair (localPrimaryId0, localPrimaryId1) is stored + // as a localPrimaryId1 in journeyPairs[localPrimaryId0]. + // For now use a simple vector of vector and sequential code, but later + // switch to MemoryMapped::VectorOfVectors<uint64_t, uint64_t> and multithreaded code. + vector< vector<uint64_t> > journeyPairs(primaryIds.size()); + performanceLog << timestamp << "PrimaryGraph edge creation begins." << endl; + for(const auto& journey: journeys) { + for(uint64_t i1=1; i1<journey.size(); i1++) { + const uint64_t i0 = i1 - 1; + const uint64_t localPrimaryId0 = journey[i0].second; + const uint64_t localPrimaryId1 = journey[i1].second; + journeyPairs[localPrimaryId0].push_back(localPrimaryId1); + } + } + vector<uint64_t> count; + for(uint64_t localPrimaryId0=0; localPrimaryId0<primaryIds.size(); localPrimaryId0++) { + const PrimaryGraph::vertex_descriptor v0 = vertexDescriptors[localPrimaryId0]; + const MarkerGraphEdgeId edgeId0 = primaryGraph[v0].edgeId; + auto journeyPairs0 = journeyPairs[localPrimaryId0]; + deduplicateAndCount(journeyPairs0, count); + SHASTA_ASSERT(journeyPairs0.size() == count.size()); + for(uint64_t j=0; j<journeyPairs0.size(); j++) { + const uint64_t localPrimaryId1 = journeyPairs0[j]; + const uint64_t coverage = count[j]; + const PrimaryGraph::vertex_descriptor v1 = vertexDescriptors[localPrimaryId1]; + const MarkerGraphEdgeId edgeId1 = primaryGraph[v1].edgeId; + MarkerGraphEdgePairInfo info; + SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(edgeId0, edgeId1, info)); + 
primaryGraph.addEdgeFromVertexDescriptors(v0, v1, info, coverage); + } + } + performanceLog << timestamp << "PrimaryGraph edge creation ends." << endl; + + cout << "The PrimaryGraph for this connected component has " << + num_vertices(primaryGraph) << " vertices and " << num_edges(primaryGraph) << " edges." << endl; + + + // Graphviz output. + if(debug) { + PrimaryGraphDisplayOptions options; + options.showNonTransitiveReductionEdges = true; + primaryGraph.writeGraphviz( + "PrimaryGraphInitial" + to_string(componentId), options, assembler.markerGraph); + options.makeCompact(); + primaryGraph.writeGraphviz( + "PrimaryGraphCompactInitial" + to_string(componentId), options, assembler.markerGraph); + primaryGraph.writeEdgeCoverageHistogram("PrimaryGraphInitial" + to_string(componentId) + "-EdgeCoverageHistogram.csv"); + } + + // Remove weak edges.. + primaryGraph.removeWeakEdges(options.primaryGraphOptions.maxLoss); + + // Remove cross-edges. + primaryGraph.removeCrossEdges( + options.primaryGraphOptions.crossEdgesLowCoverageThreshold, + options.primaryGraphOptions.crossEdgesHighCoverageThreshold, + 0); + + // Graphviz output. + if(debug) { + PrimaryGraphDisplayOptions options; + options.showNonTransitiveReductionEdges = false; + primaryGraph.writeGraphviz( + "PrimaryGraph" + to_string(componentId), options, assembler.markerGraph); + options.makeCompact(); + primaryGraph.writeGraphviz( + "PrimaryGraphCompact" + to_string(componentId), options, assembler.markerGraph); + } + + // Create the assembly graph for this connected component. + return make_shared<AssemblyGraph>( + primaryGraph, componentId, assembler, threadCount, + options, assembleSequence, debug); +} diff --git a/src/Mode3Assembler.hpp b/src/Mode3Assembler.hpp new file mode 100644 index 0000000..a7a8773 --- /dev/null +++ b/src/Mode3Assembler.hpp @@ -0,0 +1,80 @@ +#pragma once + +// Shasta. 
+#include "MappedMemoryOwner.hpp" +#include "MultithreadedObject.hpp" +#include "ReadId.hpp" +#include "shastaTypes.hpp" + +// Standard library. +#include "memory.hpp" +#include "utility.hpp" +#include "vector.hpp" + +namespace shasta { + class Mode3Assembler; + class Assembler; + class Mode3AssemblyOptions; + namespace mode3 { + class AssemblyGraph; + } +} + + +class shasta::Mode3Assembler : + public MultithreadedObject<Mode3Assembler>, + public MappedMemoryOwner { +public: + Mode3Assembler( + const Assembler&, + uint64_t threadCount, + const Mode3AssemblyOptions&, + bool debug); +private: + const Assembler& assembler; + bool debug; + + // The MarkerGraphEdgeIds of the primary marker graph edges. + // These are sorted. + // An index in this vector is called PrimaryId. + vector<MarkerGraphEdgeId> primaryMarkerGraphEdgeIds; + void gatherPrimaryMarkerGraphEdgeIds(); + + // The oriented reads present in each primary marker graph edge + // define a bipartite graph. We want to compute connected components + // of this bipartite graph and process them one at a time. + // These are also connected components of the global primary graph + // (with one vertex for each primary marker graph edge, + // and edges created by following the reads). + class ConnectedComponent { + public: + // The oriented reads in this connected component. + vector<OrientedReadId> orientedReadIds; + + // The PrimaryIds of the marker graph edges in this connected component. + // These are indices into primaryMarkerGraphEdgeIds. + vector<uint64_t> primaryIds; + }; + vector<ConnectedComponent> connectedComponents; + void computeConnectedComponents(); + + // For each oriented read, store which ConnectedComponent it belongs to, + // and at what position. + // Indexed by OrientedReadId::getValue(). 
+ // For each OrientedReadId we store a pair (componentId, position), + // where componentId is the index in the connectedComponents vector + // and position is the index in the orientedReadIds vector + // for that connected component. + vector< pair<uint64_t, uint64_t> > orientedReadIdTable; + + void assembleConnectedComponents( + uint64_t threadCount, + const Mode3AssemblyOptions&, + bool debug); + shared_ptr<mode3::AssemblyGraph> assembleConnectedComponent( + uint64_t componentId, + uint64_t threadCount, + const Mode3AssemblyOptions&, + bool assembleSequence, + bool debug); +}; diff --git a/src/PythonModule.cpp b/src/PythonModule.cpp index 5812058..2a3aed0 100644 --- a/src/PythonModule.cpp +++ b/src/PythonModule.cpp @@ -12,8 +12,11 @@ #include "deduplicate.hpp" #include "dset64Test.hpp" #include "diploidBayesianPhase.hpp" +#include "enumeratePaths.hpp" #include "shastaLapack.hpp" +#include "globalMsa.hpp" #include "LongBaseSequence.hpp" +#include "longestPath.hpp" #include "mappedCopy.hpp" #include "MedianConsensusCaller.hpp" #include "MemoryMappedAllocator.hpp" @@ -154,37 +157,8 @@ PYBIND11_MODULE(shasta, shastaModule) // K-mers. .def("accessKmers", &Assembler::accessKmers) - .def("writeKmers", - &Assembler::writeKmers, - arg("fileName") = "Kmers.csv") - .def("randomlySelectKmers", - &Assembler::randomlySelectKmers, - arg("k"), - arg("probability"), - arg("seed") = 231) - .def("selectKmersBasedOnFrequency", - &Assembler::selectKmersBasedOnFrequency, - arg("k"), - arg("markerDensity"), - arg("seed") = 231, - arg("enrichmentThreshold"), - arg("threadCount") = 0) - .def("selectKmers2", - &Assembler::selectKmers2, - arg("k"), - arg("markerDensity"), - arg("seed") = 231, - arg("enrichmentThreshold"), - arg("threadCount") = 0) - .def("selectKmers4", - &Assembler::selectKmers4, - arg("k"), - arg("markerDensity"), - arg("seed") = 231, - arg("distanceThreshold"), - arg("threadCount") = 0) - - + .def("accessKmerChecker", + &Assembler::accessKmerChecker) // Markers. 
.def("accessMarkers", @@ -203,10 +177,6 @@ PYBIND11_MODULE(shasta, shastaModule) arg("readId"), arg("strand"), arg("fileName")) - .def("getMarkers", - &Assembler::getMarkers) - .def("writeMarkerFrequency", - &Assembler::writeMarkerFrequency) .def("computeSortedMarkers", &Assembler::computeSortedMarkers, arg("threadCount") = 0) @@ -226,16 +196,6 @@ PYBIND11_MODULE(shasta, shastaModule) arg("maxBucketSize"), arg("minFrequency"), arg("threadCount") = 0) - .def("findAlignmentCandidatesLowHash1", - &Assembler::findAlignmentCandidatesLowHash1, - arg("m"), - arg("hashFraction"), - arg("minHashIterationCount"), - arg("log2MinHashBucketCount") = 0, - arg("minBucketSize"), - arg("maxBucketSize"), - arg("minFrequency"), - arg("threadCount") = 0) .def("accessAlignmentCandidates", &Assembler::accessAlignmentCandidates) .def("accessAlignmentCandidateTable", @@ -263,8 +223,6 @@ PYBIND11_MODULE(shasta, shastaModule) &Assembler::writeAlignmentCandidates, arg("useReadName") = false, arg("verbose") = false) - .def("writeAlignmentDetails", - &Assembler::writeAlignmentDetails) .def("writeLocalAlignmentCandidateReads", &Assembler::writeLocalAlignmentCandidateReads, arg("readId"), @@ -305,7 +263,7 @@ PYBIND11_MODULE(shasta, shastaModule) (ReadId, Strand, ReadId, Strand, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, double, uint64_t, uint64_t, uint64_t, uint64_t, - int64_t, int64_t, int64_t) const + int64_t, int64_t, int64_t) ) &Assembler::alignOrientedReads4, arg("readId0"), @@ -355,6 +313,10 @@ PYBIND11_MODULE(shasta, shastaModule) arg("strand0"), arg("readId1"), arg("strand1")) + .def("computeMarkerKmerIds", + &Assembler::computeMarkerKmerIds) + .def("cleanupMarkerKmerIds", + &Assembler::cleanupMarkerKmerIds) @@ -437,6 +399,8 @@ PYBIND11_MODULE(shasta, shastaModule) .def("accessMarkerGraphVertices", &Assembler::accessMarkerGraphVertices, arg("readWriteAccess") = false) + .def("accessDisjointSetsHistogram", + &Assembler::accessDisjointSetsHistogram) 
.def("getGlobalMarkerGraphVertex", ( MarkerGraph::VertexId (Assembler::*) @@ -481,8 +445,6 @@ PYBIND11_MODULE(shasta, shastaModule) arg("pattern2CreateNewVertices")) .def("getMarkerGraphMinCoverageUsed", &Assembler::getMarkerGraphMinCoverageUsed) - .def("vertexCoverageStatisticsByKmerId", - &Assembler::vertexCoverageStatisticsByKmerId) // Edges of the global marker graph. .def("createMarkerGraphEdges", @@ -520,11 +482,6 @@ PYBIND11_MODULE(shasta, shastaModule) arg("highCoverageThreshold"), arg("maxDistance"), arg("edgeMarkerSkipThreshold")) - .def("reverseTransitiveReduction", - &Assembler::reverseTransitiveReduction, - arg("lowCoverageThreshold"), - arg("highCoverageThreshold"), - arg("maxDistance")) .def("pruneMarkerGraphStrongSubgraph", &Assembler::pruneMarkerGraphStrongSubgraph, arg("iterationCount")) @@ -621,7 +578,7 @@ PYBIND11_MODULE(shasta, shastaModule) .def("assembleAssemblyGraphEdge", ( AssembledSegment (Assembler::*) - (AssemblyGraph::EdgeId, bool) + (mode0::AssemblyGraph::EdgeId, bool) ) &Assembler::assembleAssemblyGraphEdge, arg("edgeId"), @@ -638,8 +595,6 @@ PYBIND11_MODULE(shasta, shastaModule) arg("diagonalReadCountMin"), arg("offDiagonalReadCountMax"), arg("offDiagonalRatio")) - .def("alignPseudoPaths", - &Assembler::alignPseudoPaths) .def("removeAssemblyGraph", &Assembler::removeAssemblyGraph) @@ -655,18 +610,11 @@ PYBIND11_MODULE(shasta, shastaModule) // Assembly mode 3. .def("mode3Assembly", - &Assembler::mode3Assembly, - arg("threadCount") = 0) - .def("accessMode3AssemblyGraph", - &Assembler::accessMode3AssemblyGraph) - .def("analyzeMode3Subgraph", - &Assembler::analyzeMode3Subgraph) - .def("createMode3PathGraph", - &Assembler::createMode3PathGraph) - .def("createMode3Detangler", - &Assembler::createMode3Detangler) - - + &Assembler::mode3Assembly) + .def("mode3AssembleComponent", + &Assembler::mode3AssembleComponent) + .def("flagPrimaryMarkerGraphEdges", + &Assembler::flagPrimaryMarkerGraphEdges) // Consensus caller. 
.def("setupConsensusCaller", @@ -708,6 +656,18 @@ PYBIND11_MODULE(shasta, shastaModule) + // Expose portions of class AssemblerOptions to Python. + class_<AssemblerOptions>(shastaModule, "AssemblerOptions") + .def(pybind11::init<const string&>()) + .def_readonly("assemblyOptions", &AssemblerOptions::assemblyOptions) + ; + class_<AssemblyOptions>(shastaModule, "AssemblyOptions") + .def_readonly("mode3Options", &AssemblyOptions::mode3Options) + ; + class_<Mode3AssemblyOptions>(shastaModule, "Mode3AssemblyOptions"); + + + // Constants. shastaModule.attr("invalidGlobalMarkerGraphVertexId") = MarkerGraph::invalidVertexId; shastaModule.attr("invalidCompressedGlobalMarkerGraphVertexId") = @@ -785,6 +745,15 @@ PYBIND11_MODULE(shasta, shastaModule) shastaModule.def("testSubsetGraph", testSubsetGraph ); + shastaModule.def("testLongestPath", + testLongestPath + ); + shastaModule.def("testEnumeratePaths", + testEnumeratePaths + ); + shastaModule.def("globalMsaPython", + globalMsaPython + ); } #endif diff --git a/src/ReadFlags.hpp b/src/ReadFlags.hpp index bf83ff9..094cca6 100644 --- a/src/ReadFlags.hpp +++ b/src/ReadFlags.hpp @@ -10,6 +10,14 @@ namespace shasta { class shasta::ReadFlags { public: + // Set if we have other reads with the same name. + uint8_t isDuplicate : 1; + + // Set if this read is not to be used in the assembly + // due to the presence of duplicates. + // The way this is set is determined by the value of --Reads.handleDuplicates: + uint8_t discardDueToDuplicates : 1; + // This is set for reads that are approximate palindromic, // that is, are well aligned with their own reverse complement. uint8_t isPalindromic : 1; diff --git a/src/Reads.cpp b/src/Reads.cpp index 4a2ce50..3651d46 100644 --- a/src/Reads.cpp +++ b/src/Reads.cpp @@ -242,23 +242,32 @@ uint64_t Reads::getReadRawSequenceLength(ReadId readId) const // representation of an oriented read. 
vector<uint32_t> Reads::getRawPositions(OrientedReadId orientedReadId) const { - const ReadId readId = orientedReadId.getReadId(); - const ReadId strand = orientedReadId.getStrand(); - const auto repeatCounts = readRepeatCounts[readId]; - const uint64_t n = repeatCounts.size(); - vector<uint32_t> v; - uint32_t position = 0; - for(uint64_t i=0; i<n; i++) { - v.push_back(position); - uint8_t count; - if(strand == 0) { - count = repeatCounts[i]; - } else { - count = repeatCounts[n-1-i]; + if(representation == 1) { + const ReadId readId = orientedReadId.getReadId(); + const ReadId strand = orientedReadId.getStrand(); + const auto repeatCounts = readRepeatCounts[readId]; + const uint64_t n = repeatCounts.size(); + + uint32_t position = 0; + for(uint64_t i=0; i<n; i++) { + v.push_back(position); + uint8_t count; + if(strand == 0) { + count = repeatCounts[i]; + } else { + count = repeatCounts[n-1-i]; + } + position += count; + } + } else { + + // If not using RLE, raw positions are the same as RLE positions. + const ReadId readId = orientedReadId.getReadId(); + for(uint32_t i=0; i<reads[readId].baseCount; i++) { + v.push_back(i); } - position += count; } return v; @@ -541,3 +550,113 @@ ReadId Reads::getReadId(const span<const char>& readName) const } } + + +// Find duplicate reads, as determined by name (not sequence). +// This also sets the isDuplicate and discardDueToDuplicates read flags +// and summarizes what it found in DuplicateReads.csv. +void Reads::findDuplicates(const string& handleDuplicates) +{ + const uint64_t readCount = reads.size(); + SHASTA_ASSERT(readFlags.size() == readCount); + SHASTA_ASSERT(readNames.size() == readCount); + + // Set bool variables corresponding to the permitted values of handleDuplicates. 
+ bool useAllCopies = false; + bool useOneCopy = false; + bool useNone = false; + bool forbid = false; + if(handleDuplicates == "useAllCopies") { + useAllCopies = true; + } else if(handleDuplicates == "useOneCopy") { + useOneCopy = true; + } else if(handleDuplicates == "useNone") { + useNone = true; + } else if(handleDuplicates == "forbid") { + forbid = true; + } else { + throw runtime_error("Invalid value " + handleDuplicates + " specified for --Reads.handleDuplicates. " + "Must be one of: useAllCopies, useOneCopy, useNone, forbid."); + } + + uint64_t discardedCount = 0; + vector<uint64_t> duplicatedReadIds; + for(uint64_t i=0; i<readCount; i++) { + const uint64_t readId = readIdsSortedByName[i]; + const auto name = readNames[readId]; + + // Find out if the name is the same as the + // name of the previous read, in order sorted by name. + bool hasSameNameAsPrevious = false; + if(i != 0) { + const auto previousName = readNames[readIdsSortedByName[i - 1]]; + hasSameNameAsPrevious = equal( + name.begin(), name.end(), + previousName.begin(), previousName.end()); + } + + // Find out if the name is the same as the + // name of the next read, in order sorted by name. + bool hasSameNameAsNext = false; + if(i < readCount - 1) { + const auto nextName = readNames[readIdsSortedByName[i + 1]]; + hasSameNameAsNext = equal( + name.begin(), name.end(), + nextName.begin(), nextName.end()); + } + + // Set the isDuplicate flag for this read. + ReadFlags& flags = readFlags[readId]; + flags.isDuplicate = uint8_t(hasSameNameAsPrevious or hasSameNameAsNext); + + // Set the discardDueToDuplicates flag for this read. + if(useAllCopies) { + flags.discardDueToDuplicates = uint8_t(false); + } else if(useOneCopy) { + flags.discardDueToDuplicates = uint8_t(hasSameNameAsPrevious); + } else if(useNone) { + flags.discardDueToDuplicates = flags.isDuplicate; + } else if(forbid) { + // This does not really matter because in this case the assembly will stop. 
+ flags.discardDueToDuplicates = flags.isDuplicate; + } + + // Increment counts. + if(flags.isDuplicate) { + duplicatedReadIds.push_back(readId); + } + if(flags.discardDueToDuplicates) { + ++discardedCount; + } + } + + cout << "Found " << duplicatedReadIds.size() << " reads with duplicate names." << endl; + cout << "Discarded from the assembly " << discardedCount << " reads with duplicate names." << endl; + + + + // Write a csv file with details of the duplicate reads. + ofstream csv("DuplicateReads.csv"); + csv << "Id,Discarded,Name,MetaData\n"; + for(const uint64_t readId: duplicatedReadIds) { + const ReadFlags& flags = readFlags[readId]; + if(flags.isDuplicate) { + csv << readId << ","; + csv << (flags.discardDueToDuplicates ? "Yes" : "No") << ","; + + const auto name = readNames[readId]; + copy(name.begin(), name.end(), ostream_iterator<char>(csv)); + csv << ","; + + const auto metaData = readMetaData[readId]; + copy(metaData.begin(), metaData.end(), ostream_iterator<char>(csv)); + csv << "\n"; + } + } + + // If there are duplicates, stop the assembly, if requested. + if(forbid and duplicatedReadIds.size() > 0) { + throw runtime_error("Stopping assembly because reads with duplicate names were found " + "and --Reads.handleDuplicates is set to forbid."); + } +} diff --git a/src/Reads.hpp b/src/Reads.hpp index 5b9cc8d..06215b1 100644 --- a/src/Reads.hpp +++ b/src/Reads.hpp @@ -256,6 +256,11 @@ public: uint64_t& discardedShortReadBases ); + // Find duplicate reads, as determined by name (not sequence). + // This also sets the isDuplicate and discardDueToDuplicates read flags + // and summarizes what it found in DuplicateReads.csv. 
+ void findDuplicates(const string& handleDuplicates); + void remove(); uint64_t representation; // 0 = raw sequence, 1 = RLE sequence diff --git a/src/ShortBaseSequence.cpp b/src/ShortBaseSequence.cpp index 7f18e72..b8fe346 100644 --- a/src/ShortBaseSequence.cpp +++ b/src/ShortBaseSequence.cpp @@ -9,7 +9,7 @@ using namespace shasta; void shasta::testShortBaseSequence() { - ShortBaseSequence8 s; + ShortBaseSequence16 s; s.set(0, Base::fromCharacter('T')); s.set(1, Base::fromCharacter('C')); s.set(2, Base::fromCharacter('G')); @@ -19,13 +19,39 @@ void shasta::testShortBaseSequence() cout << s << endl; // const auto oldFill = cout.fill('0'); - for(const uint8_t x: s.data) { + for(const uint16_t x: s.data) { cout << std::setw(2) << std::hex << int(x) << endl; // cout << int(x) << endl; } // cout.fill(oldFill); // Check that constructor from id does the inverse of function id(). - const ShortBaseSequence8 t(s.id(4), 4); + const ShortBaseSequence16 t(s.id(4), 4); SHASTA_ASSERT(t == s); + + + // Verify that the KmerId for a k-mer of given length k is the + // same regardless of how the k-mer is stored. + { + const string sequenceString = "TCGAGCTTAG"; + const uint64_t k = sequenceString.size(); + + ShortBaseSequence16 s16; + ShortBaseSequence32 s32; + ShortBaseSequence64 s64; + for(uint64_t i=0; i<k; i++) { + const Base base = Base::fromCharacter(sequenceString[i]); + s16.set(i, base); + s32.set(i, base); + s64.set(i, base); + } + const uint64_t kmerId16 = s16.id(k); + const uint64_t kmerId32 = s32.id(k); + const uint64_t kmerId64 = s64.id(k); + + cout << kmerId16 << " " << kmerId32 << " " << kmerId64 << endl; + SHASTA_ASSERT(kmerId16 == kmerId32); + SHASTA_ASSERT(kmerId32 == kmerId64); + + } } diff --git a/src/ShortBaseSequence.hpp b/src/ShortBaseSequence.hpp index 7c848f7..39944be 100644 --- a/src/ShortBaseSequence.hpp +++ b/src/ShortBaseSequence.hpp @@ -3,6 +3,7 @@ // shasta. #include "Base.hpp" +#include "bitReversal.hpp" // Standard library. 
#include "array.hpp" @@ -106,7 +107,7 @@ public: } // Return the reverse complement of the first n bases. - ShortBaseSequence<Int> reverseComplement(uint64_t n) const + ShortBaseSequence<Int> reverseComplementSlow(uint64_t n) const { ShortBaseSequence<Int> reverseComplementedSequence; for(size_t i=0; i<n; i++) { @@ -116,11 +117,39 @@ public: return reverseComplementedSequence; } + + + // Return the reverse complement of the first n bases. + // Use bit reversal for speed. This avoids a loop over the n bases. + ShortBaseSequence<Int> reverseComplement(uint64_t n) const + { + const Int shift = Int(capacity - n); + const Int mask = Int(1ULL << n) - Int(1); + ShortBaseSequence<Int> reverseComplementedSequence; + reverseComplementedSequence.data[0] = Int(((~bitReversal(data[0])) & mask) << shift); + reverseComplementedSequence.data[1] = Int(((~bitReversal(data[1])) & mask) << shift); + +#if 0 + // Testing. + SHASTA_ASSERT(reverseComplementedSequence == reverseComplementSlow(n)); + SHASTA_ASSERT(reverseComplementedSequence.reverseComplementSlow(n) == *this); +#endif + + return reverseComplementedSequence; + } + + + bool operator==(const ShortBaseSequence<Int>& that) const { return data == that.data; } + bool operator<(const ShortBaseSequence<Int>& that) const + { + return data < that.data; + } + // Write the first n bases. ostream& write(ostream& s, uint64_t n) const { diff --git a/src/approximateTopologicalSort.hpp b/src/approximateTopologicalSort.hpp index 00a0af9..effd036 100644 --- a/src/approximateTopologicalSort.hpp +++ b/src/approximateTopologicalSort.hpp @@ -51,6 +51,7 @@ Only the last edge processed will be classified as causing a cycle. 
#include <boost/graph/iteration_macros.hpp> #include <stack> +#include "utility.hpp" #include "vector.hpp" namespace shasta { diff --git a/src/assembleMarkerGraphPath.cpp b/src/assembleMarkerGraphPath.cpp index ed140f7..e615e95 100644 --- a/src/assembleMarkerGraphPath.cpp +++ b/src/assembleMarkerGraphPath.cpp @@ -7,6 +7,7 @@ using namespace shasta; void shasta::assembleMarkerGraphPath( uint64_t readRepresentation, uint64_t k, + const Reads& reads, const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, const MarkerGraph& markerGraph, const span<const MarkerGraph::EdgeId>& markerGraphPath, @@ -53,9 +54,7 @@ void shasta::assembleMarkerGraphPath( for(size_t i=0; i<assembledSegment.vertexCount; i++) { // Get the sequence. - const MarkerId firstMarkerId = markerGraph.getVertexMarkerIds(assembledSegment.vertexIds[i])[0]; - const CompressedMarker& firstMarker = markers.begin()[firstMarkerId]; - const KmerId kmerId = firstMarker.kmerId; + const KmerId kmerId = markerGraph.getVertexKmerId(assembledSegment.vertexIds[i], k, reads, markers); const Kmer kmer(kmerId, k); if(readRepresentation == 1) { diff --git a/src/assembleMarkerGraphPath.hpp b/src/assembleMarkerGraphPath.hpp index f3ded2f..a5ec3b6 100644 --- a/src/assembleMarkerGraphPath.hpp +++ b/src/assembleMarkerGraphPath.hpp @@ -8,10 +8,12 @@ namespace shasta { class AssembledSegment; + class Reads; void assembleMarkerGraphPath( uint64_t readRepresentation, uint64_t k, + const Reads& reads, const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, const MarkerGraph&, const span<const MarkerGraph::EdgeId>& markerGraphPath, diff --git a/src/bitReversal.hpp b/src/bitReversal.hpp new file mode 100644 index 0000000..04ed9c6 --- /dev/null +++ b/src/bitReversal.hpp @@ -0,0 +1,54 @@ +#ifndef SHASTA_BIT_REVERSAL_HPP +#define SHASTA_BIT_REVERSAL_HPP + +// See https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel + +#include "cstdint.hpp" + +namespace shasta { + inline uint16_t 
bitReversal(uint16_t); + inline uint32_t bitReversal(uint32_t); + inline uint64_t bitReversal(uint64_t); +} + + + +inline uint16_t shasta::bitReversal(uint16_t x) +{ + const uint16_t m1 = uint16_t(0x5555); + const uint16_t m2 = uint16_t(0x3333); + const uint16_t m4 = uint16_t(0x0F0F); + + x = ((x >> 1) & m1) | ((x & m1) << 1); + x = ((x >> 2) & m2) | ((x & m2) << 2); + x = ((x >> 4) & m4) | ((x & m4) << 4); + x = (x >> 8) | (x << 8); + return x; +} + + + +inline uint32_t shasta::bitReversal(uint32_t x) +{ + x = ((x >> 1) & 0x55555555) | ((x & 0x55555555) << 1); + x = ((x >> 2) & 0x33333333) | ((x & 0x33333333) << 2); + x = ((x >> 4) & 0x0F0F0F0F) | ((x & 0x0F0F0F0F) << 4); + x = ((x >> 8) & 0x00FF00FF) | ((x & 0x00FF00FF) << 8); + x = ( x >> 16) | ( x << 16); + return x; +} + + + +inline uint64_t shasta::bitReversal(uint64_t x) +{ + x = ((x >> 1) & 0x5555555555555555UL) | ((x & 0x5555555555555555UL) << 1 ); + x = ((x >> 2) & 0x3333333333333333UL) | ((x & 0x3333333333333333UL) << 2 ); + x = ((x >> 4) & 0x0F0F0F0F0F0F0F0FUL) | ((x & 0x0F0F0F0F0F0F0F0FUL) << 4 ); + x = ((x >> 8) & 0x00FF00FF00FF00FFUL) | ((x & 0x00FF00FF00FF00FFUL) << 8 ); + x = ((x >> 16) & 0x0000FFFF0000FFFFUL) | ((x & 0x0000FFFF0000FFFFUL) << 16); + x = (x >> 32) | (x << 32); + return x; +} + +#endif diff --git a/src/computeLayout.hpp b/src/computeLayout.hpp index 90e3c84..7395ee7 100644 --- a/src/computeLayout.hpp +++ b/src/computeLayout.hpp @@ -79,6 +79,7 @@ namespace shasta { const Graph&, const std::map<typename Graph::edge_descriptor, double>& edgeLengthMap, std::map<typename Graph::vertex_descriptor, array<double, 2> >& positionMap, + uint64_t quality, double timeout); } @@ -214,6 +215,7 @@ template<class Graph> shasta::ComputeLayoutReturnCode shasta::computeLayoutCusto const Graph& graph, const std::map<typename Graph::edge_descriptor, double>& edgeLengthMap, std::map<typename Graph::vertex_descriptor, array<double, 2> >& positionMap, + uint64_t quality, double timeout) { using 
vertex_descriptor = typename Graph::vertex_descriptor; @@ -251,7 +253,8 @@ template<class Graph> shasta::ComputeLayoutReturnCode shasta::computeLayoutCusto // Invoke the custom graph layout program. const string outputFileName = tmpDirectory() + uuid + "-output.txt"; - const string command = "customLayout -i " + inputFileName + " -o " + outputFileName ; + const string command = "customLayout -i " + inputFileName + " -o " + outputFileName + + " --quality " + to_string(quality); bool timeoutTriggered = false; bool signalOccurred = false; int returnCode = 0; diff --git a/src/copyNumber.hpp b/src/copyNumber.hpp index f0d4ef2..9c33d07 100644 --- a/src/copyNumber.hpp +++ b/src/copyNumber.hpp @@ -2,7 +2,7 @@ #define SHASTA_COPY_NUMBER_HPP #include "prefixLength.hpp" -#include "span.hpp" +#include "SHASTA_ASSERT.hpp" #include "cstdint.hpp" namespace shasta { @@ -42,21 +42,6 @@ template<class Container> uint64_t shasta::isCopyNumberDifference( } SHASTA_ASSERT(nx < ny); - // If the length difference is not a multiple of one of the allowed periods, - // return 0. - const uint64_t dn = ny - nx; - bool found = false; - for(uint64_t period=2; period<=maxPeriod; period++) { - if((dn % period) == 0) { - found = true; - break; - } - } - if(not found) { - return 0; - } - - const uint64_t prefixLength = commonPrefixLength(x, y); const uint64_t suffixLength = commonSuffixLength(x, y); @@ -79,6 +64,7 @@ template<class Container> uint64_t shasta::isCopyNumberDifference( // If getting here, x and y differ by an insertion in iy of range [iy, jy). + const uint64_t dn = ny - nx; SHASTA_ASSERT(ix == jx); SHASTA_ASSERT(jy - iy == dn); @@ -86,7 +72,7 @@ template<class Container> uint64_t shasta::isCopyNumberDifference( // Check for k base repeat. // We kept the entire common prefix, so we can check just to the left of the insertion. 
- for(uint64_t period=2; period<=maxPeriod; period++) { + for(uint64_t period=1; period<=maxPeriod; period++) { if((dn % period) != 0) { continue; } diff --git a/src/deduplicate.hpp b/src/deduplicate.hpp index 932d1ae..b04a6a2 100644 --- a/src/deduplicate.hpp +++ b/src/deduplicate.hpp @@ -62,11 +62,108 @@ namespace shasta { } + + // Remove duplicate elements in a vector and count occurrences of each. + // Keep only the ones that occur at least minCount times. + template<class T, class Int> void deduplicateAndCountWithThreshold( + vector<T>& v, + vector<Int>& count, + Int minCount + ) + { + // Clear the count vector. + count.clear(); + + // If the given vector is empty, return now. + if(v.empty()) { + return; + } + + // Sort the vector. + sort(v.begin(), v.end()); + + // Add elements, keeping track of the number + // of occurrences of each. + typename vector<T>::iterator output = v.begin(); + typename vector<T>::iterator input = v.begin(); + while(input != v.end()) { + + + // Count how many there are. + typename vector<T>::iterator it = input; + while(it!=v.end() && *it==*input) { + ++it; + } + const Int n = Int(it - input); + + if(n >= minCount) { + + // Store this element. + *output = *input; + ++output; + + // Store the count. + count.push_back(n); + } + + // Update our output iterator. + input = it; + + } + v.resize(count.size()); + } + + + + // Remove duplicate elements in a vector and count occurrences of each. + // Keep only the ones that occur exactly once. + template<class T> void deduplicateAndCountAndKeepUnique( + vector<T>& v) + { + + // If the given vector is empty, return now. + if(v.empty()) { + return; + } + + // Sort the vector. + sort(v.begin(), v.end()); + + // Add elements, keeping track of the number + // of occurrences of each. + typename vector<T>::iterator output = v.begin(); + typename vector<T>::iterator input = v.begin(); + while(input != v.end()) { + + + // Count how many there are. 
+ typename vector<T>::iterator it = input; + while(it!=v.end() && *it==*input) { + ++it; + } + const uint64_t n = it - input; + + if(n == 1) { + + // Store this element. + *output = *input; + ++output; + } + + // Update our output iterator. + input = it; + + } + v.resize(output - v.begin()); + } + + + inline void testDeduplicateAndCount() { vector<int> v = {7, 4, 5, 7, 4, 18, 2, 4}; vector<int> count; - deduplicateAndCount(v, count); + deduplicateAndCountWithThreshold(v, count, 2); SHASTA_ASSERT(v.size() == count.size()); for(uint64_t i=0; i<v.size(); i++) { cout << v[i] << " " << count[i] << endl; diff --git a/src/enumeratePaths.cpp b/src/enumeratePaths.cpp new file mode 100644 index 0000000..1e7ca68 --- /dev/null +++ b/src/enumeratePaths.cpp @@ -0,0 +1,50 @@ +#include "enumeratePaths.hpp" +using namespace shasta; + +#include <boost/graph/adjacency_list.hpp> + +#include "iostream.hpp" + + +void shasta::testEnumeratePaths() +{ + using Graph = boost::adjacency_list< + boost::vecS, + boost::vecS, + boost::bidirectionalS>; + Graph graph(10); + using edge_descriptor = Graph::edge_descriptor; + using Path = vector<edge_descriptor>; + + class PathInspector { + public: + const Graph& graph; + uint64_t length; + PathInspector(const Graph& graph, uint64_t length) : graph(graph), length(length) {} + void operator()(const Path& path) + { + if(path.size() == length) { + for(const edge_descriptor e: path) { + cout << source(e, graph) << "->" << target(e, graph) << " "; + } + cout << endl; + } + } + }; + const uint64_t length = 4; + PathInspector pathInspector(graph, length); + + add_edge(0, 1, graph); + add_edge(1, 2, graph); + add_edge(2, 3, graph); + add_edge(3, 4, graph); + add_edge(1, 5, graph); + add_edge(5, 6, graph); + add_edge(6, 7, graph); + add_edge(6, 3, graph); + add_edge(3, 8, graph); + add_edge(7, 9, graph); + + enumeratePaths(graph, 0, length, pathInspector); + +} diff --git a/src/enumeratePaths.hpp b/src/enumeratePaths.hpp index beb14e5..4e77fa1 100644 --- 
a/src/enumeratePaths.hpp +++ b/src/enumeratePaths.hpp @@ -5,18 +5,62 @@ #include <boost/graph/iteration_macros.hpp> #include "algorithm.hpp" +#include "iostream.hpp" #include <stack> #include "tuple.hpp" +#include "utility.hpp" #include "vector.hpp" namespace shasta { -template<class G> void enumerateSelfAvoidingPaths(const G&, - typename G::vertex_descriptor vA, typename G::vertex_descriptor vB, - vector<vector<typename G::edge_descriptor> > &paths); + template<class G> void enumerateSelfAvoidingPaths(const G&, + typename G::vertex_descriptor vA, typename G::vertex_descriptor vB, + vector<vector<typename G::edge_descriptor> > &paths); + + template<class G, class PathInspector> void enumeratePaths( + const G&, + typename G::vertex_descriptor v, + uint64_t pathLength, + PathInspector&); + template<class G, class PathInspector> void enumeratePathsRecursive( + const G&, + typename G::vertex_descriptor v, + uint64_t pathLength, + PathInspector&, + vector<typename G::edge_descriptor>& path); + + // Same, but in the reverse direction (backward paths). + template<class G, class PathInspector> void enumeratePathsReverse( + const G&, + typename G::vertex_descriptor v, + uint64_t pathLength, + PathInspector&); + template<class G, class PathInspector> void enumeratePathsReverseRecursive( + const G&, + typename G::vertex_descriptor v, + uint64_t pathLength, + PathInspector&, + vector<typename G::edge_descriptor>& path); + + // Similar to the above, but for paths of any length beginning at vA and ending at vB. 
+ template<class G, class PathInspector> void enumeratePathsBetween( + const G&, + typename G::vertex_descriptor vA, + typename G::vertex_descriptor vB, + PathInspector&); + template<class G, class PathInspector> void enumeratePathsBetweenRecursive( + const G&, + typename G::vertex_descriptor vA, + typename G::vertex_descriptor vB, + PathInspector&, + vector<typename G::edge_descriptor>& path); + + void testEnumeratePaths(); } + + // Enumerate self-avoiding paths starting at v0 and ending at v1. // Self-avoiding means that an edge cannot be used twice. template<class G> void shasta::enumerateSelfAvoidingPaths(const G &g, @@ -75,5 +119,101 @@ template<class G> void shasta::enumerateSelfAvoidingPaths(const G &g, } } + + +// In a directed graph of type G, +// enumerate all paths starting at v and with length (number of edges) +// up to pathLength. +// For each path found, apply the given function object by calling +// functionObject(path), where path is a vector<G::edge_descriptor> +template<class G, class PathInspector> void shasta::enumeratePaths( + const G& g, + typename G::vertex_descriptor v, + uint64_t maxPathLength, + PathInspector& pathInspector) +{ + vector<typename G::edge_descriptor> path; + enumeratePathsRecursive(g, v, maxPathLength, pathInspector, path); +} +template<class G, class PathInspector> void shasta::enumeratePathsRecursive( + const G& g, + typename G::vertex_descriptor v, + uint64_t maxPathLength, + PathInspector& pathInspector, + vector<typename G::edge_descriptor>& path) +{ + if(maxPathLength == 0) { + return; + } + BGL_FORALL_OUTEDGES_T(v, e, g, G) { + path.push_back(e); + pathInspector(path); + enumeratePathsRecursive(g, target(e, g), maxPathLength - 1, pathInspector, path); + path.pop_back(); + } +} + + + +template<class G, class PathInspector> void shasta::enumeratePathsReverse( + const G& g, + typename G::vertex_descriptor v, + uint64_t maxPathLength, + PathInspector& pathInspector) +{ + vector<typename G::edge_descriptor> path; + 
enumeratePathsReverseRecursive(g, v, maxPathLength, pathInspector, path); +} +template<class G, class PathInspector> void shasta::enumeratePathsReverseRecursive( + const G& g, + typename G::vertex_descriptor v, + uint64_t maxPathLength, + PathInspector& pathInspector, + vector<typename G::edge_descriptor>& path) +{ + if(maxPathLength == 0) { + return; + } + BGL_FORALL_INEDGES_T(v, e, g, G) { + path.push_back(e); + pathInspector(path); + enumeratePathsReverseRecursive(g, source(e, g), maxPathLength - 1, pathInspector, path); + path.pop_back(); + } +} + + +// In a directed graph of type G, +// enumerate all paths of any length starting at vA ending at vB. +// For each path found, apply the given function object by calling +// functionObject(path), where path is a vector<G::edge_descriptor> +template<class G, class PathInspector> void shasta::enumeratePathsBetween( + const G& g, + typename G::vertex_descriptor vA, + typename G::vertex_descriptor vB, + PathInspector& pathInspector) +{ + vector<typename G::edge_descriptor> path; + enumeratePathsBetweenRecursive(g, vA, vB, pathInspector, path); +} +template<class G, class PathInspector> void shasta::enumeratePathsBetweenRecursive( + const G& g, + typename G::vertex_descriptor vA, + typename G::vertex_descriptor vB, + PathInspector& pathInspector, + vector<typename G::edge_descriptor>& path) +{ + BGL_FORALL_OUTEDGES_T(vA, e, g, G) { + path.push_back(e); + typename G::vertex_descriptor vC = target(e, g); + if(vC == vB) { + pathInspector(path); + } else { + enumeratePathsBetweenRecursive(g, vC, vB, pathInspector, path); + } + path.pop_back(); + } +} + #endif diff --git a/src/findLinearChains.hpp b/src/findLinearChains.hpp index 64fb35b..b260943 100644 --- a/src/findLinearChains.hpp +++ b/src/findLinearChains.hpp @@ -225,7 +225,15 @@ template<class Graph> void shasta::findLinearVertexChains( // Check that all vertices were found. 
- SHASTA_ASSERT(verticesFound.size() == num_vertices(graph)); + // Just using num_vertices does not work if the graph is a filtered_graph. + // SHASTA_ASSERT(verticesFound.size() == num_vertices(graph)); + uint64_t vertexCount = 0; + BGL_FORALL_VERTICES_T(v, graph, Graph) { + if(v != Graph::null_vertex()) { // Just to avoid compiler warning. + ++vertexCount; + } + } + SHASTA_ASSERT(verticesFound.size() == vertexCount); } diff --git a/src/globalMsa.cpp b/src/globalMsa.cpp new file mode 100644 index 0000000..9831d75 --- /dev/null +++ b/src/globalMsa.cpp @@ -0,0 +1,471 @@ +// Shasta. +#include "globalMsa.hpp" +#include "Base.hpp" +#include "deduplicate.hpp" +#include "invalid.hpp" +#include "orderPairs.hpp" +#include "SHASTA_ASSERT.hpp" +#include "ShortBaseSequence.hpp" + +// Spoa. +#include "spoa/spoa.hpp" + +// Standard library. +#include "algorithm.hpp" +#include <map> +#include "tuple.hpp" + +// See the comments in globalMsa.hpp. + + + +void shasta::globalMsa( + const vector< pair<vector<Base>, uint64_t> >& sequences, + uint64_t maxSpoaLength, + uint64_t kmerLength, + vector<Base>& consensus + ) +{ + const bool debug = true; + if(debug) { + cout << "globalMsa called with " << sequences.size() << " sequences with (length,weight):" << endl; + uint64_t totalWeight = 0; + for(const auto& p: sequences) { + cout << "(" << p.first.size() << "," << p.second << ") "; + totalWeight += p.second; + } + cout << endl; + cout << "Total weight is " << totalWeight << endl; + } + + // Sanity check. + SHASTA_ASSERT(not sequences.empty()); + + using Kmer = ShortBaseSequence64; + SHASTA_ASSERT(kmerLength <= Kmer::capacity); + + // Trivial case. + if(sequences.size() == 1) { + consensus = sequences.front().first; + return; + } + + // Compute the maximum length of the input sequences. + uint64_t maxLength = 0; + for(const auto& p: sequences) { + maxLength = max(maxLength, p.first.size()); + } + + // If short enough, use spoa. 
+ if(maxLength <= maxSpoaLength) { + if(debug) { + cout << "Using spoa." << endl; + } + globalMsaSpoa(sequences, consensus); + return; + } + + + + // Create a table of unique k-mers for each of the sequences. + class KmerInfo { + public: + Kmer kmer; + uint64_t position; + bool operator<(const KmerInfo& that) const + { + return kmer.data < that.kmer.data; + } + bool operator==(const KmerInfo& that) const + { + return kmer.data == that.kmer.data; + } + }; + vector< vector<KmerInfo> > kmerTable1(sequences.size()); + + for(uint64_t i=0; i<sequences.size(); i++) { + const vector<Base>& sequence = sequences[i].first; + if(false) { + cout << "Finding unique k-mers for sequence of length " << sequence.size() << endl; + } + vector<KmerInfo>& kmerInfos = kmerTable1[i]; + + Kmer kmer; + for(uint64_t position=0; position<kmerLength; position++) { + kmer.set(position, sequence[position]); + } + + for(uint64_t position=0; /* Check later */; position++) { + kmerInfos.push_back({kmer, position}); + + if(position + kmerLength == sequence.size()) { + break; + } + + // Update the k-mer. + kmer.shiftLeft(); + kmer.set(kmerLength - 1, sequence[position + kmerLength]); + } + SHASTA_ASSERT(kmerInfos.size() == sequence.size() - kmerLength + 1); + + // Only keep the k-mers that appear once. + if(false) { + cout << kmerInfos.size() << " total kmers." << endl; + } + deduplicateAndCountAndKeepUnique(kmerInfos); + if(false) { + cout << kmerInfos.size() << " unique kmers." << endl; + } + } + + + + // Create a global table of unique k-mers in all the sequences. 
+ class KmerData { + public: + Kmer kmer; + uint64_t sequenceIndex; + uint64_t position; + bool operator<(const KmerData& that) const + { + return tie(kmer.data, sequenceIndex) < tie(that.kmer.data, that.sequenceIndex); + } + }; + vector<KmerData> kmerTable2; + for(uint64_t sequenceIndex=0; sequenceIndex<sequences.size(); sequenceIndex++) { + const vector<KmerInfo>& kmerInfos = kmerTable1[sequenceIndex]; + for(const KmerInfo& kmerInfo: kmerInfos) { + kmerTable2.push_back({kmerInfo.kmer, sequenceIndex, kmerInfo.position}); + } + } + sort(kmerTable2.begin(), kmerTable2.end()); + + + + // Now construct a third table that for each unique k-mer + // gives the sequence indexes and positions the k-mer appears in. + class UniqueKmerInfo { + public: + Kmer kmer; + uint64_t totalWeight = 0; + uint64_t minDistanceFromEnds = invalid<uint64_t>; + class Occurrence { + public: + uint64_t sequenceIndex; + uint64_t position; + }; + vector<Occurrence> occurrences; + bool operator<(const UniqueKmerInfo& that) const + { + return tie(totalWeight, minDistanceFromEnds) > tie(that.totalWeight, that.minDistanceFromEnds); + } + void write(ostream& s, uint64_t kmerLength) const + { + kmer.write(s, kmerLength); + s << " " << totalWeight; + s << " " << minDistanceFromEnds; + for(const auto& occurrence: occurrences) { + s << " (" << occurrence.sequenceIndex << "," << + occurrence.position << ")"; + } + s << endl; + } + }; + vector<UniqueKmerInfo> kmerTable3; + for(auto it=kmerTable2.begin(); it!= kmerTable2.end(); /* Increment later */) { + const Kmer kmer = it->kmer; + + // Find the end of the streak for the same kmer. + auto jt = it; + while(true) { + if(jt == kmerTable2.end()) { + break; + } + if(jt->kmer != kmer) { + break; + } + ++jt; + } + + // Store this streak in kmerTable3. 
+ UniqueKmerInfo uniqueKmerInfo; + uniqueKmerInfo.kmer = kmer; + for(; it!=jt; it++) { + const uint64_t sequenceIndex = it->sequenceIndex; + const uint64_t sequenceLength = sequences[sequenceIndex].first.size(); + const uint64_t position = it->position; + const uint64_t distanceFromLeft = position; + const uint64_t distanceFromRight = sequenceLength - kmerLength - position; + const uint64_t distanceFromEnds = min(distanceFromLeft, distanceFromRight); + uniqueKmerInfo.occurrences.push_back({sequenceIndex, it->position}); + uniqueKmerInfo.totalWeight += sequences[it->sequenceIndex].second; + uniqueKmerInfo.minDistanceFromEnds = min(uniqueKmerInfo.minDistanceFromEnds, distanceFromEnds); + } + kmerTable3.push_back(uniqueKmerInfo); + } + sort(kmerTable3.begin(), kmerTable3.end()); + + + if(false) { + for(const auto& uniqueKmerInfo: kmerTable3) { + uniqueKmerInfo.write(cout, kmerLength); + } + } + + // The first entry in kmerTable3 gives the optimal splitting kmer, + // the sequences involved (all of them, in most cases), + // and the position of the splitting k-mer in each of the sequences. + SHASTA_ASSERT(not kmerTable3.empty()); + const UniqueKmerInfo& optimalSplitting = kmerTable3.front(); + if(debug) { + cout << "Splitting at "; + optimalSplitting.write(cout, kmerLength); + } + + + // Prepare the sequences for the left and right MSA.
+ vector< pair<vector<Base>, uint64_t> > leftSequences; + vector< pair<vector<Base>, uint64_t> > rightSequences; + vector<Base> leftConsensus; + vector<Base> rightConsensus; + for(const auto& occurrence: optimalSplitting.occurrences) { + const uint64_t sequenceIndex = occurrence.sequenceIndex; + const auto& p = sequences[sequenceIndex]; + const vector<Base>& sequence = p.first; + const uint64_t weight = p.second; + const uint64_t position = occurrence.position; + leftSequences.push_back(make_pair(vector<Base>(), weight)); + rightSequences.push_back(make_pair(vector<Base>(), weight)); + vector<Base>& leftSequence = leftSequences.back().first; + vector<Base>& rightSequence = rightSequences.back().first; + copy(sequence.begin(), sequence.begin() + position, + back_inserter(leftSequence)); + copy(sequence.begin() + position + kmerLength, sequence.end(), + back_inserter(rightSequence)); + } + + // Recursive call to do the left and right MSA. + globalMsa(leftSequences , maxSpoaLength, kmerLength, leftConsensus); + globalMsa(rightSequences, maxSpoaLength, kmerLength, rightConsensus); + + // Now stitch the pieces together. + consensus = leftConsensus; + for(uint64_t position=0; position<kmerLength; position++) { + consensus.push_back(optimalSplitting.kmer[position]); + } + copy(rightConsensus.begin(), rightConsensus.end(), + back_inserter(consensus)); +} + + + +// This just uses spoa. +// It cannot be used for very long sequences due to quadratic +// memory and time. Practical limit is a few thousand bases. +void shasta::globalMsaSpoa( + const vector< pair<vector<Base>, uint64_t> >& sequences, + vector<Base>& consensus + ) +{ + // Sanity check. + SHASTA_ASSERT(not sequences.empty()); + + // Trivial case. + if(sequences.size() == 1) { + consensus = sequences.front().first; + return; + } + + // We want to enter the sequences in order of decreasing weight. + // Create a table of pairs (sequenceIndex, weight) + // where sequenceIndex is the index in the sequences vector. 
+ // Then sort by decreasing weight. + vector< pair<uint64_t, uint64_t> > sequencesTable; + for(uint64_t sequenceIndex=0; sequenceIndex<sequences.size(); sequenceIndex++) { + const auto& p = sequences[sequenceIndex]; + const uint64_t weight = p.second; + sequencesTable.push_back(make_pair(sequenceIndex, weight)); + } + sort(sequencesTable.begin(), sequencesTable.end(), + OrderPairsBySecondOnlyGreater<uint64_t, uint64_t>()); + + // Create the spoa alignment engine and alignment graph. + const spoa::AlignmentType alignmentType = spoa::AlignmentType::kNW; + const int8_t match = 1; + const int8_t mismatch = -1; + const int8_t gap = -1; + auto spoaAlignmentEngine = spoa::AlignmentEngine::Create(alignmentType, match, mismatch, gap); + spoa::Graph spoaAlignmentGraph; + + // Add the sequences to the MSA in order of decreasing weight. + string sequenceString; + for(uint64_t indexByWeight=0; indexByWeight<sequencesTable.size(); indexByWeight++) { + const auto& p = sequencesTable[indexByWeight]; + const uint64_t sequenceIndex = p.first; + const uint64_t weight = p.second; + const auto& q = sequences[sequenceIndex]; + SHASTA_ASSERT(q.second == weight); + const vector<Base>& sequence = q.first; + + sequenceString.clear(); + for(const Base base: sequence) { + sequenceString += base.character(); + } + auto alignment = spoaAlignmentEngine->Align(sequenceString, spoaAlignmentGraph); + spoaAlignmentGraph.AddAlignment(alignment, sequenceString, uint32_t(weight)); + } + + // Get the MSA alignment. + // The true argument causes a final alignment entry equal to the consensus. + vector<string> alignment = spoaAlignmentGraph.GenerateMultipleSequenceAlignment(false); + SHASTA_ASSERT(alignment.size() == sequencesTable.size()); + + // Compute coverage at each alignment position for each of the 5 AlignedBases. 
+ const uint64_t alignmentLength = alignment.front().size(); + vector< array<uint64_t, 5> > coverage(alignmentLength, {0, 0, 0, 0, 0}); + for(uint64_t indexByWeight=0; indexByWeight<sequencesTable.size(); indexByWeight++) { + const string& alignmentRow = alignment[indexByWeight]; + SHASTA_ASSERT(alignmentRow.size() == alignmentLength); + for(uint64_t position=0; position<alignmentLength; position++) { + const AlignedBase b = AlignedBase::fromCharacter(alignmentRow[position]); + coverage[position][b.value] += sequencesTable[indexByWeight].second; + } + } + + // Compute coverage-based consensus at each alignment position. + vector<AlignedBase> alignedConsensus; + for(const auto& c: coverage) { + const uint64_t iBase = std::max_element(c.begin(), c.end()) - c.begin(); + alignedConsensus.push_back(AlignedBase::fromInteger(iBase)); + } + SHASTA_ASSERT(alignedConsensus.size() == alignmentLength); + + // Take out the gaps. + consensus.clear(); + for(const AlignedBase b: alignedConsensus) { + if(not b.isGap()) { + consensus.push_back(Base(b)); + } + } +} + + + +// This just uses spoa. +// It cannot be used for very long sequences due to quadratic +// memory and time. Practical limit is a few thousand bases. +// Version that returns the alignment. +// THE SEQUENCES MUST BE PASSED IN ORDER OF DECREASING WEIGHT. +void shasta::globalMsaSpoa( + const vector< pair<vector<Base>, uint64_t> >& sequences, + vector< vector<AlignedBase> >& alignmentArgument + ) +{ + // Sanity check. + SHASTA_ASSERT(not sequences.empty()); + + // Check that the sequences are ordered by decreasing weight. + for(uint64_t i=1; i<sequences.size(); i++) { + SHASTA_ASSERT(sequences[i-1].second >= sequences[i].second); + } + + // Create the spoa alignment engine and alignment graph. 
+ const spoa::AlignmentType alignmentType = spoa::AlignmentType::kNW; + const int8_t match = 1; + const int8_t mismatch = -1; + const int8_t gap = -1; + auto spoaAlignmentEngine = spoa::AlignmentEngine::Create(alignmentType, match, mismatch, gap); + spoa::Graph spoaAlignmentGraph; + + // Add the sequences to the MSA in order of decreasing weight. + string sequenceString; + for(uint64_t i=0; i<sequences.size(); i++) { + const auto& p = sequences[i]; + const vector<Base>& sequence = p.first; + const uint64_t weight = p.second; + + sequenceString.clear(); + for(const Base base: sequence) { + sequenceString += base.character(); + } + auto alignment = spoaAlignmentEngine->Align(sequenceString, spoaAlignmentGraph); + spoaAlignmentGraph.AddAlignment(alignment, sequenceString, uint32_t(weight)); + } + + // Get the MSA alignment. + // The true argument causes a final alignment entry equal to the consensus. + vector<string> alignment = spoaAlignmentGraph.GenerateMultipleSequenceAlignment(false); + SHASTA_ASSERT(alignment.size() == sequences.size()); + + // Copy it to alignmentArgument. + alignmentArgument.clear(); + alignmentArgument.resize(alignment.size()); + for(uint64_t i=0 ; i<alignment.size(); i++) { + const string& alignmentRow = alignment[i]; + vector<AlignedBase>& alignmentArgumentRow = alignmentArgument[i]; + alignmentArgumentRow.resize(alignmentRow.size()); + for(uint64_t j=0; j<alignmentRow.size(); j++) { + alignmentArgumentRow[j] = AlignedBase::fromCharacter(alignmentRow[j]); + } + } + +} + + + +// Version that enforces a maximum MSA length and returns false if it is exceeded. 
+bool shasta::globalMsaSpoa( + const vector< pair<vector<Base>, uint64_t> >& sequences, + vector<Base>& consensus, + uint64_t maximumMsaLength + ) +{ + if(sequences.size() > 1) { + uint64_t maxLength = 0; + for(const auto& sequence: sequences) { + maxLength = max(maxLength, sequence.first.size()); + } + if(maxLength > maximumMsaLength) { + return false; + } + } + + // If getting here, the MSA is no longer than the specified maximum length + // (or it is trivial, consisting of just one sequence). + globalMsaSpoa(sequences, consensus); + return true; +} + + + +// Python-callable version. +std::string shasta::globalMsaPython( + const vector< pair<string, uint64_t> >& sequenceStrings, + uint64_t maxSpoaLength, + uint64_t kmerLength) +{ + // Extract the sequences. + vector< pair<vector<Base>, uint64_t> > sequences; + sequences.reserve(sequenceStrings.size()); + for(const auto& p: sequenceStrings) { + sequences.resize(sequences.size() + 1); + sequences.back().second = p.second; + const string& sequenceString = p.first; + vector<Base>& sequence = sequences.back().first; + for(const char c: sequenceString) { + sequence.push_back(Base::fromCharacter(c)); + } + } + + // Do the MSA. + vector<Base> consensus; + globalMsa(sequences, maxSpoaLength, kmerLength, consensus); + + // Construct the consensus string and return it. + string consensusString; + for(const Base b: consensus) { + consensusString.push_back(b.character()); + } + return consensusString; +} + diff --git a/src/globalMsa.hpp b/src/globalMsa.hpp new file mode 100644 index 0000000..700bf03 --- /dev/null +++ b/src/globalMsa.hpp @@ -0,0 +1,62 @@ +#ifndef SHASTA_GLOBAL_MSA_HPP +#define SHASTA_GLOBAL_MSA_HPP + +/******************************************************************************* + +Global multiple sequence alignment. +Global means constrained on both sides, aka Needleman–Wunsch. + +This supports sequences of arbitrary length. +If all the sequences are at most maxSpoaLength long, +this invokes spoa. 
+
+Otherwise it finds a common subsequence of length kmerLength
+and splits the MSA at that location, invoking itself recursively
+to solve the two MSAs.
+
+Each of the input sequences is passed in as a pair.
+The second member of the pair is the "weight" of the sequence
+(that is, typically the number of reads with that sequence).
+
+*******************************************************************************/
+
+#include "cstdint.hpp"
+#include "utility.hpp"
+#include "string.hpp"
+#include "vector.hpp"
+
+namespace shasta {
+
+    class Base;
+    class AlignedBase;
+
+    // General entry point. See the comment at the top of this file
+    // for the roles of maxSpoaLength and kmerLength.
+    // The consensus sequence is stored in the last argument.
+    void globalMsa(
+        const vector< pair<vector<Base>, uint64_t> >& sequences,
+        uint64_t maxSpoaLength,
+        uint64_t kmerLength,
+        vector<Base>& consensus
+        );
+
+    // Versions that always use spoa.
+
+    // Version without a length limit.
+    // NOTE(review): presumably stores the spoa consensus in the last argument - confirm in globalMsa.cpp.
+    void globalMsaSpoa(
+        const vector< pair<vector<Base>, uint64_t> >& sequences,
+        vector<Base>& consensus
+        );
+
+    // Version with a length limit. If there is more than one sequence
+    // and the longest one is longer than maximumMsaLength, this does nothing
+    // and returns false. Otherwise it calls the version above and returns true.
+    bool globalMsaSpoa(
+        const vector< pair<vector<Base>, uint64_t> >& sequences,
+        vector<Base>& consensus,
+        uint64_t maximumMsaLength
+        );
+
+    // Version that returns the rows of the MSA instead of the consensus.
+    // On return, the alignment has one row for each of the input sequences.
+    void globalMsaSpoa(
+        const vector< pair<vector<Base>, uint64_t> >& sequences,
+        vector< vector<AlignedBase> >& alignment
+        );
+
+    // Python-callable version.
+    string globalMsaPython(
+        const vector< pair<string, uint64_t> >& sequenceStrings,
+        uint64_t maxSpoaLength,
+        uint64_t kmerLength
+        );
+}
+
+#endif
diff --git a/src/html.cpp b/src/html.cpp
index c30c98e..83aa323 100644
--- a/src/html.cpp
+++ b/src/html.cpp
@@ -176,6 +176,7 @@ function zoomSvg(factor)
     y = yCenter - 0.5 * height;
 
     svg.setAttribute('viewBox', `${x} ${y} ${width} ${height}`);
+    svg.setAttribute('font-size', svg.getAttribute('font-size') / factor);
 
     return false;
 }
@@ -185,3 +186,13 @@ function zoomSvg(factor)
 }
 
 
+
+
+// Write a blue information icon (ⓘ) with the given message as its title attribute (tooltip).
+// The message is written as is, without HTML escaping, inside a double quoted attribute,
+// so callers must not pass text containing unescaped double quotes.
+void shasta::writeInformationIcon(ostream& html, const string& message)
+{
+    html << "<span style='color:Blue;font-weight:bold' title=\"" <<
+        message << "\">ⓘ</span>";
+}
diff --git a/src/html.hpp b/src/html.hpp
index 9702b86..aa90bdc 100644
--- a/src/html.hpp
+++ b/src/html.hpp
@@ -13,6 +13,8 @@ namespace shasta {
 
     void writeStyle(ostream&);
     void addSvgDragAndZoom(ostream& html);
+
+    void writeInformationIcon(ostream& html, const string& message);
 }
 
 #endif
diff --git a/src/invalid.hpp b/src/invalid.hpp
index 32bb79a..2f14c27 100644
--- a/src/invalid.hpp
+++ b/src/invalid.hpp
@@ -1,14 +1,14 @@
 #ifndef SHASTA_INVALID_HPP
 #define SHASTA_INVALID_HPP
 
-// In many contexts, we use invalid<uint64_t> (or similar for other integer types)
+// In many contexts, we use invalid<T>
 // to indicate a value that is invalid, uninitialized, or unknown.
-#include <concepts> #include <numeric> namespace shasta { - template<std::integral Int> static const Int invalid = std::numeric_limits<Int>::max(); + template<class T> static const T invalid = std::numeric_limits<T>::max(); + template<class T> static const T unlimited = std::numeric_limits<T>::max(); } #endif diff --git a/src/localTransitiveReduction.hpp b/src/localTransitiveReduction.hpp new file mode 100644 index 0000000..b4eb07d --- /dev/null +++ b/src/localTransitiveReduction.hpp @@ -0,0 +1,115 @@ +#ifndef SHASTA_LOCAL_TRANSITIVE_REDUCTION_HPP +#define SHASTA_LOCAL_TRANSITIVE_REDUCTION_HPP + +// Boost libraries. +#include <boost/graph/adjacency_list.hpp> +#include <boost/graph/iteration_macros.hpp> + +// Standard library. +#include <queue> +#include "vector.hpp" + +namespace shasta { + template<class Graph> void localTransitiveReduction( + const Graph&, + uint64_t maxPathLength, + vector<typename Graph::edge_descriptor>& nonTransitiveReductionEdges); +} + + + +// For each directed edge v0->v1, look for a path that: +// - Starts at v0. +// - Ends at v1. +// - Has length at most maxPathLength; +// - Does not use edge v0>v1; +// If such a path is found, the edge descriptor is added to nonTransitiveReductionEdges. +// The edges that are not in nonTransitiveReductionEdges form a sort of "local transitive reduction" +// of the graph. +template<class Graph> void shasta::localTransitiveReduction( + const Graph& graph, + uint64_t maxPathLength, + vector<typename Graph::edge_descriptor>& nonTransitiveReductionEdges) +{ + + using namespace boost; + using vertex_descriptor = typename Graph::vertex_descriptor; + // using edge_descriptor = typename Graph::edge_descriptor; + + // Check the Graph type. 
+ static_assert( + std::is_same<typename Graph::directed_selector, directedS>::value + or + std::is_same<typename Graph::directed_selector, bidirectionalS>::value, + "shasta::transitiveReduction requires an adjacency_list " + "with the third template argument set to boost::directedS or boost::bidirectionalS."); + + // Loop over all edges v0->v1. + nonTransitiveReductionEdges.clear(); + BGL_FORALL_EDGES_T(e01, graph, Graph) { + const vertex_descriptor v0 = source(e01, graph); + const vertex_descriptor v1 = target(e01, graph); + + // Do a BFS starting at v0, up to a distance maxPathLength. + // Stop if we encounter v1. + + // The BFS queue. + std::queue<vertex_descriptor> q; + q.push(v0); + + // The vertices we encountered so far, with their distance from v0. + std::map<vertex_descriptor, uint64_t> m; + m.insert({v0, 0}); + + // BFS loop. + // cout << "BFS loop begins for " << v0 << "->" << v1 << endl; + while(not q.empty()) { + + // Dequeue a vertex. + const vertex_descriptor vA = q.front(); + q.pop(); + const auto itA = m.find(vA); + SHASTA_ASSERT(itA != m.end()); + const uint64_t distanceA = itA->second; + const uint64_t distanceB = distanceA + 1; + // cout << "Dequeued " << vA << " at distance " << distanceA << endl; + + // Loop over the out-edges of vA. + bool endBfs = false; + BGL_FORALL_OUTEDGES_T(vA, eAB, graph, Graph) { + + // Dont's use e01 in the BFS. + if(eAB == e01) { + continue; + } + + // If we reached v1, mark e01 as a nonTransitiveReduction edge + // and stop the BFS. + const vertex_descriptor vB = target(eAB, graph); + if(vB == v1) { + nonTransitiveReductionEdges.push_back(e01); + endBfs = true; + // cout << "Reached " << v1 << endl; + break; + } + + // If we already reached this vertex, do nothing. + if(m.contains(vB)) { + continue; + } + + // If not at maximum distance, enqueue vB. 
+ if(distanceB < maxPathLength) { + q.push(vB); + m.insert({vB, distanceB}); + // cout << "Enqueued " << vB << " at distance " << distanceB << endl; + } + } + if(endBfs) { + break; + } + } + } +} + +#endif diff --git a/src/longestPath.cpp b/src/longestPath.cpp new file mode 100644 index 0000000..ce66d74 --- /dev/null +++ b/src/longestPath.cpp @@ -0,0 +1,23 @@ +#include "longestPath.hpp" +#include "iostream.hpp" +using namespace shasta; + +void shasta::testLongestPath() +{ + using Graph = boost::adjacency_list<boost::listS, boost::vecS, boost::bidirectionalS>; + Graph graph(7); + add_edge(0, 1, graph); + add_edge(1, 2, graph); + add_edge(2, 3, graph); + add_edge(4, 1, graph); + add_edge(2, 5, graph); + add_edge(6, 4, graph); + + vector<Graph::vertex_descriptor> longestPath; + shasta::longestPath(graph, longestPath); + + for(const auto v: longestPath) { + cout << v << " "; + } + cout << endl; +} diff --git a/src/longestPath.hpp b/src/longestPath.hpp new file mode 100644 index 0000000..0f92fa7 --- /dev/null +++ b/src/longestPath.hpp @@ -0,0 +1,114 @@ +#ifndef SHASTA_LONGEST_PATH_HPP +#define SHASTA_LONGEST_PATH_HPP + +// Boost libraries. +#include <boost/graph/adjacency_list.hpp> +#include <boost/graph/iteration_macros.hpp> +#include <boost/graph/topological_sort.hpp> + +// Standard library. +#include "algorithm.hpp" +#include <map> +#include "utility.hpp" +#include "vector.hpp" + +namespace shasta { + template<class Graph> void longestPath( + const Graph &graph, + vector<typename Graph::vertex_descriptor>& longestPath); + void testLongestPath(); +} + + + +// Find the longest path in a directed graph without cycles. +// Class Graph must be a boost::adjacency_list with +// the first three template arguments set to <listS, vecS, bidirectionalS>. +// If the graph has cycles, this throws boost::not_a_dag. 
+// This uses the algorithm described here: +// https://en.wikipedia.org/wiki/Longest_path_problem#Acyclic_graphs +template<class Graph> void shasta::longestPath( + const Graph &graph, + vector<typename Graph::vertex_descriptor>& longestPath) +{ + using namespace boost; + using vertex_descriptor = typename Graph::vertex_descriptor; + // using edge_descriptor = typename Graph::edge_descriptor; + // using edge_iterator = typename Graph::edge_iterator; + + // Check the Graph type. + // Use C++20 concepts instead. + static_assert( + std::is_same<typename Graph::out_edge_list_selector, listS>::value, + "shasta::transitiveReduction requires an adjacency_list " + "with the first template argument set to boost::listS."); + static_assert( + std::is_same<typename Graph::vertex_list_selector, vecS>::value, + "shasta::transitiveReduction requires an adjacency_list " + "with the second template argument set to boost::vecS."); + static_assert( + std::is_same<typename Graph::directed_selector, bidirectionalS>::value, + "shasta::transitiveReduction requires an adjacency_list " + "with the third template argument set to boost::bidirectionalS."); + + // Use boost topological_sort to get a vector of vertex descriptors + // in topological order. The output from the boost call is in + // reverse topological order. + vector<vertex_descriptor> sortedVertices; + topological_sort(graph, back_inserter(sortedVertices)); + std::reverse(sortedVertices.begin(), sortedVertices.end()); + + // Map to contain the length of the longest path ending at each vertex. + std::map<vertex_descriptor, uint64_t> lengthMap; + BGL_FORALL_VERTICES_T(v, graph, Graph) { + lengthMap.insert(make_pair(v, 0)); + } + + // Compute the maximum length of a path ending at each vertex. 
+ for(const vertex_descriptor v: sortedVertices) { + uint64_t maximumLength = 0; + BGL_FORALL_INEDGES_T(v, e, graph, Graph) { + maximumLength = max(maximumLength, lengthMap[source(e, graph)]); + } + lengthMap[v] = maximumLength + 1; + } + + // Find the vertex with the longest length. + // This will be the end of the longest path. + vertex_descriptor v = Graph::null_vertex(); + uint64_t maximumLength = 0; + for(const auto& p: lengthMap) { + if(p.second > maximumLength) { + v = p.first; + maximumLength = p.second; + } + } + + // Constuct the path, moving backward from here. + longestPath.clear(); + longestPath.push_back(v); + while(true) { + vertex_descriptor vPrevious = Graph::null_vertex(); + uint64_t maximumLength = 0; + BGL_FORALL_INEDGES_T(v, e, graph, Graph) { + const vertex_descriptor v0 = source(e, graph); + const uint64_t length = lengthMap[v0]; + if(length > maximumLength) { + vPrevious = v0; + maximumLength = length; + } + } + if(vPrevious == Graph::null_vertex()) { + break; + } + v = vPrevious; + longestPath.push_back(v); + + } + std::reverse(longestPath.begin(), longestPath.end()); + +} + + + +#endif diff --git a/src/markerAccessFunctions.cpp b/src/markerAccessFunctions.cpp new file mode 100644 index 0000000..7fff818 --- /dev/null +++ b/src/markerAccessFunctions.cpp @@ -0,0 +1,86 @@ +#include "markerAccessFunctions.hpp" +#include "extractKmer.hpp" +#include "Marker.hpp" +#include "Reads.hpp" +using namespace shasta; + + + +Kmer shasta::getOrientedReadMarkerKmer( + OrientedReadId orientedReadId, + uint32_t ordinal, + uint64_t k, + const Reads& reads, + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers + ) +{ + const ReadId readId = orientedReadId.getReadId(); + const Strand strand = orientedReadId.getStrand(); + + if(strand == 0) { + return getOrientedReadMarkerKmerStrand0(readId, ordinal, k, reads, markers); + } else { + return getOrientedReadMarkerKmerStrand1(readId, ordinal, k, reads, markers); + } + +} + + + +Kmer 
shasta::getOrientedReadMarkerKmerStrand0( + ReadId readId, + uint32_t ordinal0, + uint64_t k, + const Reads& reads, + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers + ) +{ + const auto read = reads.getRead(uint32_t(readId)); + const OrientedReadId orientedReadId0(readId, 0); + const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()]; + + Kmer kmer0; + extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0); + + return kmer0; +} + + + +Kmer shasta::getOrientedReadMarkerKmerStrand1( + ReadId readId, + uint32_t ordinal1, + uint64_t k, + const Reads& reads, + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers + ) +{ + + // We only have the read stored without reverse complement, so get it from there... + const auto read = reads.getRead(uint32_t(readId)); + const OrientedReadId orientedReadId0(readId, 0); + const auto orientedReadMarkers0 = markers[orientedReadId0.getValue()]; + const uint64_t readMarkerCount = orientedReadMarkers0.size(); + const uint64_t ordinal0 = readMarkerCount - 1 - ordinal1; + Kmer kmer0; + extractKmer(read, uint64_t(orientedReadMarkers0[ordinal0].position), k, kmer0); + + // ... then do the reverse complement. + const Kmer kmer1 = kmer0.reverseComplement(k); + return kmer1; +} + + + +// Get the marker KmerId for an oriented read and ordinal. 
+KmerId shasta::getOrientedReadMarkerKmerId( + OrientedReadId orientedReadId, + uint32_t ordinal, + uint64_t k, + const Reads& reads, + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers + ) +{ + const Kmer kmer = getOrientedReadMarkerKmer(orientedReadId, ordinal, k, reads, markers); + return KmerId(kmer.id(k)); +} diff --git a/src/markerAccessFunctions.hpp b/src/markerAccessFunctions.hpp new file mode 100644 index 0000000..0e9b7c9 --- /dev/null +++ b/src/markerAccessFunctions.hpp @@ -0,0 +1,53 @@ +#ifndef SHASTA_MARKER_ACCESS_FUNCTIONS_HPP + +#include "Kmer.hpp" +#include "ReadId.hpp" + +namespace shasta { + + class CompressedMarker; + class Reads; + namespace MemoryMapped { + template<class T, class Int> class VectorOfVectors; + } + + // Access functions for markers Kmers and KmerIds. + // There are similar member functions in class Assembler, + // but these are accessible anywhere else. + + Kmer getOrientedReadMarkerKmer( + OrientedReadId, + uint32_t ordinal, + uint64_t k, + const Reads&, + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers + ); + + Kmer getOrientedReadMarkerKmerStrand0( + ReadId, + uint32_t ordinal, + uint64_t k, + const Reads&, + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers + ); + + Kmer getOrientedReadMarkerKmerStrand1( + ReadId, + uint32_t ordinal, + uint64_t k, + const Reads&, + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers + ); + + // Get the marker KmerId for an oriented read and ordinal. + KmerId getOrientedReadMarkerKmerId( + OrientedReadId, + uint32_t ordinal, + uint64_t k, + const Reads&, + const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers + ); + +} + +#endif diff --git a/src/mode3-AssemblyGraph.cpp b/src/mode3-AssemblyGraph.cpp new file mode 100644 index 0000000..4b18a8e --- /dev/null +++ b/src/mode3-AssemblyGraph.cpp @@ -0,0 +1,8499 @@ +// Shasta. 
+#include "mode3-AssemblyGraph.hpp"
+#include "mode3-LocalAssembly.hpp"
+#include "mode3-PrimaryGraph.hpp"
+#include "mode3-PhasingTable.hpp"
+#include "Assembler.hpp"
+#include "AssemblerOptions.hpp"
+#include "copyNumber.hpp"
+#include "deduplicate.hpp"
+#include "diploidBayesianPhase.hpp"
+#include "dominatorTree.hpp"
+#include "enumeratePaths.hpp"
+#include "findLinearChains.hpp"
+#include "orderPairs.hpp"
+#include "performanceLog.hpp"
+#include "timestamp.hpp"
+using namespace shasta;
+using namespace mode3;
+
+// Boost libraries.
+#include <boost/archive/binary_oarchive.hpp>
+#include <boost/archive/binary_iarchive.hpp>
+#include <boost/graph/adj_list_serialize.hpp>
+#include <boost/graph/filtered_graph.hpp>
+#include <boost/pending/disjoint_sets.hpp>
+#include <boost/graph/reverse_graph.hpp>
+#include <boost/graph/strong_components.hpp>
+
+// Standard library.
+#include "fstream.hpp"
+#include <queue>
+#include "tuple.hpp"
+
+// Explicit instantiation.
+#include "MultithreadedObject.tpp"
+template class MultithreadedObject<AssemblyGraph>;
+
+
+// Create from a connected component of the PrimaryGraph, then call run.
+// If threadCount is 0, std::thread::hardware_concurrency() is used instead.
+// The newly created graph is also saved to AssemblyGraph-<componentId>.data
+// before run is called.
+AssemblyGraph::AssemblyGraph(
+    const PrimaryGraph& graph,
+    uint64_t componentId,
+    const Assembler& assembler,
+    uint64_t threadCount,
+    const Mode3AssemblyOptions& options,
+    bool assembleSequence,
+    bool debug) :
+    MultithreadedObject<AssemblyGraph>(*this),
+    componentId(componentId),
+    assembler(assembler),
+    options(options)
+{
+    // Adjust the numbers of threads, if necessary.
+    if(threadCount == 0) {
+        threadCount = std::thread::hardware_concurrency();
+    }
+
+    performanceLog << timestamp << "Creating the assembly graph for component " << componentId << endl;
+    create(graph, debug);
+
+    // Serialize it so we can restore it to facilitate debugging.
+    save("AssemblyGraph-" + to_string(componentId) + ".data");
+
+    performanceLog << timestamp << "Processing the assembly graph for component " << componentId << endl;
+    run(threadCount, assembleSequence, debug);
+    performanceLog << timestamp << "Done with the assembly graph for component " << componentId << endl;
+}
+
+
+
+// Load it from a binary archive, then call run.
+// If threadCount is 0, std::thread::hardware_concurrency() is used instead.
+// NOTE(review): componentId is not set in the initializer list here;
+// presumably load() restores it from the archive - confirm.
+AssemblyGraph::AssemblyGraph(
+    const string& fileName,
+    const Assembler& assembler,
+    uint64_t threadCount,
+    const Mode3AssemblyOptions& options,
+    bool assembleSequence,
+    bool debug) :
+    MultithreadedObject<AssemblyGraph>(*this),
+    assembler(assembler),
+    options(options)
+{
+    // Adjust the numbers of threads, if necessary.
+    if(threadCount == 0) {
+        threadCount = std::thread::hardware_concurrency();
+    }
+
+    load(fileName);
+    run(threadCount, assembleSequence, debug);
+}
+
+
+
+// Process the assembly graph. The stages are, in this order:
+// - Iterative bubble cleanup, then superbubble cleanup and removal of short superbubbles.
+// - Phasing of the bubble chains.
+// - Expansion of the bubble chains, followed by detangling.
+// - Edge renumbering.
+// - Sequence assembly, only if assembleSequence is true.
+// If debug is true, the graph is written out after each stage
+// using names "A" through "G".
+// If assembleSequence is false, the graph is written out at the end
+// with name "Final", regardless of debug.
+void AssemblyGraph::run(
+    uint64_t threadCount,
+    bool assembleSequence,
+    bool debug)
+{
+    const bool useBayesianModel = true;
+    // const uint64_t detangleWithSearchToleranceLow = 1;
+    // const uint64_t detangleWithSearchToleranceHigh = 6;
+    // const uint64_t optimizeChainsMinCommon = 3;
+    // const uint64_t optimizeChainsK = 100;
+
+    if(debug) write("A");
+
+    // Don't do any detangling before cleanup of bubbles and superbubbles and phasing.
+
+    // Cleanup bubbles and superbubbles.
+    // Must do compress to make sure all bubbles are in bubble chains.
+    compress();
+    // Iterate until an iteration does not clean up any bubbles.
+    for(uint64_t iteration=0; ; iteration ++) {
+        performanceLog << timestamp << "Iteration " << iteration <<
+            " of bubble cleanup begins." << endl;
+        const uint64_t cleanedUpBubbleCount = cleanupBubbles(
+            false,
+            options.assemblyGraphOptions.bubbleCleanupMaxOffset,
+            options.assemblyGraphOptions.chainTerminalCommonThreshold,
+            threadCount);
+        if(cleanedUpBubbleCount == 0) {
+            break;
+        }
+        if(debug) {
+            cout << "Cleaned up " << cleanedUpBubbleCount << " bubbles probably caused by errors." << endl;
+        }
+        compressBubbleChains();
+        compress();
+    }
+    if(debug) write("B");
+    cleanupSuperbubbles(false,
+        options.assemblyGraphOptions.superbubbleLengthThreshold1,
+        options.assemblyGraphOptions.chainTerminalCommonThreshold);
+    compress();
+
+    // Remove short superbubbles.
+    removeShortSuperbubbles(false,
+        options.assemblyGraphOptions.superbubbleLengthThreshold2,
+        options.assemblyGraphOptions.superbubbleLengthThreshold3);
+    compress();
+
+    // Phase.
+    compressBubbleChains();
+    if(debug) write("C");
+    phaseBubbleChainsUsingPhasingTable(
+        debug ? "C" : "",
+        options.assemblyGraphOptions.phaseErrorThreshold,
+        options.assemblyGraphOptions.bubbleErrorThreshold,
+        options.assemblyGraphOptions.longBubbleThreshold);
+    compress();
+
+    // For detangling, expand all bubble chains.
+    expand();
+
+    // Detangle.
+    // Each detangling step is preceded by compressSequentialEdges,
+    // repeated until it makes no changes, and by compressBubbleChains.
+    if(debug) write("D");
+    performanceLog << timestamp << "Detangling begins." << endl;
+    while(compressSequentialEdges());
+    compressBubbleChains();
+    detangleEdges(false,
+        options.assemblyGraphOptions.detangleToleranceLow,
+        options.assemblyGraphOptions.detangleToleranceHigh,
+        useBayesianModel,
+        options.assemblyGraphOptions.epsilon,
+        options.assemblyGraphOptions.minLogP);
+    while(compressSequentialEdges());
+    compressBubbleChains();
+    detangleVertices(false,
+        options.assemblyGraphOptions.detangleToleranceLow,
+        options.assemblyGraphOptions.detangleToleranceHigh,
+        useBayesianModel,
+        options.assemblyGraphOptions.epsilon,
+        options.assemblyGraphOptions.minLogP);
+    while(compressSequentialEdges());
+    compressBubbleChains();
+    detangleEdges(false,
+        options.assemblyGraphOptions.detangleToleranceLow,
+        options.assemblyGraphOptions.detangleToleranceHigh,
+        useBayesianModel,
+        options.assemblyGraphOptions.epsilon,
+        options.assemblyGraphOptions.minLogP);
+    detangleShortSuperbubbles(false,
+        options.assemblyGraphOptions.superbubbleLengthThreshold4,
+        options.assemblyGraphOptions.detangleToleranceLow,
+        options.assemblyGraphOptions.detangleToleranceHigh,
+        useBayesianModel,
+        options.assemblyGraphOptions.epsilon,
+        options.assemblyGraphOptions.minLogP);
+    performanceLog << timestamp << "Detangling ends." << endl;
+
+    compress();
+    compressBubbleChains();
+    if(debug) write("E");
+
+#if 0
+    // Optimize the chains.
+    optimizeChains(
+        false,
+        optimizeChainsMinCommon,
+        optimizeChainsK);
+#endif
+
+    // Before final output, renumber the edges contiguously.
+    renumberEdges();
+    if(debug) write("F");
+
+    if(assembleSequence) {
+
+        // Assemble sequence.
+        assembleAllChainsMultithreaded(
+            options.assemblyGraphOptions.chainTerminalCommonThreshold,
+            threadCount);
+        writeAssemblyDetails();
+
+        if(debug) write("G", true);
+
+    } else {
+
+        // Skip sequence assembly.
+        write("Final");
+    }
+
+
+}
+
+
+
+// Initial creation from the PrimaryGraph.
+// Each linear chain of edges in the PrimaryGraph after transitive reduction generates
+// an AssemblyGraphEdge (BubbleChain) consisting of a single haploid bubble.
+// The debug argument is currently not used.
+void AssemblyGraph::create(const PrimaryGraph& graph, bool debug)
+{
+    AssemblyGraph& cGraph = *this;
+
+    // Create a filtered version of the PrimaryGraph, containing only the
+    // transitive reduction edges.
+    // The predicate keeps the edges that are not flagged
+    // as isNonTransitiveReductionEdge.
+    class EdgePredicate {
+    public:
+        bool operator()(const PrimaryGraph::edge_descriptor e) const
+        {
+            return not (*graph)[e].isNonTransitiveReductionEdge;
+        }
+        EdgePredicate(const PrimaryGraph& graph) : graph(&graph) {}
+        EdgePredicate() : graph(0) {}
+    private:
+        const PrimaryGraph* graph;
+    };
+    using FilteredPrimaryGraph = boost::filtered_graph<PrimaryGraph, EdgePredicate>;
+    FilteredPrimaryGraph filteredGraph(graph, EdgePredicate(graph));
+
+    // Find linear chains in the PrimaryGraph after transitive reduction.
+    vector< vector<PrimaryGraph::edge_descriptor> > inputChains;
+    findLinearChains(filteredGraph, 0, inputChains);
+
+    // Each chain generates an edge.
+    // Vertices are added as needed.
+    // The vertexMap is keyed by the MarkerGraphEdgeId of each vertex.
+    std::map<MarkerGraphEdgeId, vertex_descriptor> vertexMap;
+    for(const vector<PrimaryGraph::edge_descriptor>& inputChain: inputChains) {
+        const PrimaryGraph::vertex_descriptor v0 = source(inputChain.front(), graph);
+        const PrimaryGraph::vertex_descriptor v1 = target(inputChain.back(), graph);
+        const MarkerGraphEdgeId markerGraphEdgeId0 = graph[v0].edgeId;
+        const MarkerGraphEdgeId markerGraphEdgeId1 = graph[v1].edgeId;
+        const vertex_descriptor cv0 = getVertex(markerGraphEdgeId0, vertexMap);
+        const vertex_descriptor cv1 = getVertex(markerGraphEdgeId1, vertexMap);
+
+        // Create an edge for this input chain.
+        edge_descriptor ce;
+        tie(ce, ignore) = add_edge(cv0, cv1, cGraph);
+        AssemblyGraphEdge& edge = cGraph[ce];
+        edge.id = nextEdgeId++;
+
+        // The edge is a degenerate BubbleChain consisting of a single haploid bubble.
+        edge.resize(1); // BubbleChain has length 1.
+        Bubble& bubble = edge.front();
+        bubble.resize(1); // Bubble is haploid.
+
+        // Store the chain.
+        // The chain gets the edgeId of the source vertex of each edge
+        // of the input chain, followed by the edgeId of the target vertex of the last edge.
+        Chain& chain = bubble.front();
+        for(const PrimaryGraph::edge_descriptor e: inputChain) {
+            const PrimaryGraph::vertex_descriptor v = source(e, graph);
+            chain.push_back(graph[v].edgeId);
+        }
+        const PrimaryGraph::edge_descriptor eLast = inputChain.back();
+        const PrimaryGraph::vertex_descriptor vLast = target(eLast, graph);
+        chain.push_back(graph[vLast].edgeId);
+    }
+}
+
+
+
+// Return the vertex corresponding to a given MarkerGraphEdgeId,
+// creating it if it is not in the given vertexMap.
+// A newly created vertex is also added to the vertexMap.
+AssemblyGraph::vertex_descriptor AssemblyGraph::getVertex(
+    MarkerGraphEdgeId markerGraphEdgeId,
+    std::map<MarkerGraphEdgeId, vertex_descriptor>& vertexMap)
+{
+    AssemblyGraph& cGraph = *this;
+
+    auto it = vertexMap.find(markerGraphEdgeId);
+    if(it == vertexMap.end()) {
+        const vertex_descriptor cv = add_vertex({markerGraphEdgeId}, cGraph);
+        vertexMap.insert({markerGraphEdgeId, cv});
+        return cv;
+    } else {
+        return it->second;
+    }
+}
+
+
+
+// Create a new vertex with a given MarkerGraphEdgeId.
+AssemblyGraph::vertex_descriptor AssemblyGraph::createVertex(
+    MarkerGraphEdgeId markerGraphEdgeId)
+{
+    return add_vertex({markerGraphEdgeId}, *this);
+}
+
+
+
+// Remove a vertex. The vertex must have no in-edges and no out-edges.
+void AssemblyGraph::removeVertex(vertex_descriptor cv)
+{
+    AssemblyGraph& cGraph = *this;
+
+    SHASTA_ASSERT(in_degree(cv, cGraph) == 0);
+    SHASTA_ASSERT(out_degree(cv, cGraph) == 0);
+
+    boost::remove_vertex(cv, cGraph);
+}
+
+
+
+// Compute the index field for every vertex.
+// This numbers vertices consecutively starting at zero.
+// This numbering becomes invalid as soon as a vertex is added or removed.
+void AssemblyGraph::numberVertices()
+{
+    AssemblyGraph& cGraph = *this;
+    uint64_t index = 0;
+    BGL_FORALL_VERTICES(cv, cGraph, AssemblyGraph) {
+        cGraph[cv].index = index++;
+    }
+}
+
+
+
+// Set the index field of every vertex to invalid<uint64_t>.
+void AssemblyGraph::clearVertexNumbering()
+{
+    AssemblyGraph& cGraph = *this;
+    BGL_FORALL_VERTICES(cv, cGraph, AssemblyGraph) {
+        cGraph[cv].index = invalid<uint64_t>;
+    }
+
+}
+
+
+// Renumber the edges consecutively starting at zero, in edge iteration order.
+// On return, nextEdgeId equals the number of edges.
+void AssemblyGraph::renumberEdges()
+{
+    AssemblyGraph& cGraph = *this;
+    nextEdgeId = 0;
+
+    BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) {
+        cGraph[ce].id = nextEdgeId++;
+    }
+}
+
+
+
+// Compress parallel edges into bubbles, where possible.
+// Only edges whose BubbleChain has length at most 1 are combined.
+// Returns true if any changes were made.
+bool AssemblyGraph::compressParallelEdges()
+{
+    AssemblyGraph& cGraph = *this;
+    bool changesWereMade = false;
+
+    // Look for sets of parallel edges v0->v1.
+    vector<vertex_descriptor> childrenVertices;
+    vector<edge_descriptor> edgesToBeRemoved;
+    Bubble newBubble;
+    BGL_FORALL_VERTICES(v0, cGraph, AssemblyGraph) {
+        if(out_degree(v0, cGraph) < 2) {
+            continue;
+        }
+
+        // Find distinct children vertices of v0.
+        childrenVertices.clear();
+        BGL_FORALL_OUTEDGES(v0, e, cGraph, AssemblyGraph) {
+            childrenVertices.push_back(target(e, cGraph));
+        }
+        deduplicate(childrenVertices);
+
+        // Handle the children vertices one at a time.
+        for(const vertex_descriptor v1: childrenVertices) {
+
+            // Create the new bubble using parallel edges v0->v1.
+            // The chains of all usable edges are gathered in newBubble.
+            newBubble.clear();
+            edgesToBeRemoved.clear();
+            BGL_FORALL_OUTEDGES(v0, e, cGraph, AssemblyGraph) {
+                if(target(e, cGraph) != v1) {
+                    continue;
+                }
+                AssemblyGraphEdge& edge = cGraph[e];
+
+                // The BubbleChain must have length 1.
+                if(edge.size() > 1) {
+                    continue;
+                }
+                const Bubble& oldBubble = edge.front();
+
+                copy(oldBubble.begin(), oldBubble.end(), back_inserter(newBubble));
+                edgesToBeRemoved.push_back(e);
+            }
+            // A single usable edge is left alone.
+            if(edgesToBeRemoved.size() < 2) {
+                continue;
+            }
+
+            // Create the new edge.
+            changesWereMade = true;
+            edge_descriptor eNew;
+            tie(eNew, ignore) = add_edge(v0, v1, cGraph);
+            AssemblyGraphEdge& newEdge = cGraph[eNew];
+            newEdge.id = nextEdgeId++;
+            newEdge.resize(1); // Make it a single bubble.
+            Bubble& newEdgeBubble = newEdge.front();
+            newEdgeBubble = newBubble;
+            newEdgeBubble.deduplicate();
+
+            // Remove the old edges.
+            for(const edge_descriptor e: edgesToBeRemoved) {
+                boost::remove_edge(e, cGraph);
+            }
+
+        }
+    }
+    return changesWereMade;
+}
+
+
+
+// Remove duplicate chains.
+void Bubble::deduplicate()
+{
+    shasta::deduplicate(*this);
+}
+
+
+
+// Compress linear sequences of edges (BubbleChains) into longer BubbleChains.
+// Returns true if any changes were made.
+bool AssemblyGraph::compressSequentialEdges()
+{
+    AssemblyGraph& cGraph = *this;
+    bool changesWereMade = false;
+
+    // Find linear chains of edges.
+    vector< vector<edge_descriptor> > linearChains;
+    findLinearChains(cGraph, 0, linearChains);
+
+
+
+    // Each linear chain of more than one edge gets compressed into a single edge (BubbleChain).
+    for(const vector<edge_descriptor>& linearChain: linearChains) {
+        if(linearChain.size() < 2) {
+            continue;
+        }
+
+        // Create the new edge.
+        // Its BubbleChain is the concatenation of the BubbleChains of the old edges.
+        changesWereMade = true;
+        const vertex_descriptor v0 = source(linearChain.front(), cGraph);
+        const vertex_descriptor v1 = target(linearChain.back(), cGraph);
+        edge_descriptor ceNew;
+        tie(ceNew, ignore) = add_edge(v0, v1, cGraph);
+        AssemblyGraphEdge& newEdge = cGraph[ceNew];
+        newEdge.id = nextEdgeId++;
+        for(const edge_descriptor ce: linearChain) {
+            const AssemblyGraphEdge& oldEdge = cGraph[ce];
+            copy(oldEdge.begin(), oldEdge.end(), back_inserter(newEdge));
+        }
+
+        // Remove the old edges.
+        for(const edge_descriptor ce: linearChain) {
+            boost::remove_edge(ce, cGraph);
+        }
+
+        // Remove the vertices internal to the old edge.
+        // NOTE(review): source() is called here on edge descriptors of edges
+        // that were already removed above; this relies on the descriptor
+        // itself still carrying the source vertex - confirm for this graph type.
+        for(uint64_t i=1; i<linearChain.size(); i++) {
+            const vertex_descriptor cv = source(linearChain[i], cGraph);
+            cGraph.removeVertex(cv);
+        }
+    }
+    return changesWereMade;
+}
+
+
+
+// Call compressBubbleChains, compressParallelEdges, and compressSequentialEdges
+// iteratively until nothing changes.
+// Returns true if any changes were made.
+bool AssemblyGraph::compress()
+{
+    bool changesWereMade = false;
+
+    while(true) {
+        const bool compressBubbleChainChanges = compressBubbleChains();
+        const bool compressParallelChanges = compressParallelEdges();
+        const bool compressSequentialChanges = compressSequentialEdges();
+
+        if(compressBubbleChainChanges or compressParallelChanges or compressSequentialChanges) {
+            // Something changed. Continue the iteration loop.
+            changesWereMade = true;
+            continue;
+        } else {
+            // Nothing changed at this iteration. Stop iteration loop.
+            break;
+        }
+    }
+
+    return changesWereMade;
+}
+
+
+
+// Call compress on all BubbleChains to merge adjacent haploid bubbles.
+// Returns true if any BubbleChain was changed.
+bool AssemblyGraph::compressBubbleChains()
+{
+    AssemblyGraph& cGraph = *this;
+
+    bool changesWereMade = false;
+    BGL_FORALL_EDGES(e, cGraph, AssemblyGraph) {
+        if(cGraph[e].compress()) {
+            changesWereMade = true;
+        }
+    }
+
+    return changesWereMade;
+}
+
+
+
+// This does the opposite of compress.
All bubble chains that +// consist of more than one simple haploid bubble are expanded into one +// edge for each edge of each bubble. +// For optimal results it is best to call compressBubbleChains before expand. +void AssemblyGraph::expand() +{ + AssemblyGraph& cGraph = *this; + + // Gather all edges that exist at this point. + vector<edge_descriptor> initialEdges; + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + initialEdges.push_back(ce); + } + + + + // Loop over the initial edges. + for(const edge_descriptor ce: initialEdges) { + BubbleChain& bubbleChain = cGraph[ce]; + + // If this bubbleChain consists of a single haploid bubble, don't do anything. + if(bubbleChain.isSimpleChain()) { + continue; + } + + // Prepare a vector of the vertices that will be the sources and targets + // of the edges we will create. + vector<vertex_descriptor> newVertices; + newVertices.push_back(source(ce, cGraph)); + for(uint64_t positionInBubbleChain=1; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) { + const vertex_descriptor cv = createVertex(bubbleChain[positionInBubbleChain].front().front()); + newVertices.push_back(cv); + } + newVertices.push_back(target(ce, cGraph)); + + // Create a new edge for each chain of each bubble in this bubble chain. + for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) { + Bubble& bubble = bubbleChain[positionInBubbleChain]; + const vertex_descriptor cv0 = newVertices[positionInBubbleChain]; + const vertex_descriptor cv1 = newVertices[positionInBubbleChain + 1]; + + for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) { + Chain& chain = bubble[indexInBubble]; + + // Create a new edge for this chain. + edge_descriptor ceNew; + tie(ceNew, ignore) = add_edge(cv0, cv1, cGraph); + AssemblyGraphEdge& edge = cGraph[ceNew]; + edge.id = nextEdgeId++; + + // Store this Chain in the new edge. 
+ BubbleChain& newBubbleChain = cGraph[ceNew]; + newBubbleChain.resize(1); + Bubble& newBubble = newBubbleChain.front(); + newBubble.resize(1); + Chain& newChain = newBubble.front(); + newChain.swap(chain); + } + } + + // Now we can remove the BubbleChain. + boost::remove_edge(ce, cGraph); + } +} + + + +void AssemblyGraph::write(const string& name, bool writeSequence) const +{ + const string fileNamePrefix = name + "-" + to_string(componentId); + + cout << fileNamePrefix << ": " << num_vertices(*this) << + " vertices, " << num_edges(*this) << " edges. Next edge id " << nextEdgeId << endl; + + writeCsv(fileNamePrefix); + writeGraphviz(fileNamePrefix, true); + writeGraphviz(fileNamePrefix, false); + writeGfa(fileNamePrefix); + writeGfaExpanded(name, writeSequence, writeSequence); + if(writeSequence) { + writeFastaExpanded(name); + } +} + + + +void AssemblyGraph::writeCsv(const string& fileNamePrefix) const +{ + writeChainsDetailsCsv(fileNamePrefix); + writeChainsCsv(fileNamePrefix); + writeBubblesCsv(fileNamePrefix); + writeBubbleChainsCsv(fileNamePrefix); +} + + + +void AssemblyGraph::writeBubbleChainsCsv(const string& fileNamePrefix) const +{ + const AssemblyGraph& cGraph = *this; + + ofstream csv(fileNamePrefix + "-BubbleChains.csv"); + csv << "Id,ComponentId,BubbleChainId,v0,v1,BubbleCount,AverageOffset,MinOffset,MaxOffset,\n"; + + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + const vertex_descriptor cv0 = source(ce, cGraph); + const vertex_descriptor cv1 = target(ce, cGraph); + const BubbleChain& bubbleChain = cGraph[ce]; + + uint64_t averageOffset; + uint64_t minOffset; + uint64_t maxOffset; + bubbleChainOffset(bubbleChain, averageOffset, minOffset, maxOffset); + + csv << bubbleChainStringId(ce) << ","; + csv << componentId << ","; + csv << cGraph[ce].id << ","; + csv << cGraph[cv0].edgeId << ","; + csv << cGraph[cv1].edgeId << ","; + csv << bubbleChain.size() << ","; + csv << averageOffset << ","; + csv << minOffset << ","; + csv << maxOffset << ","; + csv 
<< "\n"; + } +} + + + + +void AssemblyGraph::writeBubblesCsv(const string& fileNamePrefix) const +{ + const AssemblyGraph& cGraph = *this; + + ofstream csv(fileNamePrefix + "-Bubbles.csv"); + csv << "Id,ComponentId,BubbleChainId,Position in bubble chain,v0,v1,Ploidy,AverageOffset,MinOffset,MaxOffset,\n"; + + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + + for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) { + const Bubble& bubble = bubbleChain[positionInBubbleChain]; + const Chain& firstChain = bubble.front(); + + // Check that all the chains begins/end in the same place. + for(const Chain& chain: bubble) { + SHASTA_ASSERT(chain.front() == firstChain.front()); + SHASTA_ASSERT(chain.back() == firstChain.back()); + } + + uint64_t averageOffset; + uint64_t minOffset; + uint64_t maxOffset; + bubbleOffset(bubble, averageOffset, minOffset, maxOffset); + + csv << bubbleStringId(ce, positionInBubbleChain) << ","; + csv << componentId << ","; + csv << cGraph[ce].id << ","; + csv << positionInBubbleChain << ","; + csv << firstChain.front() << ","; + csv << firstChain.back() << ","; + csv << bubble.size() << ","; + csv << averageOffset << ","; + csv << minOffset << ","; + csv << maxOffset << ","; + csv << "\n"; + } + } + +} + + +void AssemblyGraph::writeChainsCsv(const string& fileNamePrefix) const +{ + const AssemblyGraph& cGraph = *this; + + ofstream csv(fileNamePrefix + "-Chains.csv"); + csv << "Id,ComponentId,BubbleChainId,Position in bubble chain,Index in bubble,Length,Offset\n"; + + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + + for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) { + const Bubble& bubble = bubbleChain[positionInBubbleChain]; + const uint64_t ploidy = bubble.size(); + + for(uint64_t indexInBubble=0; indexInBubble<ploidy; indexInBubble++) { + const 
Chain& chain = bubble[indexInBubble]; + SHASTA_ASSERT(chain.size() >= 2); + + csv << chainStringId(ce, positionInBubbleChain, indexInBubble) << ","; + csv << componentId << ","; + csv << cGraph[ce].id << ","; + csv << positionInBubbleChain << ","; + csv << indexInBubble << ","; + csv << chain.size() << ","; + csv << chainOffset(chain) << ","; + csv << "\n"; + } + } + } + +} + + + +void AssemblyGraph::writeChainsDetailsCsv(const string& fileNamePrefix) const +{ + const AssemblyGraph& cGraph = *this; + + ofstream csv(fileNamePrefix + "-ChainsDetails.csv"); + csv << "Id,ComponentId,BubbleChainId,Position in bubble chain," + "Index in bubble,Position in chain,MarkerGraphEdgeId,Coverage,Common,Offset\n"; + + BGL_FORALL_EDGES(e, cGraph, AssemblyGraph) { + writeChainDetailsCsv(csv, e, false); + } +} + + + +void AssemblyGraph::writeChainDetailsCsv( + ostream& csv, + edge_descriptor e, + bool writeHeader) const +{ + const AssemblyGraph& cGraph = *this; + const BubbleChain& bubbleChain = cGraph[e]; + + if(writeHeader) { + csv << "Id,ComponentId,BubbleChainId,Position in bubble chain," + "Index in bubble,Position in chain,MarkerGraphEdgeId,Coverage,Common,Offset\n"; + } + + for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) { + const Bubble& bubble = bubbleChain[positionInBubbleChain]; + const uint64_t ploidy = bubble.size(); + + for(uint64_t indexInBubble=0; indexInBubble<ploidy; indexInBubble++) { + const Chain& chain = bubble[indexInBubble]; + SHASTA_ASSERT(chain.size() >= 2); + + for(uint64_t positionInChain=0; positionInChain<chain.size(); positionInChain++) { + const MarkerGraphEdgeId markerGraphEdgeId = chain[positionInChain]; + const uint64_t coverage = assembler.markerGraph.edgeCoverage(markerGraphEdgeId); + csv << chainStringId(e, positionInBubbleChain, indexInBubble) << ","; + csv << componentId << ","; + csv << cGraph[e].id << ","; + csv << positionInBubbleChain << ","; + csv << indexInBubble << ","; + csv << 
positionInChain << ","; + csv << markerGraphEdgeId << ","; + csv << coverage << ","; + + if(positionInChain != 0) { + const MarkerGraphEdgeId previousMarkerGraphEdgeId = chain[positionInChain - 1]; + MarkerGraphEdgePairInfo info; + SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair( + previousMarkerGraphEdgeId, markerGraphEdgeId, info)); + csv << info.common << ","; + if(info.common != 0) { + csv << info.offsetInBases << ","; + } + } + csv << "\n"; + } + } + } +} + + + +void AssemblyGraph::writeGraphviz( + const string& fileNamePrefix, + bool labels) const +{ + const AssemblyGraph& cGraph = *this; + + ofstream dot; + if(labels) { + dot.open(fileNamePrefix + ".dot"); + } else { + dot.open(fileNamePrefix + "-NoLabels.dot"); + } + + dot << "digraph Component_" << componentId << "{\n"; + + BGL_FORALL_VERTICES(cv, cGraph, AssemblyGraph) { + const MarkerGraphEdgeId edgeId = cGraph[cv].edgeId; + const uint64_t coverage = assembler.markerGraph.edgeCoverage(edgeId); + dot << edgeId << "[label=\"" << edgeId << "\\n" << coverage << "\"];\n"; + } + + + + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + const vertex_descriptor cv0 = source(ce, cGraph); + const vertex_descriptor cv1 = target(ce, cGraph); + + uint64_t averageOffset; + uint64_t minOffset; + uint64_t maxOffset; + bubbleChainOffset(cGraph[ce], averageOffset, minOffset, maxOffset); + + dot << cGraph[cv0].edgeId << "->" << cGraph[cv1].edgeId; + + if(labels) { + dot << " [label=\""; + dot << bubbleChainStringId(ce) << "\\noff=" << averageOffset; + + // Additional annotation if this BubbleChain consists of a single + // haploid bubble. + const uint64_t bubbleCount = bubbleChain.size(); + if(bubbleCount == 1) { + const Bubble& bubble = bubbleChain.front(); + const uint64_t ploidy = bubble.size(); + if(ploidy == 1) { + const Chain& chain = bubble.front(); + dot << "\\nlen=" << chain.size(); + if(chain.size() > 2) { + // Compute average coverage for the internal edges. 
+ uint64_t coverageSum = 0; + for(uint64_t i=1; i<chain.size()-1; i++) { + coverageSum += assembler.markerGraph.edgeCoverage(chain[i]); + } + const double averageCoverage = double(coverageSum) / double(chain.size() - 2); + dot << "\\ncov=" << uint64_t(std::round(averageCoverage)); + + dot << "\\n" << chain.second(); + if(chain.size() > 3) { + dot << "\\n" << chain.secondToLast(); + } + } + } + } + + dot << "\"]"; + } + dot << ";\n"; + } + + dot << "}\n"; +} + + + +void AssemblyGraph::writeGfa(const string& fileNamePrefix) const +{ + const AssemblyGraph& cGraph = *this; + + ofstream gfa(fileNamePrefix + ".gfa"); + + // Write the header line. + gfa << "H\tVN:Z:1.0\n"; + + // Write a segment for each edge. + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + + uint64_t averageOffset; + uint64_t minOffset; + uint64_t maxOffset; + bubbleChainOffset(cGraph[ce], averageOffset, minOffset, maxOffset); + + // Record type. + gfa << "S\t"; + + // Name. + gfa << bubbleChainStringId(ce) << "\t"; + + // Sequence. + gfa << "*\t"; + + // Sequence length in bases. + gfa << "LN:i:" << averageOffset << "\n"; + } + + // For each vertex, write links between each pair of incoming/outgoing edges. + BGL_FORALL_VERTICES(cv, cGraph, AssemblyGraph) { + BGL_FORALL_INEDGES(cv, ceIn, cGraph, AssemblyGraph) { + BGL_FORALL_OUTEDGES(cv, ceOut, cGraph, AssemblyGraph) { + gfa << + "L\t" << + bubbleChainStringId(ceIn) << "\t+\t" << + bubbleChainStringId(ceOut) << "\t+\t*\n"; + } + } + } +} + + + +void AssemblyGraph::writeGfaExpanded( + ostream& gfa, + bool includeSequence, + bool useSequenceLength) const +{ + writeGfaHeader(gfa); + writeGfaSegmentsExpanded(gfa, includeSequence, useSequenceLength); + writeGfaLinksExpanded(gfa); +} + + + +void AssemblyGraph::writeGfaSegmentsExpanded( + ostream& gfa, + bool includeSequence, + bool useSequenceLength +) const +{ + if(includeSequence) { + SHASTA_ASSERT(useSequenceLength); + } + + const AssemblyGraph& graph = *this; + + // Loop over BubbleChains. 
Each Chain of each Bubble generates a GFA segment. + BGL_FORALL_EDGES(ce, graph, AssemblyGraph) { + const BubbleChain& bubbleChain = graph[ce]; + + // Loop over Bubbles of this chain. + for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); + ++positionInBubbleChain) { + const Bubble& bubble = bubbleChain[positionInBubbleChain]; + + // Loop over chains of this bubble. + for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) { + const Chain& chain = bubble[indexInBubble]; + + // Record type. + gfa << "S\t"; + + // Name. + gfa << chainStringId(ce, positionInBubbleChain, indexInBubble) << "\t"; + + if(includeSequence) { + using shasta::Base; + const vector<Base>& sequence = chain.sequence; + + // Sequence. + copy(sequence.begin(), sequence.end(), ostream_iterator<Base>(gfa)); + gfa << "\t"; + + // Sequence length in bases. + gfa << "LN:i:" << sequence.size() << "\n"; + + } else { + + // Sequence. + gfa << "*\t"; + + // Sequence length in bases. + if(useSequenceLength) { + gfa << "LN:i:" << chain.sequence.size() << "\n"; + } else { + const uint64_t offset = chainOffset(chain); + gfa << "LN:i:" << offset << "\n"; + } + } + } + } + } +} + + + +// This writes a csv summary with one line for each assembled segment. +void AssemblyGraph::writeCsvSummary(ostream& csv) const +{ + const AssemblyGraph& assemblyGraph = *this; + + // Loop over BubbleChains. Each Chain of each Bubble generates a GFA segment. + BGL_FORALL_EDGES(e, assemblyGraph, AssemblyGraph) { + const AssemblyGraphEdge& edge = assemblyGraph[e]; + const BubbleChain& bubbleChain = edge; + const vertex_descriptor v0 = source(e, assemblyGraph); + const vertex_descriptor v1 = target(e, assemblyGraph); + + // Loop over Bubbles of this chain. + for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); + ++positionInBubbleChain) { + const Bubble& bubble = bubbleChain[positionInBubbleChain]; + + // Loop over chains of this bubble. 
+ for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) { + const Chain& chain = bubble[indexInBubble]; + const uint64_t pValue = chainPValue(e, positionInBubbleChain, indexInBubble); + + // Define connectivity string. + string connectivity; + if(pValue == 0) { + const bool danglingAtBeginning = (in_degree(v0, assemblyGraph) == 0); + const bool danglingAtEnd = (out_degree(v1, assemblyGraph) == 0); + const bool isDangling = (danglingAtBeginning or danglingAtEnd); + const bool isIsolated = (danglingAtBeginning and danglingAtEnd); + if(isIsolated) { + connectivity = "Isolated"; + } else if(isDangling) { + connectivity = "Dangling"; + } else { + connectivity = "Complex"; + } + + } else if(pValue == 1) { + connectivity = "Haploid"; + } else if(pValue == 2) { + connectivity = "Diploid"; + } else { + connectivity = "Ploidy-" + to_string(pValue); + } + + // Set the color for display in Bandage. + // The colors below are constructed using HSV(hue,75%,750%). + // Bandage support for HSV appears to be buggy. + string color; + switch(pValue) { + + case 0: + { + // The only Chain of this BubbleChain. + // Figure out if it is dangling. + const vertex_descriptor v0 = source(e, assemblyGraph); + const vertex_descriptor v1 = target(e, assemblyGraph); + const bool isDanglingBackward = (in_degree(v0, assemblyGraph) == 0); + const bool isDanglingForward = (out_degree(v1, assemblyGraph) == 0); + const bool isIsolated = (isDanglingBackward and isDanglingForward); + const bool isDangling = (isDanglingBackward or isDanglingForward); + + if(isIsolated) { + color = "#3030bf"; // Blue + } else if(isDangling) { + color = "#30bfbf"; // Cyan + } else { + color = "#bf30bf"; // Purple + } + } + break; + + case 1: + // Haploid Chain in a non-trivial BubbleChain. + color = "#bf3030"; // Red + break; + case 2: + // Diploid segment. + color = "#30bf30"; // Green + break; + default: + // Ploidy > 2. 
+ color = "#bfbf30"; // Yellow + break; + } + + csv << chainStringId(e, positionInBubbleChain, indexInBubble) << ","; + csv << connectivity << ","; + csv << componentId << ","; + csv << edge.id << ","; + csv << positionInBubbleChain << ","; + csv << indexInBubble << ","; + csv << chain.sequence.size() << ","; + if(chain.size() > 2) { + csv << std::fixed << std::setprecision(1) << primaryCoverage(chain); + } + csv << ","; + csv << pValue << ","; + csv << color << ","; + + + + // Write the preceding segments. + if(positionInBubbleChain == 0) { + + // The preceding segments are the Chains of the last Bubble + // of each previous BubbleChain. + bool isFirst = true; + BGL_FORALL_INEDGES(v0, e, assemblyGraph, AssemblyGraph) { + const AssemblyGraphEdge& edge = assemblyGraph[e]; + const BubbleChain& bubbleChain = edge; + const uint64_t positionInBubbleChain = bubbleChain.size() - 1; + const Bubble& bubble = bubbleChain[positionInBubbleChain]; + for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) { + if(isFirst) { + isFirst = false; + } else { + csv << " "; + } + csv << chainStringId(e, positionInBubbleChain, indexInBubble); + } + } + } else { + + // The preceding segments are the Chains of the previous Bubble + // in this BubbleChain. + const Bubble& bubble = bubbleChain[positionInBubbleChain - 1]; + for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) { + if(indexInBubble != 0) { + csv << " "; + } + csv << chainStringId(e, positionInBubbleChain - 1, indexInBubble); + } + } + csv << ","; + + + + // Write the following segments. + if(positionInBubbleChain == bubbleChain.size() - 1) { + + // The following segments are the Chains of the first Bubble + // of each next BubbleChain. 
+ bool isFirst = true; + BGL_FORALL_OUTEDGES(v1, e, assemblyGraph, AssemblyGraph) { + const AssemblyGraphEdge& edge = assemblyGraph[e]; + const BubbleChain& bubbleChain = edge; + const uint64_t positionInBubbleChain = 0; + const Bubble& bubble = bubbleChain[positionInBubbleChain]; + for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) { + if(isFirst) { + isFirst = false; + } else { + csv << " "; + } + csv << chainStringId(e, positionInBubbleChain, indexInBubble); + } + } + } else { + + // The following segments are the Chains of the next Bubble + // in this BubbleChain. + const Bubble& bubble = bubbleChain[positionInBubbleChain + 1]; + for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) { + if(indexInBubble != 0) { + csv << " "; + } + csv << chainStringId(e, positionInBubbleChain + 1, indexInBubble); + } + } + csv << ","; + + csv << "\n"; + } + } + } +} + + + +void AssemblyGraph::writeGfaLinksExpanded(ostream& gfa) const +{ + const AssemblyGraph& graph = *this; + + // Write links between adjacent Chains of each BubbleChain. + BGL_FORALL_EDGES(ce, graph, AssemblyGraph) { + const BubbleChain& bubbleChain = graph[ce]; + + // Loop over Bubbles of this chain. 
+ for(uint64_t positionInBubbleChain=1; positionInBubbleChain<bubbleChain.size(); + ++positionInBubbleChain) { + const Bubble& bubble0 = bubbleChain[positionInBubbleChain - 1]; + const Bubble& bubble1 = bubbleChain[positionInBubbleChain]; + const uint64_t overlapLength = assembler.markerGraph.edgeSequence[bubble1.front().front()].size(); + + for(uint64_t indexInBubble0=0; indexInBubble0<bubble0.size(); indexInBubble0++) { + const string chain0StringId = chainStringId(ce, positionInBubbleChain-1, indexInBubble0); + + for(uint64_t indexInBubble1=0; indexInBubble1<bubble1.size(); indexInBubble1++) { + const string chain1StringId = chainStringId(ce, positionInBubbleChain, indexInBubble1); + + gfa << + "L\t" << + chain0StringId << "\t+\t" << + chain1StringId << "\t+\t" << overlapLength << "M\n"; + } + } + } + } + + + + // Write links between Chains in different bubble chains. + BGL_FORALL_VERTICES(cv, graph, AssemblyGraph) { + const uint64_t overlapLength = assembler.markerGraph.edgeSequence[graph[cv].edgeId].size(); + + BGL_FORALL_INEDGES(cv, ce0, graph, AssemblyGraph) { + const BubbleChain& bubbleChain0 = graph[ce0]; + const Bubble& bubble0 = bubbleChain0.back(); + BGL_FORALL_OUTEDGES(cv, ce1, graph, AssemblyGraph) { + const BubbleChain& bubbleChain1 = graph[ce1]; + const Bubble& bubble1 = bubbleChain1.front(); + + for(uint64_t indexInBubble0=0; indexInBubble0<bubble0.size(); indexInBubble0++) { + const string chain0StringId = chainStringId(ce0, bubbleChain0.size()-1, indexInBubble0); + + for(uint64_t indexInBubble1=0; indexInBubble1<bubble1.size(); indexInBubble1++) { + const string chain1StringId = chainStringId(ce1, 0, indexInBubble1); + + gfa << + "L\t" << + chain0StringId << "\t+\t" << + chain1StringId << "\t+\t" << overlapLength << "M\n"; + } + } + } + } + } + + +} + + + +void AssemblyGraph::writeGfaHeader(ostream& gfa) +{ + gfa << "H\tVN:Z:1.0\n"; +} + + +// This version writes each chain as a segment, so it shows the +// details of the BubbleChains. 
+void AssemblyGraph::writeGfaExpanded( + const string& fileNamePrefix, + bool includeSequence, + bool useSequenceLength) const +{ + ofstream gfa(fileNamePrefix + "-" + to_string(componentId) + "-Expanded.gfa"); + writeGfaExpanded(gfa, includeSequence, useSequenceLength); +} + + + + +void AssemblyGraph::writeFastaExpanded(const string& fileNamePrefix) const +{ + ofstream fasta(fileNamePrefix + "-" + to_string(componentId) + "-Expanded.fasta"); + writeFastaExpanded(fasta); +} + + + +void AssemblyGraph::writeFastaExpanded(ostream& fasta) const +{ + const AssemblyGraph& cGraph = *this; + + + // Loop over BubbleChains. Each Chain of each Bubble generates a GFA segment. + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + + // Loop over Bubbles of this chain. + for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); + ++positionInBubbleChain) { + const Bubble& bubble = bubbleChain[positionInBubbleChain]; + + // Loop over chains of this bubble. 
+ for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) { + const Chain& chain = bubble[indexInBubble]; + + using shasta::Base; + const vector<Base>& sequence = chain.sequence; + + fasta << ">" << chainStringId(ce, positionInBubbleChain, indexInBubble) << + " " << sequence.size() << "\n"; + copy(sequence.begin(), sequence.end(), ostream_iterator<Base>(fasta)); + fasta << "\n"; + + + + } + } + } +} + + + +void AssemblyGraph::writeSnapshot(uint64_t& snapshotNumber) const +{ + const string name = to_string(snapshotNumber++); + write(name); + writeGfaExpanded(name, false, false); +} + + + +string AssemblyGraph::bubbleChainStringId(edge_descriptor ce) const +{ + const AssemblyGraph& cGraph = *this; + const AssemblyGraphEdge& edge = cGraph[ce]; + return to_string(componentId) + "-" + to_string(edge.id); +} + + + +string AssemblyGraph::bubbleStringId( + edge_descriptor ce, + uint64_t positionInBubbleChain) const +{ + const AssemblyGraph& cGraph = *this; + const AssemblyGraphEdge& edge = cGraph[ce]; + + return + to_string(componentId) + "-" + + to_string(edge.id) + "-" + + to_string(positionInBubbleChain); +} + + + +string AssemblyGraph::chainStringId( + edge_descriptor e, + uint64_t positionInBubbleChain, + uint64_t indexInBubble) const +{ + // Locate the AssemblyGraphEdge. + const AssemblyGraph& cGraph = *this; + const AssemblyGraphEdge& edge = cGraph[e]; + + // Get the P-value for the Chain. + const uint64_t pValue = chainPValue(e, positionInBubbleChain, indexInBubble); + + return + to_string(componentId) + "-" + + to_string(edge.id) + "-" + + to_string(positionInBubbleChain) + "-" + + to_string(indexInBubble) + "-P" + + to_string(pValue); +} + + + +// This returns a "P-value" for a Chain defined as follows: +// If the Chain is the only chain of a BubbleChain, the P-value is 0. +// Otherwise, the P-value is the ploidy of the Bubble that the Chain belongs to. +// The P-value is used to create the -P suffix in the name (stringId) of the Chain. 
+uint64_t AssemblyGraph::chainPValue( + edge_descriptor e, + uint64_t positionInBubbleChain, + uint64_t indexInBubble) const +{ + // Locate the chain. + const AssemblyGraph& cGraph = *this; + const AssemblyGraphEdge& edge = cGraph[e]; + const BubbleChain& bubbleChain = edge; + const Bubble& bubble = bubbleChain[positionInBubbleChain]; + + if(bubbleChain.size() == 1 and bubble.size() == 1) { + // This is the only Chain in this BubbleChain. + return 0; + } else { + // Return the ploidy of the Bubble this Chain belongs to. + return bubble.size(); + } +} + + + +// Get the lengths of Chains assembled sequence for each Chain P-value. +// On return, chainLengths[pValue] contains the lengths of all +// Chains with that pValue, sorted in decreasing order. +// This can be used for N50 statistics. +void AssemblyGraph::getChainLengthsByPValue(vector< vector<uint64_t> >& chainLengths) const +{ + const AssemblyGraph& assemblyGraph = *this; + chainLengths.clear(); + + // Loop over all BubbleChains. + BGL_FORALL_EDGES(e, assemblyGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = assemblyGraph[e]; + + // Loop over all Bubbles in this BubbleChain. + for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) { + const Bubble& bubble = bubbleChain[positionInBubbleChain]; + const uint64_t ploidy = bubble.size(); + + // Loop over all Chains in this Bubble. + for(uint64_t indexInBubble=0; indexInBubble<ploidy; indexInBubble++) { + const Chain& chain = bubble[indexInBubble]; + const uint64_t pValue = chainPValue(e, positionInBubbleChain, indexInBubble); + + // Make sure we have a vector for this pValue. + if(pValue >= chainLengths.size()) { + chainLengths.resize(pValue + 1); + } + + // Store the sequence length of this chain. + chainLengths[pValue].push_back(chain.sequence.size()); + } + } + } + + // Sort by decreasing Chain lengths. 
+ for(auto& v: chainLengths) { + sort(v.begin(), v.end(), std::greater<uint64_t>()); + } +} + + + +// Get the lengths of all non-trivial bubble chains. +void AssemblyGraph::getBubbleChainLengths(vector<uint64_t>& bubbleChainLengths) const +{ + const AssemblyGraph& assemblyGraph = *this; + + bubbleChainLengths.clear(); + BGL_FORALL_EDGES(e, assemblyGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = assemblyGraph[e]; + if(not bubbleChain.isSimpleChain()) { + bubbleChainLengths.push_back(bubbleChain.totalLength()); + } + } + sort(bubbleChainLengths.begin(), bubbleChainLengths.end(), std::greater<uint64_t>()); +} + + + +// Return the total lenght of this bubble chain. +uint64_t BubbleChain::totalLength() const +{ + double length = 0.; + for(const Bubble& bubble: *this) { + uint64_t bubbleTotalLength = 0; + for(const Chain& chain: bubble) { + bubbleTotalLength += chain.sequence.size(); + } + const double bubbleLength = double(bubbleTotalLength) / double(bubble.size()); + length += bubbleLength; + } + return uint64_t(std::round(length)); +} + + + +// Given a vector of lengths in decreasing order, compute the total length and N50. +pair<uint64_t, uint64_t> AssemblyGraph::n50(const vector<uint64_t>& lengths) +{ + // Handle the trivial case. + if(lengths.empty()) { + return {0, 0}; + } + + // Compute the total length. + const uint64_t totalLength = accumulate(lengths.begin(), lengths.end(), 0UL); + + // Compute the N50. + uint64_t cumulativeLength = 0; + for(const uint64_t length: lengths) { + cumulativeLength += length; + if(2 * cumulativeLength >= totalLength) { + return {totalLength, length}; + } + } + + + + // We should never get here. + // Before asserting, write some diagnostics. + ofstream csv("Assertion.csv"); + csv << "N," << lengths.size() << endl; + csv << "Total length," << totalLength << endl; + + // Check that it is sorted in decreasing order. 
+ if(lengths.size() > 1) { + for(uint64_t i1=1; i1<lengths.size(); i1++) { + const uint64_t i0 = i1 - 1; + if(lengths[i0] < lengths[i1]) { + csv << "Not sorted at," << i0 << "," << i1 << "," << + lengths[i0] << "," << lengths[i1] << endl; + } + } + } + + // Write it all out. + for(uint64_t i=0; i<lengths.size(); i++) { + csv << i << "," << lengths[i] << endl; + } + + SHASTA_ASSERT(0); +} + + + + +uint64_t AssemblyGraph::chainOffset(const Chain& chain) const +{ + const uint64_t length = chain.size(); + SHASTA_ASSERT(length >= 2); + + uint64_t offset = 0; + for(uint64_t i=1; i<length; i++) { + const MarkerGraphEdgeId edgeId0 = chain[i-1]; + const MarkerGraphEdgeId edgeId1 = chain[i]; + + const uint64_t offsetThisPair = assembler.estimateBaseOffsetUnsafe(edgeId0, edgeId1); + + if(offsetThisPair != invalid<uint64_t>) { + offset += offsetThisPair; + } + } + return offset; +} + + + +// Return average coverage for the internal MarkerGraphEdgeIds of a Chain. +// For chain of length 2, this returns 0. 
+double AssemblyGraph::primaryCoverage(const Chain& chain) const +{ + if(chain.size() < 3) { + return 0.; + } + + uint64_t sum = 0; + for(uint64_t positionInChain=1; positionInChain<chain.size()-1; positionInChain++) { + const MarkerGraphEdgeId markerGraphEdgeId = chain[positionInChain]; + const uint64_t coverage = assembler.markerGraph.edgeCoverage(markerGraphEdgeId); + sum += coverage; + } + + return double(sum) / double(chain.size() - 2); +} + + + +void AssemblyGraph::bubbleOffset( + const Bubble& bubble, + uint64_t& averageOffset, + uint64_t& minOffset, + uint64_t& maxOffset + ) const +{ + averageOffset = 0; + minOffset = std::numeric_limits<uint64_t>::max(); + maxOffset = 0; + + for(const Chain& chain: bubble) { + const uint64_t offset = chainOffset(chain); + + averageOffset += offset; + minOffset = min(minOffset, offset); + maxOffset = max(maxOffset, offset); + } + averageOffset /= bubble.size(); +} + + + +bool AssemblyGraph::bubbleOffsetNoException( + const Bubble& bubble, + uint64_t& averageOffset, + uint64_t& minOffset, + uint64_t& maxOffset + ) const +{ + averageOffset = 0; + minOffset = std::numeric_limits<uint64_t>::max(); + maxOffset = 0; + + for(const Chain& chain: bubble) { + const uint64_t offset = chainOffset(chain); + if(offset == invalid<uint64_t>) { + return false; + } + + averageOffset += offset; + minOffset = min(minOffset, offset); + maxOffset = max(maxOffset, offset); + } + averageOffset /= bubble.size(); + return true; +} + + + +void AssemblyGraph::bubbleChainOffset( + const BubbleChain& bubbleChain, + uint64_t& averageOffset, + uint64_t& minOffset, + uint64_t& maxOffset + ) const +{ + averageOffset = 0; + minOffset = 0; + maxOffset = 0; + + for(const Bubble& bubble: bubbleChain) { + uint64_t bubbleAverageOffset; + uint64_t bubbleMinOffset; + uint64_t bubbleMaxOffset; + bubbleOffset(bubble, bubbleAverageOffset, bubbleMinOffset, bubbleMaxOffset); + + averageOffset += bubbleAverageOffset; + minOffset += bubbleMinOffset; + maxOffset += 
bubbleMaxOffset; + } +} + + + +AssemblyGraph::Superbubbles::Superbubbles( + AssemblyGraph& cGraph, + uint64_t maxOffset1 // Used to define superbubbles + ) : + cGraph(cGraph) +{ + cGraph.numberVertices(); + const uint64_t vertexCount = num_vertices(cGraph); + + vector<uint64_t> rank(vertexCount); + vector<uint64_t> parent(vertexCount); + boost::disjoint_sets<uint64_t*, uint64_t*> disjointSets(&rank[0], &parent[0]); + + // Compute connected components, using only edges with average offset up to maxOffset1. + for(uint64_t i=0; i<vertexCount; i++) { + disjointSets.make_set(i); + } + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + uint64_t averageOffset; + uint64_t minOffset; + uint64_t maxOffset; + cGraph.bubbleChainOffset(cGraph[ce], averageOffset, minOffset, maxOffset); + if(averageOffset <= maxOffset1) { + const vertex_descriptor cv0 = source(ce, cGraph); + const vertex_descriptor cv1 = target(ce, cGraph); + disjointSets.union_set(cGraph[cv0].index, cGraph[cv1].index); + } + } + + // Gather the vertices in each connected component. + vector< vector<vertex_descriptor> > components(vertexCount); + BGL_FORALL_VERTICES(cv, cGraph, AssemblyGraph) { + const uint64_t componentId = disjointSets.find_set(cGraph[cv].index); + components[componentId].push_back(cv); + } + + // The superbubbles are the components with size at least 2. + for(uint64_t componentId=0; componentId<components.size(); componentId++) { + const vector<vertex_descriptor> component = components[componentId]; + if(components[componentId].size() > 1) { + superbubbles.emplace_back(Superbubble(component)); + } + } + + // Store superbubble ids in the vertices. 
+ BGL_FORALL_VERTICES(cv, cGraph, AssemblyGraph) { + cGraph[cv].superbubbleId = invalid<uint64_t>; + } + for(uint64_t superbubbleId=0; superbubbleId<superbubbles.size(); superbubbleId++) { + const vector<vertex_descriptor>& superbubble = getSuperbubble(superbubbleId); + for(const vertex_descriptor cv: superbubble) { + cGraph[cv].superbubbleId = superbubbleId; + } + } + + + + // Find entrances and exists of each superbubble. + for(uint64_t superbubbleId=0; superbubbleId<superbubbles.size(); superbubbleId++) { + Superbubble& superbubble = getSuperbubble(superbubbleId); + + // Find entrances. These are superbubble vertices with in-edges + // from outside the superbubble. + for(const vertex_descriptor cv0: superbubble) { + BGL_FORALL_INEDGES(cv0, ce, cGraph, AssemblyGraph) { + const vertex_descriptor cv1 = source(ce, cGraph); + if(not isInSuperbubble(superbubbleId, cv1)) { + superbubble.entrances.push_back(cv0); + break; + } + } + } + + // Find exits. These are superbubble vertices with out-edges + // to outside the superbubble. + vector<vertex_descriptor> exits; + for(const vertex_descriptor cv0: superbubble) { + BGL_FORALL_OUTEDGES(cv0, ce, cGraph, AssemblyGraph) { + const vertex_descriptor cv1 = target(ce, cGraph); + if(not isInSuperbubble(superbubbleId, cv1)) { + superbubble.exits.push_back(cv0); + break; + } + } + } + } + +} + + + +// This uses dominator trees. +// It only finds superbubbles with one entrance and one exit. +AssemblyGraph::Superbubbles::Superbubbles( + AssemblyGraph& cGraph) : + cGraph(cGraph) +{ + const bool debug = false; + + // Map vertices to integers. + std::map<vertex_descriptor, uint64_t> indexMap; + uint64_t vertexIndex = 0; + BGL_FORALL_VERTICES(v, cGraph, AssemblyGraph) { + indexMap.insert({v, vertexIndex++}); + } + auto associativeIndexMap = boost::make_assoc_property_map(indexMap); + const uint64_t vertexCount = vertexIndex; + + // Vectors used below to compute the dominator tree. 
+ vector<uint64_t> dfNum(vertexCount); + vector<vertex_descriptor> parent(vertexCount); + vector<vertex_descriptor> verticesByDFNum(vertexCount); + + // Tree pairs found on forward and backward dominator tree. + vector< pair<vertex_descriptor, vertex_descriptor> > forwardPairs; + vector< pair<vertex_descriptor, vertex_descriptor> > backwardPairs; + + + + // Compute dominator trees using as entrance each of the + // vertices with zero in-degree. + BGL_FORALL_VERTICES(entrance, cGraph, AssemblyGraph) { + if(in_degree(entrance, cGraph) != 0) { + continue; + } + + // Compute the dominator tree. + fill(dfNum.begin(), dfNum.end(), invalid<uint64_t>); + fill(parent.begin(), parent.end(), null_vertex()); + fill(verticesByDFNum.begin(), verticesByDFNum.end(), null_vertex()); + std::map<vertex_descriptor, vertex_descriptor> predecessorMap; + + boost::lengauer_tarjan_dominator_tree( + cGraph, + entrance, + boost::make_assoc_property_map(indexMap), + boost::make_iterator_property_map(dfNum.begin(), associativeIndexMap), + boost::make_iterator_property_map(parent.begin(), associativeIndexMap), + verticesByDFNum, + boost::make_assoc_property_map(predecessorMap)); + + if(debug) { + cout << "Forward dominator tree with entrance at " << cGraph[entrance].edgeId << endl; + } + for(const auto& p: predecessorMap) { + const vertex_descriptor cv0 = p.second; + const vertex_descriptor cv1 = p.first; + forwardPairs.push_back({cv0, cv1}); + if(debug) { + cout << "F " << cGraph[cv0].edgeId << "->" << cGraph[cv1].edgeId << endl; + } + } + } + + + + // Compute dominator trees on the reverse graph using as entrance each of the + // vertices with zero in-degree on the reverse graph + // (that is, zero out-degree on the AssemblyGraph). 
+ using ReverseAssemblyGraph = boost::reverse_graph<AssemblyGraph>; + ReverseAssemblyGraph reverseGraph(cGraph); + BGL_FORALL_VERTICES(entrance, reverseGraph, ReverseAssemblyGraph) { + if(in_degree(entrance, reverseGraph) != 0) { + continue; + } + + // Compute the dominator tree. + fill(dfNum.begin(), dfNum.end(), invalid<uint64_t>); + fill(parent.begin(), parent.end(), null_vertex()); + fill(verticesByDFNum.begin(), verticesByDFNum.end(), null_vertex()); + std::map<vertex_descriptor, vertex_descriptor> predecessorMap; + + boost::lengauer_tarjan_dominator_tree( + reverseGraph, + entrance, + boost::make_assoc_property_map(indexMap), + boost::make_iterator_property_map(dfNum.begin(), associativeIndexMap), + boost::make_iterator_property_map(parent.begin(), associativeIndexMap), + verticesByDFNum, + boost::make_assoc_property_map(predecessorMap)); + + if(debug) { + cout << "Backward dominator tree with exit at " << cGraph[entrance].edgeId << endl; + } + for(const auto& p: predecessorMap) { + const vertex_descriptor cv0 = p.first; + const vertex_descriptor cv1 = p.second; + backwardPairs.push_back({cv0, cv1}); + if(debug) { + cout << "B " << cGraph[cv0].edgeId << "->" << cGraph[cv1].edgeId << endl; + } + } + } + + // Compute strongly connected components. + std::map<vertex_descriptor, uint64_t> componentMap; + boost::strong_components( + cGraph, + boost::make_assoc_property_map(componentMap), + boost::vertex_index_map(boost::make_assoc_property_map(indexMap))); + + // Gather the vertices in each strong component. 
+ vector< vector<vertex_descriptor> > strongComponents(vertexCount); + for(const auto& p: componentMap) { + const vertex_descriptor v = p.first; + const uint64_t componentId = p.second; + SHASTA_ASSERT(componentId < vertexCount); + strongComponents[componentId].push_back(v); + } + + + + // The pairs that appear both in forwardPairs and backwardPairs define our superbubbles + deduplicate(forwardPairs); + deduplicate(backwardPairs); + vector< pair<vertex_descriptor, vertex_descriptor> > bidirectionalPairs; + std::set_intersection( + forwardPairs.begin(), forwardPairs.end(), + backwardPairs.begin(), backwardPairs.end(), + back_inserter(bidirectionalPairs) + ); + + if(debug) { + cout << "Bidirectional pairs:" << endl; + for(const auto& p: bidirectionalPairs) { + const vertex_descriptor cv0 = p.first; + const vertex_descriptor cv1 = p.second; + cout << cGraph[cv0].edgeId << "->" << cGraph[cv1].edgeId << endl; + } + } + + // Each bidirectional pair generates a superbubble if + // the out-degree of the entrance and + // the in-degree of the exit are greater than 1, + // unless the entrance or exit or any of the + // superbubble vertices are in a non-trivial strong component.. + for(const auto& p: bidirectionalPairs) { + const vertex_descriptor cv0 = p.first; + const vertex_descriptor cv1 = p.second; + if(out_degree(cv0, cGraph) <= 1) { + continue; + } + if(in_degree(cv1, cGraph) <= 1) { + continue; + } + if(strongComponents[componentMap[cv0]].size() > 1) { + // The entrance is in a non-trivial strong component. + continue; + } + if(strongComponents[componentMap[cv1]].size() > 1) { + // The exit is in a non-trivial strong component. 
+ continue; + } + superbubbles.resize(superbubbles.size() + 1); + Superbubble& superbubble = superbubbles.back(); + superbubble.entrances.push_back(cv0); + superbubble.exits.push_back(cv1); + superbubble.fillInFromEntranceAndExit(cGraph); + + if(debug) { + cout << "Tentative superbubble with entrance " << cGraph[cv0].edgeId << + " exit " << cGraph[cv1].edgeId << " and " << superbubble.size() << + " vertices total." << endl; + } + + // If any vertices in the superbubble are in a non-trivial + // strong component, remove it. + for(const vertex_descriptor cv: superbubble) { + if(strongComponents[componentMap[cv]].size() > 1) { + superbubbles.pop_back(); + if(debug) { + cout << "This superbubble will not be stored because some vertices are in a non-trivial strong component." << endl; + } + break; + } + } + } + + if(debug) { + cout << "Superbubble entrance/exit pairs:" << endl; + for(const Superbubble& superbubble: superbubbles) { + const vertex_descriptor cv0 = superbubble.entrances.front(); + const vertex_descriptor cv1 = superbubble.exits.front();; + cout << cGraph[cv0].edgeId << "->" << cGraph[cv1].edgeId << endl; + } + } +} + + + +// Fill in the superbubble given a single entrance and exit. +void AssemblyGraph::Superbubble::fillInFromEntranceAndExit(const AssemblyGraph& cGraph) +{ + SHASTA_ASSERT(empty()); + SHASTA_ASSERT(entrances.size() == 1); + SHASTA_ASSERT(exits.size() == 1); + + const vertex_descriptor entrance = entrances.front(); + const vertex_descriptor exit = exits.front(); + + // Do a BFS starting at the entrance and stopping at the exit. 
+ std::set<vertex_descriptor> internalVertices; + std::queue<vertex_descriptor> q; + q.push(entrance); + while(not q.empty()) { + const vertex_descriptor cv0 = q.front(); + q.pop(); + BGL_FORALL_OUTEDGES(cv0, e, cGraph, AssemblyGraph) { + const vertex_descriptor cv1 = target(e, cGraph); + if(cv1 != exit) { + if(not internalVertices.contains(cv1)) { + internalVertices.insert(cv1); + q.push(cv1); + } + } + } + } + + push_back(entrance); + copy(internalVertices.begin(), internalVertices.end(), back_inserter(*this)); + push_back(exit); + +} + + + +AssemblyGraph::Superbubbles::~Superbubbles() +{ + cGraph.clearVertexNumbering(); +} + + + +// Remove short superbubbles with one entry and one exit. +bool AssemblyGraph::removeShortSuperbubbles( + bool debug, + uint64_t maxOffset1, // Used to define superbubbles + uint64_t maxOffset2) // Compared against the offset between entry and exit +{ + AssemblyGraph& cGraph = *this; + bool changesWereMade = false; + + // Find the superbubbles. + Superbubbles superbubbles(cGraph, maxOffset1); + + // Loop over the superbubbles. + for(uint64_t superbubbleId=0; superbubbleId<superbubbles.size(); superbubbleId++) { + Superbubble& superbubble = superbubbles.getSuperbubble(superbubbleId); + SHASTA_ASSERT(superbubble.size() > 1); + + if(debug) { + cout << "Found a superbubble with " << superbubble.size() << " vertices:"; + for(const vertex_descriptor v: superbubble) { + cout << " " << cGraph[v].edgeId; + } + cout << endl; + } + + // Skip it if it has more than one entrance or exit. + if(not(superbubble.entrances.size()==1 and superbubble.exits.size()==1)) { + if(debug) { + cout << "This superbubble will not be removed because it has " << + superbubble.entrances.size() << " entrances and " << + superbubble.exits.size() << " exits." 
<< endl; + } + continue; + } + + const vertex_descriptor entrance = superbubble.entrances.front(); + const vertex_descriptor exit = superbubble.exits.front(); + if(entrance == exit) { + if(debug) { + cout << "This superbubble will not be removed because it the entrance vertex" + " is the same as the exit vertex." << endl; + } + continue; + } + + // Check the base offset between the entrance and the exit. + MarkerGraphEdgePairInfo info; + SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(cGraph[entrance].edgeId, cGraph[exit].edgeId, info)); + if(info.common == 0) { + if(debug) { + cout << "This superbubble will not be removed because " + "there are no common oriented reads between the entrance and the exit." << endl; + } + continue; + } + if(info.offsetInBases > int64_t(maxOffset2)) { + if(debug) { + cout << "This superbubble will not be removed because offsetInBases is " << + info.offsetInBases << endl; + } + continue; + } + +#if 1 + // If a trivial superbubble, skip it. + // Trivial means: + // - Has two vertices of which one is the entrance and one is the exit. + // - There is only one edge between the two. + if(superbubble.size() == 2) { + uint64_t edgeCount = 0; + BGL_FORALL_OUTEDGES(entrance, e, cGraph, AssemblyGraph) { + if(target(e, cGraph) == exit) { + ++edgeCount; + } + } + if(edgeCount == 1) { + if(debug) { + cout << "This superbubble will not be removed because it is trivial." << endl; + } + continue; + } + } +#endif + if(debug) { + cout << "This superbubble will be removed." << endl; + } + + // Remove all vertices and edges internal to the superbubble. + for(const vertex_descriptor cv: superbubble) { + if(cv!=entrance and cv!=exit) { + boost::clear_vertex(cv, cGraph); + cGraph.removeVertex(cv); + } + } + // We must also remove edges between the entrance and the exit. 
+ vector<edge_descriptor> entranceToExitEdges; + BGL_FORALL_OUTEDGES(entrance, ce, cGraph, AssemblyGraph) { + if(target(ce, cGraph) == exit) { + entranceToExitEdges.push_back(ce); + } + } + for(const edge_descriptor ce: entranceToExitEdges) { + boost::remove_edge(ce, cGraph); + } + vector<edge_descriptor> exitToEntranceEdges; + BGL_FORALL_OUTEDGES(exit, ce, cGraph, AssemblyGraph) { + if(target(ce, cGraph) == entrance) { + exitToEntranceEdges.push_back(ce); + } + } + for(const edge_descriptor ce: exitToEntranceEdges) { + boost::remove_edge(ce, cGraph); + } + + // Generate an edge between the entrance and the exit. + // This will be a BubbleChain consisting of a single haploid Bubble. + edge_descriptor eNew; + tie(eNew, ignore) = add_edge(entrance, exit, cGraph); + AssemblyGraphEdge& newEdge = cGraph[eNew]; + newEdge.id = nextEdgeId++; + BubbleChain& bubbleChain = newEdge; + bubbleChain.resize(1); + Bubble& bubble = bubbleChain.front(); + bubble.resize(1); + Chain& chain = bubble.front(); + chain.push_back(cGraph[entrance].edgeId); + chain.push_back(cGraph[exit].edgeId); + + changesWereMade = true; + } + + return changesWereMade; +} + + + +// Cleanup/simplify superbubbles that are likely to be caused by errors, +// completely or in part. +void AssemblyGraph::cleanupSuperbubbles( + bool debug, + uint64_t maxOffset1, // Used to define superbubbles + uint64_t maxOffset2, // Compared against the offset between entry and exit + uint64_t chainTerminalCommonThreshold) +{ + AssemblyGraph& cGraph = *this; + + if(debug) { + cout << "cleanupSuperbubbles begins." << endl; + } + + // Find the superbubbles. + Superbubbles superbubbles(cGraph, maxOffset1); + + // The bubbles constructed in this way are guaranteed to not overlap, + // so we don't have to worry about overlapping bubbles. + std::set<vertex_descriptor> previousSuperbubblesVertices; + + // Loop over the superbubbles. 
+ for(uint64_t superbubbleId=0; superbubbleId<superbubbles.size(); superbubbleId++) { + cleanupSuperbubble(debug, superbubbles, superbubbleId, + maxOffset2, chainTerminalCommonThreshold, previousSuperbubblesVertices); + } + if(debug) { + cout << "cleanupSuperbubbles ends." << endl; + } +} + + + +// This version of superbubble cleanup uses dominator trees to define superbubbles, +// instead of computing connected components using edges of length up to maxOffset1. +void AssemblyGraph::cleanupSuperbubbles( + bool debug, + uint64_t maxOffset2, // Compared against the offset between entry and exit + uint64_t chainTerminalCommonThreshold) +{ + performanceLog << timestamp << "AssemblyGraph::cleanupSuperbubbles begins." << endl; + AssemblyGraph& cGraph = *this; + + if(debug) { + cout << "cleanupSuperbubbles begins." << endl; + } + + // Find the superbubbles using dominator trees. + Superbubbles superbubbles(cGraph); + + // The superbubbles found in this way can have overlaps. + // To deal with this, we process superbubbles in order of increasing size + // and keep track of the vertices. + // If a bubble contains a previously encountered vertex, don't process it. + // Note cleanupSuperbubble does not create any new vertices, + // so keeping track of the vertex descriptors that were removed is safe. + std::set<vertex_descriptor> previousSuperbubblesVertices; + + // Sort the superbubbles in order of increasing size. + vector< pair<uint64_t, uint64_t> > superbubbleTable; // (superbubbleId, size) + for(uint64_t superbubbleId=0; superbubbleId<superbubbles.size(); superbubbleId++) { + const Superbubble& superbubble = superbubbles.getSuperbubble(superbubbleId); + superbubbleTable.push_back({superbubbleId, superbubble.size()}); + } + sort(superbubbleTable.begin(), superbubbleTable.end(), + OrderPairsBySecondOnly<uint64_t, uint64_t>()); + + // Loop over the superbubbles in order of increasing size.
+ for(const auto& p: superbubbleTable) { + const uint64_t superbubbleId = p.first; + cleanupSuperbubble(debug, superbubbles, superbubbleId, maxOffset2, + chainTerminalCommonThreshold, previousSuperbubblesVertices); + } + if(debug) { + cout << "cleanupSuperbubbles ends." << endl; + } + performanceLog << timestamp << "AssemblyGraph::cleanupSuperbubbles ends." << endl; + +} + + + +// Cleanup/simplify a superbubble that is likely to be caused by errors, +// completely or in part. +// This handles superbubbles caused by two marker graph bubbles with +// no primary edges in between. +void AssemblyGraph::cleanupSuperbubble( + bool debug, + const Superbubbles& superbubbles, + uint64_t superbubbleId, + uint64_t maxOffset2, // Compared against the offset between entry and exit + uint64_t chainTerminalCommonThreshold, + std::set<vertex_descriptor>& previousSuperbubblesVertices) +{ + AssemblyGraph& cGraph = *this; + const Superbubble& superbubble = superbubbles.getSuperbubble(superbubbleId); + +#if 0 + debug = (superbubble.entrances.size() == 1 and + (cGraph[superbubble.entrances.front()].edgeId == 16093908 or + cGraph[superbubble.entrances.front()].edgeId == 9555933)); +#endif + + if(debug) { + cout << "Working on a superbubble with " << superbubble.size() << " vertices:"; + for(const vertex_descriptor v: superbubble) { + cout << " " << cGraph[v].edgeId; + } + cout << endl; + } + + // See if it overlaps any vertices of previous superbubbles. + bool overlaps = false; + for(const vertex_descriptor v: superbubble) { + if(previousSuperbubblesVertices.contains(v)) { + if(debug) { + cout << "This superbubble ignored because it contains vertex " << cGraph[v].edgeId << + " which is in a previously processed superbubble." << endl; + } + overlaps = true; + break; + } + } + for(const vertex_descriptor v: superbubble) { + previousSuperbubblesVertices.insert(v); + } + if(overlaps) { + return; + } + + // Skip it if it has more than one entrance or exit. 
+ if(not(superbubble.entrances.size()==1 and superbubble.exits.size()==1)) { + if(debug) { + cout << "This superbubble will be skipped because it has " << + superbubble.entrances.size() << " entrances and " << + superbubble.exits.size() << " exits." << endl; + } + return; + } + + const vertex_descriptor entrance = superbubble.entrances.front(); + const vertex_descriptor exit = superbubble.exits.front(); + if(debug) { + cout << "Entrance " << cGraph[entrance].edgeId << endl; + cout << "Exit " << cGraph[exit].edgeId << endl; + } + + if(entrance == exit) { + if(debug) { + cout << "This superbubble will be skipped because the entrance vertex" + " is the same as the exit vertex." << endl; + } + return; + } + + + + // Check the base offset between the entrance and the exit. + MarkerGraphEdgePairInfo info; + SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(cGraph[entrance].edgeId, cGraph[exit].edgeId, info)); + if(info.common == 0) { + if(debug) { + cout << "This superbubble will be skipped because " + "there are no common oriented reads between the entrance and the exit." << endl; + } + return; + } + if(info.offsetInBases > int64_t(maxOffset2)) { + if(debug) { + cout << "This superbubble will be skipped because offsetInBases is " << + info.offsetInBases << endl; + } + return; + } + + // If a trivial superbubble, skip it. + // Trivial means: + // - Has two vertices of which one is the entrance and one is the exit. + // - There is only one edge between the two. + if(superbubble.size() == 2) { + uint64_t edgeCount = 0; + BGL_FORALL_OUTEDGES(entrance, e, cGraph, AssemblyGraph) { + if(target(e, cGraph) == exit) { + ++edgeCount; + } + } + if(edgeCount == 1) { + if(debug) { + cout << "This superbubble be skipped because it is trivial." << endl; + } + return; + } + } + + // Find the out-edges of the entrance that go inside the superbubble. 
+ vector<edge_descriptor> entranceOutEdges; + BGL_FORALL_OUTEDGES(entrance, ce, cGraph, AssemblyGraph) { + const vertex_descriptor cv = target(ce, cGraph); + if(superbubbles.isInSuperbubble(superbubbleId, cv)) { + entranceOutEdges.push_back(ce); + } + } + sort(entranceOutEdges.begin(), entranceOutEdges.end()); + + // Find the in-edges of the exit that come from inside the superbubble. + vector<edge_descriptor> exitInEdges; + BGL_FORALL_INEDGES(exit, ce, cGraph, AssemblyGraph) { + const vertex_descriptor cv = source(ce, cGraph); + if(superbubbles.isInSuperbubble(superbubbleId, cv)) { + exitInEdges.push_back(ce); + } + } + sort(exitInEdges.begin(), exitInEdges.end()); + + if(debug) { + cout << "Entrance out-edges to inside the superbubble:"; + for(const edge_descriptor ce: entranceOutEdges) { + cout << " " << bubbleChainStringId(ce); + } + cout << endl; + cout << "Exit in-edges from inside the superbubble:"; + for(const edge_descriptor ce: exitInEdges) { + cout << " " << bubbleChainStringId(ce); + } + cout << endl; + } + + // If there are common edges between the entranceOutEdges and exitInEdges, + // skip this superbubble. + { + vector<edge_descriptor> commonEdges; + std::set_intersection( + entranceOutEdges.begin(), entranceOutEdges.end(), + exitInEdges.begin(), exitInEdges.end(), + back_inserter(commonEdges)); + + if(not commonEdges.empty()) { + if(debug) { + cout << "This superbubble will be skipped because there are " << + commonEdges.size() << " common edges between the out-edges of the entrance " + "and the in-edges of the exit." << endl; + } + return; + } + } + + + // We will consider replacing this superbubble with either its "entrance bubble" + // or its "exit bubble": + // - The "entrance bubble" is obtained by removing all edges + // except for the out-edges of the entrance, and joining them directly with the exit. + // - The "exit bubble" is obtained by removing all edges + // except for the in-edges of the exit, and joining the entry directly with them. 
+ + + + // If there are exactly two entranceOutEdges, construct the entrance bubble. + // This can only be done if the two entranceOutEdges consist of simple chains. + Bubble entranceBubble; + if(entranceOutEdges.size() == 2) { + + // See if the two entranceOutEdges consist of simple chains. + bool canDo = true; + for(const edge_descriptor ce: entranceOutEdges) { + if(not cGraph[ce].isSimpleChain()) { + canDo = false; + break; + } + } + + // Only continue creating the entranceBubble if both entranceOutEdges + // consist of simple chains. + if(canDo) { + + // Construct the two chains of the entranceBubble and assemble their sequence. + entranceBubble.resize(2); + ofstream noCsv; + for(uint64_t i=0; i<2; i++) { + const edge_descriptor entranceOutEdge = entranceOutEdges[i]; + Chain& chain = entranceBubble[i]; + chain = cGraph[entranceOutEdge].getOnlyChain(); + chain.push_back(cGraph[exit].edgeId); + assembleChain(chain, chainTerminalCommonThreshold); + } + + if(debug) { + cout << "Entrance bubble:" << endl; + for(uint64_t i=0; i<2; i++) { + const Chain& chain = entranceBubble[i]; + cout << "Entrance bubble chain " << i << ":"; + for (const MarkerGraphEdgeId edgeId: chain) { + cout << " " << edgeId; + } + cout << endl; + } + for(uint64_t i=0; i<2; i++) { + const Chain& chain = entranceBubble[i]; + cout << ">Entrance-" << i << " " << chain.sequence.size() << "\n"; + copy(chain.sequence.begin(), chain.sequence.end(), ostream_iterator<shasta::Base>(cout)); + cout << "\n"; + } + } + + // If the sequences differ just by a copy number of short periodicity, + // the entrance bubble is probably caused by errors and so we don't want to use it. + const uint64_t period = isCopyNumberDifference(entranceBubble[0].sequence, entranceBubble[1].sequence, 4); + if(debug) { + cout << "Period " << period << "\n"; + } + if(period != 0) { + entranceBubble.clear(); + } + } + } + + + + // If there are exactly two exitInEdges, construct the exit bubble.
+ // This can only be done if the two exitInEdges consist of simple chains. + Bubble exitBubble; + if(exitInEdges.size() == 2) { + + // See if the two exitInEdges consist of simple chains. + bool canDo = true; + for(const edge_descriptor ce: exitInEdges) { + if(not cGraph[ce].isSimpleChain()) { + canDo = false; + break; + } + } + + // Only continue creating the exitBubble if both exitInEdges + // consist of simple chains. + if(canDo) { + + // Construct the two chains of the exitBubble and assemble their sequence. + exitBubble.resize(2); + ofstream noCsv; + for(uint64_t i=0; i<2; i++) { + const edge_descriptor exitInEdge = exitInEdges[i]; + Chain& chain = exitBubble[i]; + chain.push_back(cGraph[entrance].edgeId); + const Chain& exitChain = cGraph[exitInEdge].getOnlyChain(); + copy(exitChain.begin(), exitChain.end(), back_inserter(chain)); + assembleChain(chain, chainTerminalCommonThreshold); + } + + if(debug) { + cout << "Exit bubble:" << endl; + for(uint64_t i=0; i<2; i++) { + const Chain& chain = exitBubble[i]; + cout << "Exit bubble chain " << i << ":"; + for (const MarkerGraphEdgeId edgeId: chain) { + cout << " " << edgeId; + } + cout << endl; + } + for(uint64_t i=0; i<2; i++) { + const Chain& chain = exitBubble[i]; + cout << ">Exit-" << i << " " << chain.sequence.size() << "\n"; + copy(chain.sequence.begin(), chain.sequence.end(), ostream_iterator<shasta::Base>(cout)); + cout << "\n"; + } + } + + // If the sequences differ just by a copy number of short periodicity, + // the exit bubble is probably caused by errors and so we don't want to use it. + const uint64_t period = isCopyNumberDifference(exitBubble[0].sequence, exitBubble[1].sequence, 4); + if(debug) { + cout << "Period " << period << "\n"; + } + if(period != 0) { + exitBubble.clear(); + } + } + } + + + // Handle the case where both the entrance and the exit bubble look usable.
+ if(entranceBubble.size() == 2 and exitBubble.size() == 2) { + + // If the entrance and exit bubbles have the same assembled sequences, we can just keep one of them. + const auto& entrance0 = entranceBubble[0].sequence; + const auto& entrance1 = entranceBubble[1].sequence; + const auto& exit0 = exitBubble[0].sequence; + const auto& exit1 = exitBubble[1].sequence; + if( + (entrance0 == exit0 and entrance1 == exit1) + or + (entrance0 == exit1 and entrance1 == exit0)) { + if(debug) { + cout << "The entrance and exit bubbles are equivalent." << endl; + cout << "Keeping only the entrance bubble." << endl; + } + exitBubble.clear(); + } else { + + // In other cases it is difficult to pick which bubble is best to keep, + // so we remove both of them. + // This is no worse than letting removeShortBubbles remove it. + // The sequence assembly process will still pick the best sequence + // for each haplotype, but these bubbles are excluded from the + // phasing/detangling process. + entranceBubble.clear(); + exitBubble.clear(); + + if(debug) { + cout << "Both the entrance and the exit bubble are usable but both will be removed." << endl; + } + + } + } + + + + // Figure out which ones of the entrance/exit bubbles is usable. + SHASTA_ASSERT(entranceBubble.size() == 0 or entranceBubble.size() == 2); + SHASTA_ASSERT(exitBubble.size() == 0 or exitBubble.size() == 2); + const bool entranceBubbleIsGood = (entranceBubble.size() == 2); + const bool exitBubbleIsGood = (exitBubble.size() == 2); + + + if(entranceBubbleIsGood) { + if(exitBubbleIsGood) { + if(debug) { + cout << "Both the entrance bubble and the exit bubble are good." << endl; + } + SHASTA_ASSERT(0); + } else { + if(debug) { + cout << "Only the entrance bubble is good." << endl; + } + } + } else { + if(exitBubbleIsGood) { + if(debug) { + cout << "Only the exit bubble is good." << endl; + } + } else { + if(debug) { + cout << "Neither the entrance bubble nor the exit bubble are good." 
<< endl; + } + } + } + + + // Remove all vertices and edges internal to the superbubble. + for(const vertex_descriptor cv: superbubble) { + if(cv != entrance and cv != exit) { + clear_vertex(cv, cGraph); + remove_vertex(cv, cGraph); + } + } + + // Create the new edge and bubble chain between the entrance and the exit that will replace + // the superbubble. + edge_descriptor ce; + tie(ce, ignore) = add_edge(entrance, exit, cGraph); + AssemblyGraphEdge& edge = cGraph[ce]; + edge.id = nextEdgeId++; + BubbleChain& bubbleChain = edge; + SHASTA_ASSERT(not (entranceBubbleIsGood and exitBubbleIsGood)); + if(entranceBubbleIsGood or exitBubbleIsGood) { + const Bubble& newBubble = entranceBubbleIsGood ? entranceBubble : exitBubble; + SHASTA_ASSERT(newBubble.size() == 2); + bubbleChain.push_back(newBubble); + } else { + Chain newChain; + newChain.push_back(cGraph[entrance].edgeId); + newChain.push_back(cGraph[exit].edgeId); + Bubble newBubble; + newBubble.push_back(newChain); + bubbleChain.push_back(newBubble); + } + +} + + + +#if 0 +bool AssemblyGraph::detangleVerticesStrict(bool debug) +{ + if(debug) { + cout << "Detangling vertices." << endl; + } + AssemblyGraph& cGraph = *this; + + vector<vertex_descriptor> allVertices; + BGL_FORALL_VERTICES(cv, cGraph, AssemblyGraph) { + allVertices.push_back(cv); + } + + uint64_t detangledCount = 0; + for(const vertex_descriptor cv: allVertices) { + if(detangleVertexStrict(cv, debug)) { + ++detangledCount; + } + } + + if(debug) { + cout << "Detangled " << detangledCount << " vertices." << endl; + + } + + return detangledCount > 0; +} +#endif + + + +bool AssemblyGraph::detangleVertices( + bool debug, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP) +{ + if(debug) { + cout << "Detangling vertices." 
<< endl; + } + AssemblyGraph& cGraph = *this; + + vector<vertex_descriptor> allVertices; + BGL_FORALL_VERTICES(cv, cGraph, AssemblyGraph) { + allVertices.push_back(cv); + } + + uint64_t detangledCount = 0; + for(const vertex_descriptor cv: allVertices) { + if(detangleVertex(cv, debug, detangleToleranceLow, detangleToleranceHigh, + useBayesianModel, epsilon, minLogP)) { + ++detangledCount; + } + } + + if(debug) { + cout << "Detangled " << detangledCount << " vertices." << endl; + } + + return detangledCount > 0; +} + + + +bool AssemblyGraph::detangleVerticesGeneral( + bool debug, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP) +{ + if(debug) { + cout << "Detangling vertices (general detangling)." << endl; + } + AssemblyGraph& cGraph = *this; + + vector<vertex_descriptor> allVertices; + BGL_FORALL_VERTICES(cv, cGraph, AssemblyGraph) { + allVertices.push_back(cv); + } + + uint64_t detangledCount = 0; + for(const vertex_descriptor cv: allVertices) { + if(detangleVertexGeneral(cv, debug, detangleToleranceLow, detangleToleranceHigh, + useBayesianModel, epsilon, minLogP)) { + ++detangledCount; + } + } + + if(debug) { + cout << "Detangled " << detangledCount << " vertices." << endl; + + } + + return detangledCount > 0; +} + + +// Compute the tangle matrix given in-edges and out-edges. +// The last bubble of each in-edge and the first bubble +// of each out-edge must be haploid. 
+void AssemblyGraph::computeTangleMatrix( + const vector<edge_descriptor>& inEdges, + const vector<edge_descriptor>& outEdges, + vector< vector<uint64_t> >& tangleMatrix, + bool setToZeroForComplementaryPairs + ) const +{ + const AssemblyGraph& cGraph = *this; + + tangleMatrix.clear(); + tangleMatrix.resize(inEdges.size(), vector<uint64_t>(outEdges.size())); + + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + const edge_descriptor ce0 = inEdges[i0]; + const BubbleChain& bubbleChain0 = cGraph[ce0]; + const Bubble& bubble0 = bubbleChain0.lastBubble(); + SHASTA_ASSERT(bubble0.isHaploid()); + const Chain& chain0 = bubble0.front(); + SHASTA_ASSERT(chain0.size() >= 2); + const MarkerGraphEdgeId markerGraphEdgeId0 = chain0[chain0.size() - 2]; // Exclude last + + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + const edge_descriptor ce1 = outEdges[i1]; + const BubbleChain& bubbleChain1 = cGraph[ce1]; + const Bubble& bubble1 = bubbleChain1.firstBubble(); + SHASTA_ASSERT(bubble1.isHaploid()); + const Chain& chain1 = bubble1.front(); + SHASTA_ASSERT(chain1.size() >= 2); + const MarkerGraphEdgeId markerGraphEdgeId1 = chain1[1]; // Exclude first + + if(setToZeroForComplementaryPairs and + assembler.markerGraph.reverseComplementEdge[markerGraphEdgeId0] == markerGraphEdgeId1) { + tangleMatrix[i0][i1] = 0; + } else { + MarkerGraphEdgePairInfo info; + SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(markerGraphEdgeId0, markerGraphEdgeId1, info)); + tangleMatrix[i0][i1] = info.common; + } + } + } +} + + + +#if 0 +// This works if the following is true: +// - For all incoming edges (bubble chains) of cv, the last bubble is haploid. +// - For all outgoing edges (bubble chains) of cv, the first bubble is haploid. +bool AssemblyGraph::detangleVertexStrict( + vertex_descriptor cv, bool debug) +{ + AssemblyGraph& cGraph = *this; + + // Gather the in-edges and check that the last bubble is haploid. 
+ vector<edge_descriptor> inEdges; + BGL_FORALL_INEDGES(cv, ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.lastBubble().isHaploid()) { + return false; + } + inEdges.push_back(ce); + } + + // Gather the out-edges and check that the first bubble is haploid. + vector<edge_descriptor> outEdges; + BGL_FORALL_OUTEDGES(cv, ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.firstBubble().isHaploid()) { + return false; + } + outEdges.push_back(ce); + } + + if(inEdges.size() == 1 and outEdges.size() == 1) { + return false; + } + + // Compute the tangle matrix. + vector< vector<uint64_t> > tangleMatrix; + computeTangleMatrix(inEdges, outEdges, tangleMatrix, false); + + if(debug) { + cout << "Tangle matrix for vertex " << cGraph[cv].edgeId << endl; + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + cout << bubbleChainStringId(inEdges[i0]) << " " << + bubbleChainStringId(outEdges[i1]) << " " << + tangleMatrix[i0][i1] << endl; + } + } + } + + // If the tangle matrix contains no zeros, there is nothing to do. + bool foundZero = false; + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + if(tangleMatrix[i0][i1] == 0) { + foundZero = true; + break; + } + } + if(foundZero) { + break; + } + } + if(not foundZero) { + return false; + } + + // To avoid breaking contiguity, we require each column and each row of the + // tangle matrix to have at least one non-zero element. + // This means that each in-edge will be "merged" with at least one out-edge, + // and each out-edge will be "merged" with at least one in-edge. 
+ for(uint64_t i0=0; i0<inEdges.size(); i0++) { + bool foundNonZero = false; + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + if(tangleMatrix[i0][i1] != 0) { + foundNonZero = true; + break; + } + } + if(not foundNonZero) { + return false; + } + } + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + bool foundNonZero = false; + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + if(tangleMatrix[i0][i1] != 0) { + foundNonZero = true; + break; + } + } + if(not foundNonZero) { + return false; + } + } + + if(debug) { + cout << "This vertex will be detangled " << inEdges.size() << " by " << outEdges.size() << endl; + } + + + + // Each non-zero element of the tangle matrix generates a new edge, + // obtained by "merging" an in-edge with an out-edge. + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + const edge_descriptor ce0 = inEdges[i0]; + const BubbleChain& bubbleChain0 = cGraph[ce0]; + const Bubble& bubble0 = bubbleChain0.lastBubble(); + SHASTA_ASSERT(bubble0.isHaploid()); + const Chain& chain0 = bubble0.front(); + SHASTA_ASSERT(chain0.size() >= 2); + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + if(tangleMatrix[i0][i1] == 0) { + continue; + } + const edge_descriptor ce1 = outEdges[i1]; + const BubbleChain& bubbleChain1 = cGraph[ce1]; + const Bubble& bubble1 = bubbleChain1.firstBubble(); + SHASTA_ASSERT(bubble1.isHaploid()); + const Chain& chain1 = bubble1.front(); + SHASTA_ASSERT(chain1.size() >= 2); + + edge_descriptor eNew; + tie(eNew, ignore) = add_edge(source(ce0, cGraph), target(ce1, graph), cGraph); + AssemblyGraphEdge& newEdge = cGraph[eNew]; + newEdge.id = nextEdgeId++; + BubbleChain& newBubbleChain = newEdge; + + if(debug) { + cout << "Merging " << + bubbleChainStringId(ce0) << " " << + bubbleChainStringId(ce1) << " into " << + bubbleChainStringId(eNew) << endl; + } + + // Create the new BubbleChain. 
It is obtained by joining + // bubbleChain0 and bubbleChain1, with vertex cv + // removed from the end of bubbleChain0 + // and from the beginning of bubbleChain1. + // Here we use the above assumption that + // the last bubble of bubbleChain0 and the first bubble of bubbleChain1 + // are haploid. + newBubbleChain = bubbleChain0; + + // Remove cv from the end. + Bubble& newBubbleLast = newBubbleChain.back(); + SHASTA_ASSERT(newBubbleLast.size() == 1); + Chain& newChainLast = newBubbleLast.front(); + SHASTA_ASSERT(newChainLast.back() == cGraph[cv].edgeId); + newChainLast.resize(newChainLast.size() - 1); + + // Append chain1, except for cv. + SHASTA_ASSERT(chain1.front() == cGraph[cv].edgeId); + copy(chain1.begin() + 1, chain1.end(), back_inserter(newChainLast)); + + // Append the rest of bubbleChain1. + copy(bubbleChain1.begin() + 1, bubbleChain1.end(), back_inserter(newBubbleChain)); + } + + } + + // Now we can remove cv and all of its in-edges and out-edges. + clear_vertex(cv, cGraph); + cGraph.removeVertex(cv); + + return true; +} +#endif + + + +bool AssemblyGraph::detangleVertex( + vertex_descriptor cv, + bool debug, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP) +{ + AssemblyGraph& cGraph = *this; + + if(debug) { + cout << "Attempting to detangle vertex " << cGraph[cv].edgeId << endl; + } + + + // Gather the in-edges and check that the last bubble is haploid. + vector<edge_descriptor> inEdges; + BGL_FORALL_INEDGES(cv, ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.lastBubble().isHaploid()) { + if(debug) { + cout << "Not detangled because the last bubble of in-edge " << + bubbleChainStringId(ce) << " is not haploid." << endl; + } + return false; + } + inEdges.push_back(ce); + } + + // Gather the out-edges and check that the first bubble is haploid. 
+ vector<edge_descriptor> outEdges; + BGL_FORALL_OUTEDGES(cv, ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.firstBubble().isHaploid()) { + if(debug) { + cout << "Not detangled because the first bubble of out-edge " << + bubbleChainStringId(ce) << " is not haploid." << endl; + } + return false; + } + outEdges.push_back(ce); + } + + if(inEdges.size() == 0 or outEdges.size() == 0) { + if(debug) { + cout << "Not detangling due to degree (case 1)." << endl; + } + return false; + } + if(inEdges.size() < 2 and outEdges.size() < 2) { + if(debug) { + cout << "Not detangling due to degree (case 2)." << endl; + } + return false; + } + + + + // If a MarkerGraphEdgeId appears both in the inEdges and in the outEdges, + // detangling could generate a chain with two consecutive copies of the same + // MarkerGraphEdgeId. Don't detangle. + for(const edge_descriptor ce0: inEdges) { + const BubbleChain& bubbleChain0 = cGraph[ce0]; + const Bubble& bubble0 = bubbleChain0.lastBubble(); + SHASTA_ASSERT(bubble0.isHaploid()); + const Chain& chain0 = bubble0.front(); + SHASTA_ASSERT(chain0.size() >= 2); + const MarkerGraphEdgeId markerGraphEdgeId0 = chain0[chain0.size() - 2]; // Exclude last + + for(const edge_descriptor ce1: outEdges) { + const BubbleChain& bubbleChain1 = cGraph[ce1]; + const Bubble& bubble1 = bubbleChain1.firstBubble(); + SHASTA_ASSERT(bubble1.isHaploid()); + const Chain& chain1 = bubble1.front(); + SHASTA_ASSERT(chain1.size() >= 2); + const MarkerGraphEdgeId markerGraphEdgeId1 = chain1[1]; // Exclude first + + if(markerGraphEdgeId0 == markerGraphEdgeId1) { + if(debug) { + cout << "Not detangling due to cycle." << endl; + } + return false; + } + } + } + + + + // Compute the tangle matrix. 
+ vector< vector<uint64_t> > tangleMatrix; + computeTangleMatrix(inEdges, outEdges, tangleMatrix, false); + + if(debug) { + cout << "Tangle matrix for vertex " << cGraph[cv].edgeId << endl; + + cout << "In-edges: "; + for(const edge_descriptor ce: inEdges) { + cout << " " << bubbleChainStringId(ce); + } + cout << endl; + + cout << "Out-edges: "; + for(const edge_descriptor ce: outEdges) { + cout << " " << bubbleChainStringId(ce); + } + cout << endl; + + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + cout << bubbleChainStringId(inEdges[i0]) << " " << + bubbleChainStringId(outEdges[i1]) << " " << + tangleMatrix[i0][i1] << endl; + } + } + } + + + + // Do the detangling based on the tangle matrix. + if(useBayesianModel and inEdges.size() == 2 and outEdges.size() == 2) { + + // Use the 2 by 2 Bayesian model for detangling. + array< array<uint64_t, 2>, 2> tangleMatrix22; + for(uint64_t i=0; i<2; i++) { + for(uint64_t j=0; j<2; j++) { + tangleMatrix22[i][j] = tangleMatrix[i][j]; + } + } + + // Compute logarithmic probability ratio of in-phase and out-of-phase + // against random. + double logPin; + double logPout; + tie(logPin, logPout) = diploidBayesianPhase(tangleMatrix22, epsilon); + if(debug) { + cout << "logPin = " << logPin << ", logPout = " << logPout << endl; + } + + // const bool isInPhase = (logPin >= minLogP) and ((logPin - logPout) >= minLogP); + // const bool isOutOfPhase = (logPout >= minLogP) and ((logPout - logPin) >= minLogP); + // Ignore the random hypothesis. + const bool isInPhase = (logPin - logPout) >= minLogP; + const bool isOutOfPhase = (logPout - logPin) >= minLogP; + + if(isInPhase or isOutOfPhase) { + + // We can detangle. + if(debug) { + cout << "This vertex will be detangled." << endl; + } + + // Create truncated versions of the inEdges and outEdges. 
+ vector<vertex_descriptor> inVertices; + for(const edge_descriptor ce: inEdges) { + inVertices.push_back(cloneAndTruncateAtEnd(ce)); + } + vector<vertex_descriptor> outVertices; + for(const edge_descriptor ce: outEdges) { + outVertices.push_back(cloneAndTruncateAtBeginning(ce)); + } + + if(isInPhase) { + connect(inVertices[0], outVertices[0]); + connect(inVertices[1], outVertices[1]); + } else { + connect(inVertices[0], outVertices[1]); + connect(inVertices[1], outVertices[0]); + } + + // Now we can remove cv and all of its in-edges and out-edges. + clear_vertex(cv, cGraph); + cGraph.removeVertex(cv); + return true; + + } else { + + // Ambiguous. Don't detangle. + if(debug) { + cout << "This vertex will not be detangled." << endl; + } + return false; + } + + } else { + + // Don't use the Bayesian model. + // Instead, do simple counting of tangle matrix elements. + + // Count the number of significant, ambiguous, and negligible elements + // in the tangle matrix. + uint64_t significantCount = 0; + uint64_t ambiguousCount = 0; + uint64_t negligibleCount = 0; + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + const uint64_t t = tangleMatrix[i0][i1]; + if(t <= detangleToleranceLow) { + ++negligibleCount; + } else if(t >= detangleToleranceHigh) { + ++significantCount; + } else { + ++ambiguousCount; + } + } + } + + // If the tangle matrix contains any ambiguous elements, do nothing. + if(ambiguousCount > 0) { + return false; + } + + // There are no ambiguous elements. + // If there are no negligible element, that is all elements of the tangle matrix are significant, + // there is nothing to do. + if(negligibleCount == 0) { + return false; + } + + // To avoid breaking contiguity, we require each column and each row of the + // tangle matrix to have at least one significant element. 
+ // This means that each in-edge will be "merged" with at least one out-edge, + // and each out-edge will be "merged" with at least one in-edge. + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + bool foundSignificant = false; + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + if(tangleMatrix[i0][i1] >= detangleToleranceHigh) { + foundSignificant = true; + break; + } + } + if(not foundSignificant) { + return false; + } + } + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + bool foundSignificant = false; + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + if(tangleMatrix[i0][i1] >= detangleToleranceHigh) { + foundSignificant = true; + break; + } + } + if(not foundSignificant) { + return false; + } + } + + if(debug) { + cout << "This vertex will be detangled " << inEdges.size() << " by " << outEdges.size() << endl; + } + + // Create truncated versions of the inEdges and outEdges. + vector<vertex_descriptor> inVertices; + for(const edge_descriptor ce: inEdges) { + inVertices.push_back(cloneAndTruncateAtEnd(ce)); + } + vector<vertex_descriptor> outVertices; + for(const edge_descriptor ce: outEdges) { + outVertices.push_back(cloneAndTruncateAtBeginning(ce)); + } + + // Each significant element of the tangle matrix generates a new edge. + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + if(tangleMatrix[i0][i1] >= detangleToleranceHigh) { + connect(inVertices[i0], outVertices[i1]); + } + } + } + + // Now we can remove cv and all of its in-edges and out-edges. + clear_vertex(cv, cGraph); + cGraph.removeVertex(cv); + return true; + } + + +#if 0 + // Each significant element of the tangle matrix generates a new edge, + // obtained by "merging" an in-edge with an out-edge. 
+ for(uint64_t i0=0; i0<inEdges.size(); i0++) { + const edge_descriptor ce0 = inEdges[i0]; + const BubbleChain& bubbleChain0 = cGraph[ce0]; + const Bubble& bubble0 = bubbleChain0.lastBubble(); + SHASTA_ASSERT(bubble0.isHaploid()); + const Chain& chain0 = bubble0.front(); + SHASTA_ASSERT(chain0.size() >= 2); + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + if(tangleMatrix[i0][i1] < detangleToleranceHigh) { + continue; + } + const edge_descriptor ce1 = outEdges[i1]; + const BubbleChain& bubbleChain1 = cGraph[ce1]; + const Bubble& bubble1 = bubbleChain1.firstBubble(); + SHASTA_ASSERT(bubble1.isHaploid()); + const Chain& chain1 = bubble1.front(); + SHASTA_ASSERT(chain1.size() >= 2); + + edge_descriptor eNew; + tie(eNew, ignore) = add_edge(source(ce0, cGraph), target(ce1, graph), cGraph); + AssemblyGraphEdge& newEdge = cGraph[eNew]; + newEdge.id = nextEdgeId++; + BubbleChain& newBubbleChain = newEdge; + + if(debug) { + cout << "Merging " << + bubbleChainStringId(ce0) << " " << + bubbleChainStringId(ce1) << " into " << + bubbleChainStringId(eNew) << endl; + } + + // Create the new BubbleChain. It is obtained by joining + // bubbleChain0 and bubbleChain1, with vertex cv + // removed from the end of bubbleChain0 + // and from the beginning of bubbleChain1. + // Here we use the above assumption that + // the last bubble of bubbleChain0 and the first bubble of bubbleChain1 + // are haploid. + newBubbleChain = bubbleChain0; + + // Remove cv from the end. + Bubble& newBubbleLast = newBubbleChain.back(); + SHASTA_ASSERT(newBubbleLast.size() == 1); + Chain& newChainLast = newBubbleLast.front(); + SHASTA_ASSERT(newChainLast.back() == cGraph[cv].edgeId); + newChainLast.resize(newChainLast.size() - 1); + + // Append chain1, except for cv. + SHASTA_ASSERT(chain1.front() == cGraph[cv].edgeId); + copy(chain1.begin() + 1, chain1.end(), back_inserter(newChainLast)); + + // Append the rest of bubbleChain1. 
+ copy(bubbleChain1.begin() + 1, bubbleChain1.end(), back_inserter(newBubbleChain)); + } + + } +#endif + + + SHASTA_ASSERT(0); +} + + + +// Ths version can handle the case where the last bubble of an in-edge +// or the first bubble of an out-edge is not haploid. +// It works like this: +// - Compute a generalized tangle matrix taking using the next to last +// MarkerGraphEdgeId of each incoming chain +// and the second MarkerGraphEdgeId of each outgoing chain. +// - If detangling is possible based on this generalized tangle matrix, +// split the last bubble of each incoming edge and the first +// bubble of each outgoing edge. After this operation, +// the last bubble of each in-edge is haploid and the first bubble +// of each out-edge is haploid. +// - Call detangleVertex to do the detangling. +bool AssemblyGraph::detangleVertexGeneral( + vertex_descriptor cv, + bool debug, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP) +{ + AssemblyGraph& cGraph = *this; + +#if 0 + // Use detangleVertex, if possible. + bool involvesNonHaploidBubbles = false; + BGL_FORALL_INEDGES(cv, ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.lastBubble().isHaploid()) { + involvesNonHaploidBubbles = true; + } + } + BGL_FORALL_OUTEDGES(cv, ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.firstBubble().isHaploid()) { + involvesNonHaploidBubbles = true; + } + } + if(not involvesNonHaploidBubbles) { + if(debug) { + cout << "No non-haploid bubbles involved, using detangleVertex." 
<< endl; + } + return detangleVertex(cv, debug, detangleToleranceLow, detangleToleranceHigh); + } +#endif + + if(in_degree(cv, cGraph) < 2 or out_degree(cv, cGraph) < 2) { + return false; + } + + if(debug) { + cout << "Attempting general detangling for vertex " << cGraph[cv].edgeId << endl; + } + + class ChainInfo { + public: + edge_descriptor ce; + uint64_t indexInBubble; + MarkerGraphEdgeId edgeId; + }; + vector<ChainInfo> inChains; + BGL_FORALL_INEDGES(cv, ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + const Bubble& bubble = bubbleChain.lastBubble(); + for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) { + const Chain& chain = bubble[indexInBubble]; + inChains.push_back({ce, indexInBubble, chain[chain.size() - 2]}); + } + } + vector<ChainInfo> outChains; + BGL_FORALL_OUTEDGES(cv, ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + const Bubble& bubble = bubbleChain.firstBubble(); + for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) { + const Chain& chain = bubble[indexInBubble]; + outChains.push_back({ce, indexInBubble, chain[1]}); + } + } + + if(debug) { + + cout << "In:" << endl; + for(const ChainInfo& chainInfo: inChains) { + cout << bubbleChainStringId(chainInfo.ce) << " " << + chainInfo.indexInBubble << " " << + chainInfo.edgeId << endl; + } + + cout << "Out:" << endl; + for(const ChainInfo& chainInfo: outChains) { + cout << bubbleChainStringId(chainInfo.ce) << " " << + chainInfo.indexInBubble << " " << + chainInfo.edgeId << endl; + } + } + + // Compute a generalized tangle matrix. 
+ vector<vector<uint64_t> > tangleMatrix(inChains.size(), vector<uint64_t>(outChains.size())); + for(uint64_t i0=0; i0<inChains.size(); i0++) { + const MarkerGraphEdgeId markerGraphEdgeId0 = inChains[i0].edgeId; + + for(uint64_t i1=0; i1<outChains.size(); i1++) { + const MarkerGraphEdgeId markerGraphEdgeId1 = outChains[i1].edgeId; + + MarkerGraphEdgePairInfo info; + SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(markerGraphEdgeId0, markerGraphEdgeId1, info)); + tangleMatrix[i0][i1] = info.common; + } + } + + if(debug) { + cout << "Tangle matrix:" << endl; + for(uint64_t i0=0; i0<inChains.size(); i0++) { + const ChainInfo& chainInfo0 = inChains[i0]; + for(uint64_t i1=0; i1<outChains.size(); i1++) { + const ChainInfo& chainInfo1 = outChains[i1]; + + cout << + bubbleChainStringId(chainInfo0.ce) << " " << + chainInfo0.indexInBubble << " " << + chainInfo0.edgeId << " " << + bubbleChainStringId(chainInfo1.ce) << " " << + chainInfo1.indexInBubble << " " << + chainInfo1.edgeId << " " << + tangleMatrix[i0][i1] << endl; + } + } + + } + + + // Figure out if we can detangle. + if(useBayesianModel and + (inChains.size() == 2) and + (outChains.size() == 2)) { + + // Use the 2 by 2 Bayesian model for detangling. + array< array<uint64_t, 2>, 2> tangleMatrix22; + for(uint64_t i=0; i<2; i++) { + for(uint64_t j=0; j<2; j++) { + tangleMatrix22[i][j] = tangleMatrix[i][j]; + } + } + + // Compute logarithmic probability ratio of in-phase and out-of-phase + // against random. + double logPin; + double logPout; + tie(logPin, logPout) = diploidBayesianPhase(tangleMatrix22, epsilon); + if(debug) { + cout << "logPin = " << logPin << ", logPout = " << logPout << endl; + } + + const bool isInPhase = (logPin >= minLogP) and ((logPin - logPout) >= minLogP); + const bool isOutOfPhase = (logPout >= minLogP) and ((logPout - logPin) >= minLogP); + if(not (isInPhase or isOutOfPhase)) { + if(debug) { + cout << "Ambiguous, don't detangle." 
<< endl; + } + return false; + } + + } else { + + // Not using the Bayesian model. + // Count the number of significant, ambiguous, and negligible elements + // in the tangle matrix. + uint64_t significantCount = 0; + uint64_t ambiguousCount = 0; + uint64_t negligibleCount = 0; + for(uint64_t i0=0; i0<inChains.size(); i0++) { + for(uint64_t i1=0; i1<outChains.size(); i1++) { + const uint64_t t = tangleMatrix[i0][i1]; + if(t <= detangleToleranceLow) { + ++negligibleCount; + } else if(t >= detangleToleranceHigh) { + ++significantCount; + } else { + ++ambiguousCount; + } + } + } + + // If the tangle matrix contains any ambiguous elements, do nothing. + if(ambiguousCount > 0) { + if(debug) { + cout << "Tangle matrix is ambiguous." << endl; + } + return false; + } + // There are no ambiguous elements. + // If there are no negligible element, that is all elements of the tangle matrix are significant, + // there is nothing to do. + if(negligibleCount == 0) { + return false; + } + + // To avoid breaking contiguity, we require each column and each row of the + // tangle matrix to have at least one significant element. + // This means that each in-edge will be "merged" with at least one out-edge, + // and each out-edge will be "merged" with at least one in-edge. + for(uint64_t i0=0; i0<inChains.size(); i0++) { + bool foundSignificant = false; + for(uint64_t i1=0; i1<outChains.size(); i1++) { + if(tangleMatrix[i0][i1] >= detangleToleranceHigh) { + foundSignificant = true; + break; + } + } + if(not foundSignificant) { + return false; + } + } + for(uint64_t i1=0; i1<outChains.size(); i1++) { + bool foundSignificant = false; + for(uint64_t i0=0; i0<inChains.size(); i0++) { + if(tangleMatrix[i0][i1] >= detangleToleranceHigh) { + foundSignificant = true; + break; + } + } + if(not foundSignificant) { + return false; + } + } + } + + if(debug) { + cout << "This vertex can be detangled after some splitting of bubble chains." 
<< endl; + } + + + // Make sure the last bubble of all in-edges is haploid. + in_edge_iterator itIn, itInEnd; + tie(itIn, itInEnd) = in_edges(cv, cGraph); + while(itIn != itInEnd) { + const edge_descriptor ce = *itIn; + ++itIn; // Increment before possibly removing this edge! + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.lastBubble().isHaploid()) { + if(debug) { + cout << "In-edge " << bubbleChainStringId(ce) << + " needs to be split at the end." << endl; + } + splitBubbleChainAtEnd(ce); + } + } + + // Make sure the first bubble of all out-edges is haploid. + out_edge_iterator itOut, itOutEnd; + tie(itOut, itOutEnd) = out_edges(cv, cGraph); + while(itOut != itOutEnd) { + const edge_descriptor ce = *itOut; + ++itOut; // Increment before possibly removing this edge! + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.firstBubble().isHaploid()) { + if(debug) { + cout << "Out-edge " << bubbleChainStringId(ce) << + " needs to be split at the beginning." << endl; + } + splitBubbleChainAtBeginning(ce); + } + } + + // Now we can detangle using detangleVertex. + if(debug) { + cout << "Calling detangleVertex." << endl; + } + return detangleVertex(cv, debug, detangleToleranceLow, detangleToleranceHigh, + useBayesianModel, epsilon, minLogP); +} + + + +// Split the first bubble of a bubble chain. +// Used by detangleVertexGeneral to eliminate +// non-haploid bubbles adjacent to a vertex to be detangled. +void AssemblyGraph::splitBubbleChainAtBeginning(edge_descriptor ce) +{ + AssemblyGraph& cGraph = *this; + + const BubbleChain& bubbleChain = cGraph[ce]; + const Bubble& firstBubble = bubbleChain.firstBubble(); + SHASTA_ASSERT(not firstBubble.isHaploid()); + + const vertex_descriptor cv0 = source(ce, cGraph); + const vertex_descriptor cv1 = target(ce, cGraph); + + + + // General case where the bubble chain has more than one bubble. 
+ // Generate a new edge containing the bubble chain except for + // the first bubble, plus one new edge for each chain in the firstBubble. + if(bubbleChain.size() > 1) { + + // Generate one new edge containing the bubble chain except for + // the first bubble. + AssemblyGraphEdge newEdge; + newEdge.id = nextEdgeId++; + copy(bubbleChain.begin() + 1, bubbleChain.end(), back_inserter(newEdge)); + const vertex_descriptor cv2 = createVertex(newEdge.front().front().front()); + boost::add_edge(cv2, cv1, newEdge, cGraph); + + // Generate a new edge for each chain in the firstBubble. + for(const Chain& chain: firstBubble) { + AssemblyGraphEdge newEdge; + newEdge.resize(1); // The new edge has only one bubble. + Bubble& newBubble = newEdge.front(); + newEdge.id = nextEdgeId++; + newBubble.push_back(chain); + boost::add_edge(cv0, cv2, newEdge, cGraph); + } + } + + + // Special case where the bubble chain has one bubble. + // We generate one new edge for each chain in the firstBubble. + else { + + // Generate a new edge for each chain in the firstBubble. + for(const Chain& chain: firstBubble) { + AssemblyGraphEdge newEdge; + newEdge.resize(1); // The new edge has only one bubble. + Bubble& newBubble = newEdge.front(); + newEdge.id = nextEdgeId++; + newBubble.push_back(chain); + boost::add_edge(cv0, cv1, newEdge, cGraph); + } + } + + // Now we can remove the original bubble chain. + boost::remove_edge(ce, cGraph); +} + + + +// Split the last bubble of a bubble chain. +// Used by detangleVertexGeneral to eliminate +// non-haploid bubbles adjacent to a vertex to be detangled. 
+void AssemblyGraph::splitBubbleChainAtEnd(edge_descriptor ce) +{ + AssemblyGraph& cGraph = *this; + + const BubbleChain& bubbleChain = cGraph[ce]; + const Bubble& lastBubble = bubbleChain.lastBubble(); + SHASTA_ASSERT(not lastBubble.isHaploid()); + + const vertex_descriptor cv0 = source(ce, cGraph); + const vertex_descriptor cv1 = target(ce, cGraph); + + + + // General case where the bubble chain has more than one bubble. + // Generate a new edge containing the bubble chain except for + // the last bubble, plus one new edge for each chain in the lastBubble. + if(bubbleChain.size() > 1) { + + // Generate one new edge containing the bubble chain except for + // the last bubble. + AssemblyGraphEdge newEdge; + newEdge.id = nextEdgeId++; + copy(bubbleChain.begin(), bubbleChain.end()-1, back_inserter(newEdge)); + const vertex_descriptor cv2 = createVertex(newEdge.back().front().back()); + boost::add_edge(cv0, cv2, newEdge, cGraph); + + // Generate a new edge for each chain in the lastBubble. + for(const Chain& chain: lastBubble) { + AssemblyGraphEdge newEdge; + newEdge.resize(1); // The new edge has only one bubble. + Bubble& newBubble = newEdge.front(); + newEdge.id = nextEdgeId++; + newBubble.push_back(chain); + boost::add_edge(cv2, cv1, newEdge, cGraph); + } + } + + + // Special case where the bubble chain has one bubble. + // We generate one new edge for each chain in the lastBubble. + else { + + // Generate a new edge for each chain in the lastBubble. + for(const Chain& chain: lastBubble) { + AssemblyGraphEdge newEdge; + newEdge.resize(1); // The new edge has only one bubble. + Bubble& newBubble = newEdge.front(); + newEdge.id = nextEdgeId++; + newBubble.push_back(chain); + boost::add_edge(cv0, cv1, newEdge, cGraph); + } + } + + // Now we can remove the original bubble chain. 
+ boost::remove_edge(ce, cGraph); +} + + + +bool AssemblyGraph::detangleEdges( + bool debug, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP) +{ + if(debug) { + cout << "Detangling edges." << endl; + } + + AssemblyGraph& cGraph = *this; + + // To safely iterate over edges while removing edges we must use edge ids + // as unique identifiers, because edge descriptors can be reused as edges are + // deleted ndw new edges are created. + std::map<uint64_t, edge_descriptor> edgeMap; + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + edgeMap.insert({cGraph[ce].id, ce}); + } + + uint64_t detangleCount = 0;; + for(auto it=edgeMap.begin(); it!=edgeMap.end(); /* Incremented safely by detangleEdgeStrict */) { + if(detangleEdge(debug, edgeMap, it, detangleToleranceLow, detangleToleranceHigh, + useBayesianModel, epsilon, minLogP)) { + ++detangleCount; + } + } + + if(debug) { + cout << "Detangled " << detangleCount << " edges." << endl; + } + + return detangleCount > 0; +} + + + +bool AssemblyGraph::detangleEdgesGeneral( + bool debug, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP) +{ + if(debug) { + cout << "Detangling edges." << endl; + } + + AssemblyGraph& cGraph = *this; + + // To safely iterate over edges while removing edges we must use edge ids + // as unique identifiers, because edge descriptors can be reused as edges are + // deleted ndw new edges are created. 
+ std::map<uint64_t, edge_descriptor> edgeMap; + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + edgeMap.insert({cGraph[ce].id, ce}); + } + + uint64_t detangleCount = 0;; + for(auto it=edgeMap.begin(); it!=edgeMap.end(); /* Incremented safely by detangleEdgeStrict */) { + if(detangleEdgeGeneral(debug, edgeMap, it, detangleToleranceLow, detangleToleranceHigh, + useBayesianModel, epsilon, minLogP)) { + ++detangleCount; + } + } + + if(debug) { + cout << "Detangled " << detangleCount << " edges." << endl; + } + + return detangleCount > 0; +} + + + +bool AssemblyGraph::detangleEdge( + bool debug, + std::map<uint64_t, edge_descriptor>& edgeMap, + std::map<uint64_t, edge_descriptor>::iterator& it, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP) +{ + AssemblyGraph& cGraph = *this; + const edge_descriptor ce = it->second; + ++it; + // edgeMap.erase(cGraph[ce].id); + + // Only try detangling if the edge consists of a single haploid bubble. + // Otherwise detangling would lose information. + BubbleChain& bubbleChain = cGraph[ce]; + if(bubbleChain.size() > 1) { + return false; + } + if(bubbleChain.front().size() > 1) { + return false; + } + + // Tangle matrix elements <= detangleToleranceLow are treated as negigible. + // Tangle matrix elements >= detangleToleranceHigh are treated as significant. + // Tangle matrix elements in between are considered ambiguous. + SHASTA_ASSERT(detangleToleranceHigh > detangleToleranceLow); + + const vertex_descriptor cv0 = source(ce, cGraph); + const vertex_descriptor cv1 = target(ce, cGraph); + + if(out_degree(cv0, cGraph) != 1) { + return false; + } + if(in_degree(cv1, cGraph) != 1) { + return false; + } + + if(debug) { + cout << "Attempting to detangle edge " << bubbleChainStringId(ce) << endl; + } + + // Gather the in-edges and check that the last bubble is haploid. + // Ignore in-edges coming from cv1 (back-edges). 
+ vector<edge_descriptor> inEdges; + vector<edge_descriptor> backEdges; + BGL_FORALL_INEDGES(cv0, ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.lastBubble().isHaploid()) { + if(debug) { + cout << "Not detangling because the last bubble of in-edge " << + bubbleChainStringId(ce) << " is not haploid." << endl; + } + return false; + } + if(source(ce, cGraph) != cv1) { + inEdges.push_back(ce); + } else { + backEdges.push_back(ce); + } + } + + // Gather the out-edges and check that the first bubble is haploid. + // Ignore out-edges going to cv0 (back-edges). + vector<edge_descriptor> outEdges; + BGL_FORALL_OUTEDGES(cv1, ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.firstBubble().isHaploid()) { + if(debug) { + cout << "Not detangling because the first bubble of out-edge " << + bubbleChainStringId(ce) << " is not haploid." << endl; + } + return false; + } + if(target(ce, cGraph) != cv0) { + outEdges.push_back(ce); + } + } + + if(inEdges.size() == 0 or outEdges.size() == 0) { + if(debug) { + cout << "Not detangling due to degree (case 1)." << endl; + } + return false; + } + if(inEdges.size() < 2 and outEdges.size() < 2) { + if(debug) { + cout << "Not detangling due to degree (case 2)." << endl; + } + return false; + } + if(inEdges.size() != outEdges.size()) { + if(debug) { + cout << "Not detangling due to degree (case 3)." << endl; + } + return false; + } + + + + // If a MarkerGraphEdgeId appears both in the inEdges and in the outEdges, + // detangling could generate a chain with two consecutive copies of the same + // MarkerGraphEdgeId. Don't detangle. 
+ for(const edge_descriptor ce0: inEdges) { + const BubbleChain& bubbleChain0 = cGraph[ce0]; + const Bubble& bubble0 = bubbleChain0.lastBubble(); + SHASTA_ASSERT(bubble0.isHaploid()); + const Chain& chain0 = bubble0.front(); + SHASTA_ASSERT(chain0.size() >= 2); + const MarkerGraphEdgeId markerGraphEdgeId0 = chain0[chain0.size() - 2]; // Exclude last + + for(const edge_descriptor ce1: outEdges) { + const BubbleChain& bubbleChain1 = cGraph[ce1]; + const Bubble& bubble1 = bubbleChain1.firstBubble(); + SHASTA_ASSERT(bubble1.isHaploid()); + const Chain& chain1 = bubble1.front(); + SHASTA_ASSERT(chain1.size() >= 2); + const MarkerGraphEdgeId markerGraphEdgeId1 = chain1[1]; // Exclude first + + if(markerGraphEdgeId0 == markerGraphEdgeId1) { + if(debug) { + cout << "Not detangling due to cycle." << endl; + } + return false; + } + } + } + + + + // Compute the tangle matrix. + vector< vector<uint64_t> > tangleMatrix; + computeTangleMatrix(inEdges, outEdges, tangleMatrix, false); + + if(debug) { + cout << "Computing tangle matrix for edge " << bubbleChainStringId(ce) << endl; + + cout << "In-edges: "; + for(const edge_descriptor ce: inEdges) { + cout << " " << bubbleChainStringId(ce); + } + cout << endl; + + cout << "Out-edges: "; + for(const edge_descriptor ce: outEdges) { + cout << " " << bubbleChainStringId(ce); + } + cout << endl; + + cout << "Tangle matrix:" << endl; + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + const edge_descriptor ce0 = inEdges[i0]; + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + const edge_descriptor ce1 = outEdges[i1]; + cout << + bubbleChainStringId(ce0) << " " << + bubbleChainStringId(ce1) << " " << + tangleMatrix[i0][i1]; + if(tangleMatrix[i0][i1] == 0) { + cout << " zero tangle matrix element"; + } + cout << endl; + } + } + } + + + + // Detangle based on the contents of the tangle matrix. + if(useBayesianModel and inEdges.size() == 2 and outEdges.size() == 2) { + + // Use the 2 by 2 Bayesian model for detangling. 
+ array< array<uint64_t, 2>, 2> tangleMatrix22; + for(uint64_t i=0; i<2; i++) { + for(uint64_t j=0; j<2; j++) { + tangleMatrix22[i][j] = tangleMatrix[i][j]; + } + } + + // Compute logarithmic probability ratio of in-phase and out-of-phase + // against random. + double logPin; + double logPout; + tie(logPin, logPout) = diploidBayesianPhase(tangleMatrix22, epsilon); + if(debug) { + cout << "logPin = " << logPin << ", logPout = " << logPout << endl; + } + + // const bool isInPhase = (logPin >= minLogP) and ((logPin - logPout) >= minLogP); + // const bool isOutOfPhase = (logPout >= minLogP) and ((logPout - logPin) >= minLogP); + // Ignore the random hypothesis. + const bool isInPhase = (logPin - logPout) >= minLogP; + const bool isOutOfPhase = (logPout - logPin) >= minLogP; + + if(isInPhase or isOutOfPhase) { + + // We can detangle. + + // Create truncated versions of the inEdges and outEdges. + vector<vertex_descriptor> inVertices; + for(const edge_descriptor ce: inEdges) { + inVertices.push_back(cloneAndTruncateAtEnd(ce)); + } + vector<vertex_descriptor> outVertices; + for(const edge_descriptor ce: outEdges) { + outVertices.push_back(cloneAndTruncateAtBeginning(ce)); + } + + if(isInPhase) { + const edge_descriptor e0 = connect(inVertices[0], outVertices[0]); + const edge_descriptor e1 = connect(inVertices[1], outVertices[1]); + if(debug) { + cout << "In phase: created " << bubbleChainStringId(e0) << " and " << + bubbleChainStringId(e1) << endl; + } + } else { + const edge_descriptor e0 = connect(inVertices[0], outVertices[1]); + const edge_descriptor e1 = connect(inVertices[1], outVertices[0]); + if(debug) { + cout << "Out of phase phase: created " << bubbleChainStringId(e0) << " and " << + bubbleChainStringId(e1) << endl; + } + } + + } else { + + // Ambiguous. Don't detangle. + if(debug) { + cout << "Ambiguous. NOt detangling." << endl; + } + return false; + } + + } else { + + + + // We are not using the Bayesian model. 
+ + // Count the number of significant, ambiguous, and negligible elements + // in the tangle matrix. + uint64_t significantCount = 0; + uint64_t ambiguousCount = 0; + uint64_t negligibleCount = 0; + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + const uint64_t t = tangleMatrix[i0][i1]; + if(t <= detangleToleranceLow) { + ++negligibleCount; + } else if(t >= detangleToleranceHigh) { + ++significantCount; + } else { + ++ambiguousCount; + } + } + } + + // If the tangle matrix contains any ambiguous elements, do nothing. + if(ambiguousCount > 0) { + return false; + } + + // There are no ambiguous elements. + // If there are no negligible element, that is all elements of the tangle matrix are significant, + // there is nothing to do. + if(negligibleCount == 0) { + return false; + } + + // To avoid breaking contiguity, we require each column and each row of the + // tangle matrix to have at least one significant element. + // This means that each in-edge will be "merged" with at least one out-edge, + // and each out-edge will be "merged" with at least one in-edge. + // ACTUALY, FOR MORE ROBUSTNESS REQUIRE EXACTLY OEN SIGNIFICANT ELEMENT PER ROW AND COLUMN. + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + uint64_t significantCount = 0; + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + if(tangleMatrix[i0][i1] >= detangleToleranceHigh) { + ++significantCount; + } + } + if(significantCount != 1) { + return false; + } + } + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + uint64_t significantCount = 0; + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + if(tangleMatrix[i0][i1] >= detangleToleranceHigh) { + ++significantCount; + } + } + if(significantCount != 1) { + return false; + } + } + + #if 0 + // In an in-edge is also an out-edge, don't detangle. 
+ for(const edge_descriptor ce: inEdges) { + if(find(outEdges.begin(), outEdges.end(), ce) != outEdges.end()) { + if(debug) { + cout << "Not degangled because an in-edge is also an out-edge." << endl; + } + return false; + } + } + #endif + + if(debug) { + cout << "This edge will be detangled " << inEdges.size() << " by " << outEdges.size() << endl; + } + + // Create truncated versions of the inEdges and outEdges. + vector<vertex_descriptor> inVertices; + for(const edge_descriptor ce: inEdges) { + inVertices.push_back(cloneAndTruncateAtEnd(ce)); + } + vector<vertex_descriptor> outVertices; + for(const edge_descriptor ce: outEdges) { + outVertices.push_back(cloneAndTruncateAtBeginning(ce)); + } + + + // Each significant element of the tangle matrix generates a new edge. + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + if(tangleMatrix[i0][i1] >= detangleToleranceHigh) { + const edge_descriptor ceNew = connect(inVertices[i0], outVertices[i1]); + if(debug) { + cout << "Created " << bubbleChainStringId(ceNew) << endl; + } + } + } + } + } + + + // Now we can remove cv0, cv1, ce, and all of the in-edges and out-edges. + // We have to do this while safely incrementing the edge iterator to point to the + // next edge that was not removed. + // We already incremented the iterator to point past ce. 
+ boost::remove_edge(ce, cGraph); + for(const edge_descriptor ce: inEdges) { + if(it != edgeMap.end() and cGraph[ce].id == it->first) { + ++it; + } + edgeMap.erase(cGraph[ce].id); + boost::remove_edge(ce, cGraph); + } + for(const edge_descriptor ce: outEdges) { + if(it != edgeMap.end() and cGraph[ce].id == it->first) { + ++it; + } + edgeMap.erase(cGraph[ce].id); + boost::remove_edge(ce, cGraph); + } + for(const edge_descriptor ce: backEdges) { + if(it != edgeMap.end() and cGraph[ce].id == it->first) { + ++it; + } + edgeMap.erase(cGraph[ce].id); + boost::remove_edge(ce, cGraph); + } + cGraph.removeVertex(cv0); + cGraph.removeVertex(cv1); + + return true; +} + + + +bool AssemblyGraph::detangleEdgeGeneral( + bool debug, + std::map<uint64_t, edge_descriptor>& edgeMap, + std::map<uint64_t, edge_descriptor>::iterator& it, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP) +{ + // detangleEdgeGeneral does not have code to use the Bayesian model + // for the 2 by 2 case. See detangleEdge. + SHASTA_ASSERT(not useBayesianModel); + + AssemblyGraph& cGraph = *this; + const edge_descriptor ce = it->second; + ++it; + + // Tangle matrix elements <= detangleToleranceLow are treated as negigible. + // Tangle matrix elements >= detangleToleranceHigh are treated as significant. + // Tangle matrix elements in between are considered ambiguous. 
+ SHASTA_ASSERT(detangleToleranceHigh > detangleToleranceLow); + + const vertex_descriptor cv0 = source(ce, cGraph); + const vertex_descriptor cv1 = target(ce, cGraph); + + if(out_degree(cv0, cGraph) != 1) { + return false; + } + if(in_degree(cv1, cGraph) != 1) { + return false; + } + + if(debug) { + cout << "Attempting general detangling of edge " << bubbleChainStringId(ce) << endl; + } + + class ChainInfo { + public: + edge_descriptor ce; + uint64_t indexInBubble; + MarkerGraphEdgeId edgeId; + }; + vector<ChainInfo> inChains; + BGL_FORALL_INEDGES(cv0, ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + const Bubble& bubble = bubbleChain.lastBubble(); + for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) { + const Chain& chain = bubble[indexInBubble]; + inChains.push_back({ce, indexInBubble, chain[chain.size() - 2]}); + } + } + vector<ChainInfo> outChains; + BGL_FORALL_OUTEDGES(cv1, ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + const Bubble& bubble = bubbleChain.firstBubble(); + for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) { + const Chain& chain = bubble[indexInBubble]; + outChains.push_back({ce, indexInBubble, chain[1]}); + } + } + + if(debug) { + + cout << "In:" << endl; + for(const ChainInfo& chainInfo: inChains) { + cout << bubbleChainStringId(chainInfo.ce) << " " << + chainInfo.indexInBubble << " " << + chainInfo.edgeId << endl; + } + + cout << "Out:" << endl; + for(const ChainInfo& chainInfo: outChains) { + cout << bubbleChainStringId(chainInfo.ce) << " " << + chainInfo.indexInBubble << " " << + chainInfo.edgeId << endl; + } + } + + if(inChains.size() != outChains.size()) { + if(debug) { + cout << "Not detangling due to degree." << endl; + } + return false; + } + + + // Compute a generalized tangle matrix. 
+ vector<vector<uint64_t> > tangleMatrix(inChains.size(), vector<uint64_t>(outChains.size())); + for(uint64_t i0=0; i0<inChains.size(); i0++) { + const MarkerGraphEdgeId markerGraphEdgeId0 = inChains[i0].edgeId; + + for(uint64_t i1=0; i1<outChains.size(); i1++) { + const MarkerGraphEdgeId markerGraphEdgeId1 = outChains[i1].edgeId; + + MarkerGraphEdgePairInfo info; + SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(markerGraphEdgeId0, markerGraphEdgeId1, info)); + tangleMatrix[i0][i1] = info.common; + } + } + + if(debug) { + cout << "Tangle matrix:" << endl; + for(uint64_t i0=0; i0<inChains.size(); i0++) { + const ChainInfo& chainInfo0 = inChains[i0]; + for(uint64_t i1=0; i1<outChains.size(); i1++) { + const ChainInfo& chainInfo1 = outChains[i1]; + + cout << + bubbleChainStringId(chainInfo0.ce) << " " << + chainInfo0.indexInBubble << " " << + chainInfo0.edgeId << " " << + bubbleChainStringId(chainInfo1.ce) << " " << + chainInfo1.indexInBubble << " " << + chainInfo1.edgeId << " " << + tangleMatrix[i0][i1] << endl; + } + } + + } + + // Count the number of significant, ambiguous, and negligible elements + // in the tangle matrix. + uint64_t significantCount = 0; + uint64_t ambiguousCount = 0; + uint64_t negligibleCount = 0; + for(uint64_t i0=0; i0<inChains.size(); i0++) { + for(uint64_t i1=0; i1<outChains.size(); i1++) { + const uint64_t t = tangleMatrix[i0][i1]; + if(t <= detangleToleranceLow) { + ++negligibleCount; + } else if(t >= detangleToleranceHigh) { + ++significantCount; + } else { + ++ambiguousCount; + } + } + } + + // If the tangle matrix contains any ambiguous elements, do nothing. + if(ambiguousCount > 0) { + return false; + } + + // There are no ambiguous elements. + // If there are no negligible element, that is all elements of the tangle matrix are significant, + // there is nothing to do. 
+ if(negligibleCount == 0) { + return false; + } + + // To avoid breaking contiguity, we require each column and each row of the + // tangle matrix to have at least one significant element. + // This means that each in-edge will be "merged" with at least one out-edge, + // and each out-edge will be "merged" with at least one in-edge. + for(uint64_t i0=0; i0<inChains.size(); i0++) { + bool foundSignificant = false; + for(uint64_t i1=0; i1<outChains.size(); i1++) { + if(tangleMatrix[i0][i1] >= detangleToleranceHigh) { + foundSignificant = true; + break; + } + } + if(not foundSignificant) { + return false; + } + } + for(uint64_t i1=0; i1<outChains.size(); i1++) { + bool foundSignificant = false; + for(uint64_t i0=0; i0<inChains.size(); i0++) { + if(tangleMatrix[i0][i1] >= detangleToleranceHigh) { + foundSignificant = true; + break; + } + } + if(not foundSignificant) { + return false; + } + } + + if(debug) { + cout << "This edge can be detangled after some splitting of bubble chains." << endl; + } + + // Make sure the last bubble of all in-edges is haploid. + in_edge_iterator itIn, itInEnd; + tie(itIn, itInEnd) = in_edges(cv0, cGraph); + while(itIn != itInEnd) { + const edge_descriptor ce = *itIn; + ++itIn; // Increment before possibly removing this edge! + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.lastBubble().isHaploid()) { + if(debug) { + cout << "In-edge " << bubbleChainStringId(ce) << + " needs to be split at the end." << endl; + } + splitBubbleChainAtEnd(ce); + } + } + + // Make sure the first bubble of all out-edges is haploid. + out_edge_iterator itOut, itOutEnd; + tie(itOut, itOutEnd) = out_edges(cv1, cGraph); + while(itOut != itOutEnd) { + const edge_descriptor ce = *itOut; + ++itOut; // Increment before possibly removing this edge! + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.firstBubble().isHaploid()) { + if(debug) { + cout << "Out-edge " << bubbleChainStringId(ce) << + " needs to be split at the beginning." 
<< endl; + } + splitBubbleChainAtBeginning(ce); + } + } + + // Now we can detangle using detangleEdge. + if(debug) { + cout << "Calling detangleEdge." << endl; + } + --it; // Because detangleEdge increments it again. + return detangleEdge(debug, edgeMap, it, detangleToleranceLow, detangleToleranceHigh, + useBayesianModel, epsilon, minLogP); +} + + +#if 0 +bool AssemblyGraph::detangleEdgesWithSearch( + bool debug, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh) +{ + if(debug) { + cout << "Detangling edges with search." << endl; + } + + AssemblyGraph& cGraph = *this; + + // To safely iterate over edges while removing edges we must use edge ids + // as unique identifiers, because edge descriptors can be reused as edges are + // deleted ndw new edges are created. + std::map<uint64_t, edge_descriptor> edgeMap; + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + edgeMap.insert({cGraph[ce].id, ce}); + } + + uint64_t detangleCount = 0;; + for(auto it=edgeMap.begin(); it!=edgeMap.end(); /* Incremented safely by detangleEdgeStrict */) { + if(detangleEdgeWithSearch(debug, edgeMap, it, detangleToleranceLow, detangleToleranceHigh)) { + ++detangleCount; + } + } + + if(debug) { + cout << "Detangled " << detangleCount << " edges." << endl; + } + + return detangleCount > 0; +} + + + +bool AssemblyGraph::detangleEdgeWithSearch( + bool debug, + std::map<uint64_t, edge_descriptor>& edgeMap, + std::map<uint64_t, edge_descriptor>::iterator& it, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh) +{ + AssemblyGraph& cGraph = *this; + const edge_descriptor ce = it->second; + ++it; + + // Only try detangling if the edge consists of a single haploid bubble. + // Otherwise detangling would lose information. 
+ BubbleChain& bubbleChain = cGraph[ce]; + if(bubbleChain.size() > 1) { + return false; + } + if(bubbleChain.front().size() > 1) { + return false; + } + + + const vertex_descriptor cv0 = source(ce, cGraph); + const vertex_descriptor cv1 = target(ce, cGraph); + + if(out_degree(cv0, cGraph) != 1) { + return false; + } + if(in_degree(cv1, cGraph) != 1) { + return false; + } + + if(debug) { + cout << "Attempting to detangle edge " << bubbleChainStringId(ce) << " with search." << endl; + } + + // Gather the in-edges and check that the last bubble is haploid. + vector<edge_descriptor> inEdges; + vector<edge_descriptor> backEdges; + BGL_FORALL_INEDGES(cv0, ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.lastBubble().isHaploid()) { + if(debug) { + cout << "Not detangling because the last bubble of in-edge " << + bubbleChainStringId(ce) << " is not haploid." << endl; + } + return false; + } + if(source(ce, cGraph) != cv1) { + inEdges.push_back(ce); + } else { + backEdges.push_back(ce); + } + } + + // Gather the out-edges and check that the first bubble is haploid. + // Ignore out-edges going to cv0 (back-edges). + vector<edge_descriptor> outEdges; + BGL_FORALL_OUTEDGES(cv1, ce, cGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.firstBubble().isHaploid()) { + if(debug) { + cout << "Not detangling because the first bubble of out-edge " << + bubbleChainStringId(ce) << " is not haploid." << endl; + } + return false; + } + if(target(ce, cGraph) != cv0) { + outEdges.push_back(ce); + } + } + + if(inEdges.size() == 0 or outEdges.size() == 0) { + if(debug) { + cout << "Not detangling due to degree (case 1)." << endl; + } + return false; + } + if(inEdges.size() != 2 and outEdges.size() != 2) { + if(debug) { + cout << "Not detangling due to degree (case 2)." << endl; + } + return false; + } + if(inEdges.size() != outEdges.size()) { + if(debug) { + cout << "Not detangling due to degree (case 3)." 
<< endl; + } + return false; + } + + + // Get the second to last MarkerGraphEdgeIds of the incoming chains. + array<MarkerGraphEdgeId, 2> in; + for(uint64_t i=0; i<2; i++) { + const Chain& chain = cGraph[inEdges[i]].back().front(); + in[i] = chain.secondToLast(); + } + + // Get the second MarkerGraphEdgeIds of the outgoing chains. + array<MarkerGraphEdgeId, 2> out; + for(uint64_t i=0; i<2; i++) { + const Chain& chain = cGraph[outEdges[i]].front().front(); + out[i] = chain.second(); + } + if(debug) { + cout << "in " << bubbleChainStringId(inEdges[0]) << " " << bubbleChainStringId(inEdges[1]) << endl; + cout << "out " << bubbleChainStringId(outEdges[0]) << " " << bubbleChainStringId(outEdges[1]) << endl; + cout << "in " << in[0] << " " << in[1] << endl; + cout << "out " << out[0] << " " << out[1] << endl; + } + + array<array<vector<MarkerGraphEdgeId>, 2>, 2> detanglingCandidates; + GlobalPathGraph::searchForDetangling( + in, out, + detangleToleranceHigh, detangleToleranceLow, + assembler, detanglingCandidates); + for(uint64_t i0=0; i0<2; i0++) { + for(uint64_t i1=0; i1<2; i1++) { + const auto& hits = detanglingCandidates[i0][i1]; + cout << "Found " << hits.size() << " hits for " << i0 << " " << i1 << ":" << endl; + if(not hits.empty()) { + copy(hits.begin(), hits.end(), ostream_iterator<MarkerGraphEdgeId>(cout, " ")); + cout << endl; + } + } + } + + return false; + +#if 0 + // Compute the tangle matrix. 
+ vector< vector<uint64_t> > tangleMatrix; + computeTangleMatrix(inEdges, outEdges, tangleMatrix, false); + + if(debug) { + cout << "Computing tangle matrix for edge " << bubbleChainStringId(ce) << endl; + + cout << "In-edges: "; + for(const edge_descriptor ce: inEdges) { + cout << " " << bubbleChainStringId(ce); + } + cout << endl; + + cout << "Out-edges: "; + for(const edge_descriptor ce: outEdges) { + cout << " " << bubbleChainStringId(ce); + } + cout << endl; + + cout << "Tangle matrix:" << endl; + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + const edge_descriptor ce0 = inEdges[i0]; + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + const edge_descriptor ce1 = outEdges[i1]; + cout << + bubbleChainStringId(ce0) << " " << + bubbleChainStringId(ce1) << " " << + tangleMatrix[i0][i1]; + if(tangleMatrix[i0][i1] == 0) { + cout << " zero tangle matrix element"; + } + cout << endl; + } + } + } + + + + // Detangle based on the contents of the tangle matrix. + if(useBayesianModel and inEdges.size() == 2 and outEdges.size() == 2) { + + // Use the 2 by 2 Bayesian model for detangling. + array< array<uint64_t, 2>, 2> tangleMatrix22; + for(uint64_t i=0; i<2; i++) { + for(uint64_t j=0; j<2; j++) { + tangleMatrix22[i][j] = tangleMatrix[i][j]; + } + } + + // Compute logarithmic probability ratio of in-phase and out-of-phase + // against random. + double logPin; + double logPout; + tie(logPin, logPout) = diploidBayesianPhase(tangleMatrix22, epsilon); + if(debug) { + cout << "logPin = " << logPin << ", logPout = " << logPout << endl; + } + + // const bool isInPhase = (logPin >= minLogP) and ((logPin - logPout) >= minLogP); + // const bool isOutOfPhase = (logPout >= minLogP) and ((logPout - logPin) >= minLogP); + // Ignore the random hypothesis. + const bool isInPhase = (logPin - logPout) >= minLogP; + const bool isOutOfPhase = (logPout - logPin) >= minLogP; + + if(isInPhase or isOutOfPhase) { + + // We can detangle. + + // Create truncated versions of the inEdges and outEdges. 
+ vector<vertex_descriptor> inVertices; + for(const edge_descriptor ce: inEdges) { + inVertices.push_back(cloneAndTruncateAtEnd(ce)); + } + vector<vertex_descriptor> outVertices; + for(const edge_descriptor ce: outEdges) { + outVertices.push_back(cloneAndTruncateAtBeginning(ce)); + } + + if(isInPhase) { + const edge_descriptor e0 = connect(inVertices[0], outVertices[0]); + const edge_descriptor e1 = connect(inVertices[1], outVertices[1]); + if(debug) { + cout << "In phase: created " << bubbleChainStringId(e0) << " and " << + bubbleChainStringId(e1) << endl; + } + } else { + const edge_descriptor e0 = connect(inVertices[0], outVertices[1]); + const edge_descriptor e1 = connect(inVertices[1], outVertices[0]); + if(debug) { + cout << "Out of phase phase: created " << bubbleChainStringId(e0) << " and " << + bubbleChainStringId(e1) << endl; + } + } + + } else { + + // Ambiguous. Don't detangle. + if(debug) { + cout << "Ambiguous. NOt detangling." << endl; + } + return false; + } + + } else { + + + + // We are not using the Bayesian model. + + // Count the number of significant, ambiguous, and negligible elements + // in the tangle matrix. + uint64_t significantCount = 0; + uint64_t ambiguousCount = 0; + uint64_t negligibleCount = 0; + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + const uint64_t t = tangleMatrix[i0][i1]; + if(t <= detangleToleranceLow) { + ++negligibleCount; + } else if(t >= detangleToleranceHigh) { + ++significantCount; + } else { + ++ambiguousCount; + } + } + } + + // If the tangle matrix contains any ambiguous elements, do nothing. + if(ambiguousCount > 0) { + return false; + } + + // There are no ambiguous elements. + // If there are no negligible element, that is all elements of the tangle matrix are significant, + // there is nothing to do. 
+ if(negligibleCount == 0) { + return false; + } + + // To avoid breaking contiguity, we require each column and each row of the + // tangle matrix to have at least one significant element. + // This means that each in-edge will be "merged" with at least one out-edge, + // and each out-edge will be "merged" with at least one in-edge. + // ACTUALY, FOR MORE ROBUSTNESS REQUIRE EXACTLY OEN SIGNIFICANT ELEMENT PER ROW AND COLUMN. + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + uint64_t significantCount = 0; + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + if(tangleMatrix[i0][i1] >= detangleToleranceHigh) { + ++significantCount; + } + } + if(significantCount != 1) { + return false; + } + } + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + uint64_t significantCount = 0; + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + if(tangleMatrix[i0][i1] >= detangleToleranceHigh) { + ++significantCount; + } + } + if(significantCount != 1) { + return false; + } + } + + #if 0 + // In an in-edge is also an out-edge, don't detangle. + for(const edge_descriptor ce: inEdges) { + if(find(outEdges.begin(), outEdges.end(), ce) != outEdges.end()) { + if(debug) { + cout << "Not degangled because an in-edge is also an out-edge." << endl; + } + return false; + } + } + #endif + + if(debug) { + cout << "This edge will be detangled " << inEdges.size() << " by " << outEdges.size() << endl; + } + + // Create truncated versions of the inEdges and outEdges. + vector<vertex_descriptor> inVertices; + for(const edge_descriptor ce: inEdges) { + inVertices.push_back(cloneAndTruncateAtEnd(ce)); + } + vector<vertex_descriptor> outVertices; + for(const edge_descriptor ce: outEdges) { + outVertices.push_back(cloneAndTruncateAtBeginning(ce)); + } + + + // Each significant element of the tangle matrix generates a new edge. 
+ for(uint64_t i0=0; i0<inEdges.size(); i0++) { + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + if(tangleMatrix[i0][i1] >= detangleToleranceHigh) { + const edge_descriptor ceNew = connect(inVertices[i0], outVertices[i1]); + if(debug) { + cout << "Created " << bubbleChainStringId(ceNew) << endl; + } + } + } + } + } + + + // Now we can remove cv0, cv1, ce, and all of the in-edges and out-edges. + // We have to do this while safely incrementing the edge iterator to point to the + // next edge that was not removed. + // We already incremented the iterator to point past ce. + boost::remove_edge(ce, cGraph); + for(const edge_descriptor ce: inEdges) { + if(it != edgeMap.end() and cGraph[ce].id == it->first) { + ++it; + } + edgeMap.erase(cGraph[ce].id); + boost::remove_edge(ce, cGraph); + } + for(const edge_descriptor ce: outEdges) { + if(it != edgeMap.end() and cGraph[ce].id == it->first) { + ++it; + } + edgeMap.erase(cGraph[ce].id); + boost::remove_edge(ce, cGraph); + } + for(const edge_descriptor ce: backEdges) { + if(it != edgeMap.end() and cGraph[ce].id == it->first) { + ++it; + } + edgeMap.erase(cGraph[ce].id); + boost::remove_edge(ce, cGraph); + } + cGraph.removeVertex(cv0); + cGraph.removeVertex(cv1); + + return true; +#endif +} +#endif + + +// Detangle short superbubbles with any number of entrances and exits. +bool AssemblyGraph::detangleShortSuperbubbles( + bool debug, + uint64_t maxOffset1, // Used to define superbubbles + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP) +{ + AssemblyGraph& cGraph = *this; + + // Find the superbubbles. + Superbubbles superbubbles(cGraph, maxOffset1); + + // Loop over the superbubbles. 
+ bool changesWereMade = false; + for(uint64_t superbubbleId=0; superbubbleId<superbubbles.size(); superbubbleId++) { + if(detangleShortSuperbubble(debug, + superbubbles, superbubbleId, detangleToleranceLow, detangleToleranceHigh, + useBayesianModel, epsilon, minLogP)) { + changesWereMade = true; + } + } + + return changesWereMade; +} + + + +bool AssemblyGraph::detangleShortSuperbubble( + bool debug, + const Superbubbles& superbubbles, + uint64_t superbubbleId, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP) +{ + AssemblyGraph& cGraph = *this; + const Superbubble& superbubble = superbubbles.getSuperbubble(superbubbleId); + + if(debug) { + cout << "Found a superbubble with " << superbubble.size() << + " vertices:"; + for(const vertex_descriptor cv: superbubble) { + cout << " " << cGraph[cv].edgeId; + } + cout << endl; + } + + // Fill in the in-edges and out-edges. + // These cannot be computed while constructing the superbubbles + // as they can change when other superbubbles are detangled. 
+ vector<edge_descriptor> inEdges; + vector<edge_descriptor> outEdges; + for(const vertex_descriptor cv0: superbubble) { + BGL_FORALL_INEDGES(cv0, ce, cGraph, AssemblyGraph) { + const vertex_descriptor cv1 = source(ce, cGraph); + if(not superbubbles.isInSuperbubble(superbubbleId, cv1)) { + inEdges.push_back(ce); + } + } + BGL_FORALL_OUTEDGES(cv0, ce, cGraph, AssemblyGraph) { + const vertex_descriptor cv1 = target(ce, cGraph); + if(not superbubbles.isInSuperbubble(superbubbleId, cv1)) { + outEdges.push_back(ce); + } + } + } + const uint64_t inDegree = inEdges.size(); + const uint64_t outDegree = outEdges.size(); + + if(debug) { + cout << inDegree << " in-edges:"; + for(const edge_descriptor ce: inEdges) { + cout << " " << bubbleChainStringId(ce); + } + cout << endl; + cout << outDegree << " out-edges:"; + for(const edge_descriptor ce: outEdges) { + cout << " " << bubbleChainStringId(ce); + } + cout << endl; + } + + if(inDegree == 0 or outDegree == 0) { + if(debug) { + cout << "Not detangling due to degree (case 1)." << endl; + } + return false; + } + +#if 0 + // Skip this check. We still want to remove the superbubble if possible. + if(inDegree < 2 and outDegree < 2) { + if(debug) { + cout << "Not detangling due to degree (case 2)." << endl; + } + return false; + } +#endif + + // This requires the last bubble of each in-edge + // and the first bubble of each out-edge to be haploid. + bool canDo = true; + for(const edge_descriptor ce: inEdges) { + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.lastBubble().isHaploid()) { + if(debug) { + cout << "Not detangling because the last bubble of in-edge " << + bubbleChainStringId(ce) << " is not haploid." 
<< endl; + } + canDo = false; + break; + } + } + for(const edge_descriptor ce: outEdges) { + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.firstBubble().isHaploid()) { + if(debug) { + cout << "Not detangling because the first bubble of out-edge " << + bubbleChainStringId(ce) << " is not haploid." << endl; + } + canDo = false; + break; + } + } + if(not canDo) { + return false; + } + + + + // If a MarkerGraphEdgeId appears both in the inEdges and in the outEdges, + // detangling could generate a chain with two consecutive copies of the same + // MarkerGraphEdgeId. Don't detangle. + for(const edge_descriptor ce0: inEdges) { + const BubbleChain& bubbleChain0 = cGraph[ce0]; + const Bubble& bubble0 = bubbleChain0.lastBubble(); + SHASTA_ASSERT(bubble0.isHaploid()); + const Chain& chain0 = bubble0.front(); + SHASTA_ASSERT(chain0.size() >= 2); + const MarkerGraphEdgeId markerGraphEdgeId0 = chain0[chain0.size() - 2]; // Exclude last + + for(const edge_descriptor ce1: outEdges) { + const BubbleChain& bubbleChain1 = cGraph[ce1]; + const Bubble& bubble1 = bubbleChain1.firstBubble(); + SHASTA_ASSERT(bubble1.isHaploid()); + const Chain& chain1 = bubble1.front(); + SHASTA_ASSERT(chain1.size() >= 2); + const MarkerGraphEdgeId markerGraphEdgeId1 = chain1[1]; // Exclude first + + if(markerGraphEdgeId0 == markerGraphEdgeId1) { + if(debug) { + cout << "Not detangling due to cycle." << endl; + } + return false; + } + } + } + + + + // Compute the tangle matrix. 
+ vector< vector<uint64_t> > tangleMatrix; + computeTangleMatrix(inEdges, outEdges, tangleMatrix, true); + + if(debug) { + cout << "Tangle matrix:" << endl; + for(uint64_t i0=0; i0<inDegree; i0++) { + const edge_descriptor inEdge = inEdges[i0]; + for(uint64_t i1=0; i1<outDegree; i1++) { + const edge_descriptor outEdge = outEdges[i1]; + + cout << bubbleChainStringId(inEdge) << " " << + bubbleChainStringId(outEdge) << " " << tangleMatrix[i0][i1]; + + cout << endl; + } + } + } + + + + // Detangle based on the contents of the tangle matrix. + if(useBayesianModel and inEdges.size() == 2 and outEdges.size() == 2) { + + // Use the 2 by 2 Bayesian model for detangling. + array< array<uint64_t, 2>, 2> tangleMatrix22; + for(uint64_t i=0; i<2; i++) { + for(uint64_t j=0; j<2; j++) { + tangleMatrix22[i][j] = tangleMatrix[i][j]; + } + } + + // Compute logarithmic probability ratio of in-phase and out-of-phase + // against random. + double logPin; + double logPout; + tie(logPin, logPout) = diploidBayesianPhase(tangleMatrix22, epsilon); + if(debug) { + cout << "logPin = " << logPin << ", logPout = " << logPout << endl; + } + + // const bool isInPhase = (logPin >= minLogP) and ((logPin - logPout) >= minLogP); + // const bool isOutOfPhase = (logPout >= minLogP) and ((logPout - logPin) >= minLogP); + // Ignore the random hypothesis. + const bool isInPhase = (logPin - logPout) >= minLogP; + const bool isOutOfPhase = (logPout - logPin) >= minLogP; + + if(isInPhase or isOutOfPhase) { + + // We can detangle. + + // Create truncated versions of the inEdges and outEdges. 
+ vector<vertex_descriptor> inVertices; + for(const edge_descriptor ce: inEdges) { + inVertices.push_back(cloneAndTruncateAtEnd(ce)); + } + vector<vertex_descriptor> outVertices; + for(const edge_descriptor ce: outEdges) { + outVertices.push_back(cloneAndTruncateAtBeginning(ce)); + } + + if(isInPhase) { + connect(inVertices[0], outVertices[0]); + connect(inVertices[1], outVertices[1]); + } else { + connect(inVertices[0], outVertices[1]); + connect(inVertices[1], outVertices[0]); + } + + // Now we can remove all the vertices in the superbubble. + for(const vertex_descriptor cv: superbubble) { + clear_vertex(cv, cGraph); + remove_vertex(cv, cGraph); + } + + return true; + + } else { + + // Ambiguous. Don't detangle. + if(debug) { + cout << "Ambiguous. Not detangling." << endl; + } + return false; + } + } + + + + // If getting here, we are not using the Bayesian model. + + // Count the number of significant, ambiguous, and negligible elements + // in the tangle matrix. + uint64_t significantCount = 0; + uint64_t ambiguousCount = 0; + uint64_t negligibleCount = 0; + for(uint64_t i0=0; i0<inDegree; i0++) { + for(uint64_t i1=0; i1<outDegree; i1++) { + const uint64_t t = tangleMatrix[i0][i1]; + if(t <= detangleToleranceLow) { + ++negligibleCount; + } else if(t >= detangleToleranceHigh) { + ++significantCount; + } else { + ++ambiguousCount; + } + } + } + + // If the tangle matrix contains any ambiguous elements, do nothing. + if(ambiguousCount > 0) { + if(debug) { + cout << "Not detangled because the tangle matrix contains ambiguous elements." << endl; + } + return false; + } + +#if 0 + // (Skip this check - we still want to get rid of the superbubble in that case too!) + // There are no ambiguous elements. + // If there are no negligible element, that is all elements of the tangle matrix are significant, + // there is nothing to do. + if(negligibleCount == 0) { + if(debug) { + cout << "Not detangled because the tangle matrix contains no negligible elements." 
<< endl; + } + return false; + } +#endif + + // To avoid breaking contiguity, we require each column and each row of the + // tangle matrix to have at least one significant element. + // This means that each in-edge will be "merged" with at least one out-edge, + // and each out-edge will be "merged" with at least one in-edge. + bool ok = true; + for(uint64_t i0=0; i0<inDegree; i0++) { + bool foundSignificant = false; + for(uint64_t i1=0; i1<outDegree; i1++) { + if(tangleMatrix[i0][i1] >= detangleToleranceHigh) { + foundSignificant = true; + break; + } + } + if(not foundSignificant) { + ok = false; + break; + } + } + for(uint64_t i1=0; i1<outDegree; i1++) { + bool foundSignificant = false; + for(uint64_t i0=0; i0<inDegree; i0++) { + if(tangleMatrix[i0][i1] >= detangleToleranceHigh) { + foundSignificant = true; + break; + } + } + if(not foundSignificant) { + ok = false; + break; + } + } + if(not ok) { + if(debug) { + cout << "Not detangled to avoid breaking contiguity." << endl; + } + return false; + } + + if(debug) { + cout << "This superbubble will be detangled." << endl; + } + + // Create truncated versions of the inEdges and outEdges. + vector<vertex_descriptor> inVertices; + for(const edge_descriptor ce: inEdges) { + inVertices.push_back(cloneAndTruncateAtEnd(ce)); + } + vector<vertex_descriptor> outVertices; + for(const edge_descriptor ce: outEdges) { + outVertices.push_back(cloneAndTruncateAtBeginning(ce)); + } + + + + // Each significant element of the tangle matrix generates a new edge. + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + if(tangleMatrix[i0][i1] >= detangleToleranceHigh) { + connect(inVertices[i0], outVertices[i1]); + } + } + } + if(debug) { + cout << "After creating new edges, nextEdgeId is " << nextEdgeId << endl; + } + + +#if 0 + // Each significant element of the tangle matrix generates a new edge, + // obtained by "merging" an in-edge with an out-edge. 
+ for(uint64_t i0=0; i0<inEdges.size(); i0++) { + const edge_descriptor ce0 = inEdges[i0]; + const BubbleChain& bubbleChain0 = cGraph[ce0]; + const Bubble& bubble0 = bubbleChain0.lastBubble(); + SHASTA_ASSERT(bubble0.isHaploid()); + const Chain& chain0 = bubble0.front(); + SHASTA_ASSERT(chain0.size() >= 2); + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + if(tangleMatrix[i0][i1] < detangleToleranceHigh) { + continue; + } + const edge_descriptor ce1 = outEdges[i1]; + const BubbleChain& bubbleChain1 = cGraph[ce1]; + const Bubble& bubble1 = bubbleChain1.firstBubble(); + SHASTA_ASSERT(bubble1.isHaploid()); + const Chain& chain1 = bubble1.front(); + SHASTA_ASSERT(chain1.size() >= 2); + + edge_descriptor eNew; + tie(eNew, ignore) = add_edge(source(ce0, cGraph), target(ce1, cGraph), cGraph); + AssemblyGraphEdge& newEdge = cGraph[eNew]; + newEdge.id = nextEdgeId++; + BubbleChain& newBubbleChain = newEdge; + + if(debug) { + cout << "Merging " << + bubbleChainStringId(ce0) << " " << + bubbleChainStringId(ce1) << " into " << + bubbleChainStringId(eNew) << endl; + } + + // Create the new BubbleChain. It is obtained by joining + // bubbleChain0 and bubbleChain1, with vertex cv + // removed from the end of bubbleChain0 + // and from the beginning of bubbleChain1. + // Here we use the above assumption that + // the last bubble of bubbleChain0 and the first bubble of bubbleChain1 + // are haploid. + newBubbleChain = bubbleChain0; + + // Remove the last marker graph edge, which is in the superbubble. + Bubble& newBubbleLast = newBubbleChain.back(); + SHASTA_ASSERT(newBubbleLast.size() == 1); + Chain& newChainLast = newBubbleLast.front(); + newChainLast.resize(newChainLast.size() - 1); + + // Append chain1, except for the first marker graph edge, which is in the superbubble. + copy(chain1.begin() + 1, chain1.end(), back_inserter(newChainLast)); + + // Append the rest of bubbleChain1. 
+ copy(bubbleChain1.begin() + 1, bubbleChain1.end(), back_inserter(newBubbleChain)); + } + + } +#endif + + // Now we can remove all the vertices in the superbubble. + for(const vertex_descriptor cv: superbubble) { + clear_vertex(cv, cGraph); + remove_vertex(cv, cGraph); + } + + return true; +} + + + +// Special treatment to detangle back edges that were too long +// to be handled by detangleEdges. +// Returns true if at least one back edge was detangled. +bool AssemblyGraph::detangleBackEdges( + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh) +{ + cout << "Detangling back edges." << endl; + AssemblyGraph& cGraph = *this; + + // To safely iterate over edges while removing edges we must use edge ids + // as unique identifiers, because edge descriptors can be reused as edges are + // deleted and new edges are created. + std::map<uint64_t, edge_descriptor> edgeMap; + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + edgeMap.insert({cGraph[ce].id, ce}); + } + + uint64_t detangleCount = 0;; + for(auto it=edgeMap.begin(); it!=edgeMap.end(); /* Incremented safely by detangleBackEdge */) { + if(detangleBackEdge(edgeMap, it, detangleToleranceLow, detangleToleranceHigh)) { + ++detangleCount; + } + } + cout << "Detangled " << detangleCount << " back edges." << endl; + + return detangleCount > 0; + +} + + + +// Special treatment to detangle back edges that were too long +// to be handled by detangleEdge. +bool AssemblyGraph::detangleBackEdge( + std::map<uint64_t, edge_descriptor>& edgeMap, + std::map<uint64_t, edge_descriptor>::iterator& it, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh) +{ + AssemblyGraph& cGraph = *this; + const edge_descriptor ce = it->second; + ++it; + // edgeMap.erase(cGraph[ce].id); + + const bool debug = false; + + // Tangle matrix elements <= detangleToleranceLow are treated as negligible. + // Tangle matrix elements >= detangleToleranceHigh are treated as significant. + // Tangle matrix elements in between are considered ambiguous.
+ SHASTA_ASSERT(detangleToleranceHigh > detangleToleranceLow); + + const vertex_descriptor cv0 = source(ce, cGraph); + const vertex_descriptor cv1 = target(ce, cGraph); + + // Check the degrees. + if(out_degree(cv0, cGraph) != 1) { + return false; + } + if(in_degree(cv1, cGraph) != 1) { + return false; + } + + // Look for a back edge. + vector<edge_descriptor> backEdges; + BGL_FORALL_OUTEDGES(cv1, ce, cGraph, AssemblyGraph) { + if(target(ce, cGraph) == cv0) { + backEdges.push_back(ce); + } + } + if(backEdges.empty()) { + return false; + } + + // Only attempt to handle the case with a single back-edge. + if(backEdges.size() != 1) { + return false; + } + const edge_descriptor ceBack = backEdges.front(); + + if(debug) { + cout << "Attempting to detangle edge " << bubbleChainStringId(ce) << + " with back-edge " << bubbleChainStringId(ceBack) << endl; + } + + // The back-edge is both an in-edge and an out-edge. + // Store it at the first position of both inEdges and outEdges. + + // Gather the in-edges. + vector<edge_descriptor> inEdges(1, ceBack); + BGL_FORALL_INEDGES(cv0, ce, cGraph, AssemblyGraph) { + if(ce == ceBack) { + continue; + } + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.lastBubble().isHaploid()) { + if(debug) { + cout << "Not detangling because the last bubble of in-edge " << + bubbleChainStringId(ce) << " is not haploid." << endl; + } + return false; + } + inEdges.push_back(ce); + } + + // Gather the out-edges. + vector<edge_descriptor> outEdges(1, ceBack); + BGL_FORALL_OUTEDGES(cv1, ce, cGraph, AssemblyGraph) { + if(ce == ceBack) { + continue; + } + const BubbleChain& bubbleChain = cGraph[ce]; + if(not bubbleChain.firstBubble().isHaploid()) { + if(debug) { + cout << "Not detangling because the first bubble of out-edge " << + bubbleChainStringId(ce) << " is not haploid." << endl; + } + return false; + } + outEdges.push_back(ce); + } + + + if(debug) { + + // Position 0 of the inEdges and outEdges stores the back-edge. 
+ + cout << "In-edges: "; + for(uint64_t i=1; i<inEdges.size(); i++) { + const edge_descriptor ce = inEdges[i]; + cout << " " << bubbleChainStringId(ce); + } + cout << endl; + + cout << "Out-edges: "; + for(uint64_t i=1; i<outEdges.size(); i++) { + const edge_descriptor ce = outEdges[i]; + cout << " " << bubbleChainStringId(ce); + } + cout << endl; + } + // Compute the tangle matrix. + vector< vector<uint64_t> > tangleMatrix; + computeTangleMatrix(inEdges, outEdges, tangleMatrix, false); + + if(debug) { + cout << "Tangle matrix:" << endl; + for(uint64_t i0=0; i0<inEdges.size(); i0++) { + const edge_descriptor ce0 = inEdges[i0]; + for(uint64_t i1=0; i1<outEdges.size(); i1++) { + const edge_descriptor ce1 = outEdges[i1]; + cout << + bubbleChainStringId(ce0) << " " << + bubbleChainStringId(ce1) << " " << + tangleMatrix[i0][i1]; + cout << endl; + } + } + } + + return false; +} + + + +void AssemblyGraph::phaseBubbleChainsUsingPhasingGraph( + bool debug, + uint64_t n, // Maximum number of Chain MarkerGraphEdgeIds to use when computing tangle matrices. + uint64_t lowThreshold, + uint64_t highThreshold, + bool useBayesianModel, + double epsilon, + double minLogP, + uint64_t longBubbleThreshold) +{ + AssemblyGraph& cGraph = *this; + + if(debug) { + cout << "phaseBubbleChainsUsingPhasingGraph begins." << endl; + } + + vector<edge_descriptor> allEdges; + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + allEdges.push_back(ce); + } + + for(const edge_descriptor ce: allEdges) { + phaseBubbleChainUsingPhasingGraph(ce, n, lowThreshold, highThreshold, useBayesianModel, epsilon, minLogP, longBubbleThreshold, debug); + } + + if(debug) { + cout << "phaseBubbleChainsUsingPhasingGraph ends." 
<< endl; + } +} + + + +void AssemblyGraph::phaseBubbleChainsUsingPhasingTable( + const string& debugOutputFileNamePrefix, + double phaseErrorThreshold, + double bubbleErrorThreshold, + uint64_t longBubbleThreshold) +{ + AssemblyGraph& cGraph = *this; + + const bool debug = not debugOutputFileNamePrefix.empty(); + if(debug) { + cout << "phaseBubbleChainsUsingPhasingTable begins." << endl; + } + performanceLog << timestamp << "AssemblyGraph::phaseBubbleChainsUsingPhasingTable begins." << endl; + + // If debug output was requested, make sure we have a directory + // where the debug output files will go. + string directoryName; + if(debug) { + directoryName = debugOutputFileNamePrefix + "-PhasingTables"; + std::filesystem::create_directory(directoryName); + } + + vector<edge_descriptor> allEdges; + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + allEdges.push_back(ce); + } + + for(const edge_descriptor ce: allEdges) { + phaseBubbleChainUsingPhasingTable( + debug ? (directoryName + "/" + bubbleChainStringId(ce)) : "", + ce, phaseErrorThreshold, bubbleErrorThreshold, longBubbleThreshold); + } + + if(debug) { + cout << "phaseBubbleChainsUsingPhasingTable ends." << endl; + } + performanceLog << timestamp << "AssemblyGraph::phaseBubbleChainsUsingPhasingTable ends." << endl; + +} + + + +void AssemblyGraph::phaseBubbleChainUsingPhasingGraph( + edge_descriptor ce, + uint64_t n, // Maximum number of Chain MarkerGraphEdgeIds to use when computing tangle matrices. + uint64_t lowThreshold, + uint64_t highThreshold, + bool useBayesianModel, + double epsilon, + double minLogP, + uint64_t longBubbleThreshold, + bool debug) +{ + AssemblyGraph& cGraph = *this; + BubbleChain& bubbleChain = cGraph[ce]; + + // debug = debug and (cGraph[ce].id == 500048); + + if(debug) { + cout << "Phasing " << bubbleChainStringId(ce) << endl; + } + + const bool detailedDebug = debug; // (cGraph[ce].id == 49557); + + // If this bubble chain has a single bubble, there is nothing to do. 
+ if(bubbleChain.size() == 1) { + if(debug) { + cout << "Not phased because it has only one bubble." << endl; + } + return; + } + + // Table to contain the Phasing graph vertex corresponding to each diploid bubble. + // Indexed by the bubble position in the bubble chains, and contains + // PhasingGraph::null_vertex() for non-diploid bubbles. + vector<PhasingGraph::vertex_descriptor> vertexTable(bubbleChain.size(), PhasingGraph::null_vertex()); + + // Create the PhasingGraph and its vertices, one for + // each diploid bubble in the bubble chain. + PhasingGraph phasingGraph; + for(uint64_t i=0; i<bubbleChain.size(); i++) { + if(bubbleChain[i].isDiploid()) { + vertexTable[i] = add_vertex({i, 0}, phasingGraph); + } + } + + // Write a histogram of the bubbles in this bubble chain by ploidy. + if(debug) { + cout << "Phasing a bubble chain with " << bubbleChain.size() << " bubbles." << endl; + vector<uint64_t> histogram; + for(const Bubble& bubble: bubbleChain) { + const uint64_t ploidy = bubble.size(); + if(histogram.size() <= ploidy) { + histogram.resize(ploidy + 1); + } + ++histogram[ploidy]; + } + for(uint64_t ploidy=1; ploidy<histogram.size(); ploidy++) { + const uint64_t frequency = histogram[ploidy]; + if(frequency) { + cout << frequency << " bubbles of ploidy " << ploidy << endl; + } + } + } + +#if 0 + // If this bubble chain has less than two diploid bubbles, there is nothing to do. + uint64_t diploidBubblesCount = 0; + for(const Bubble& bubble: bubbleChain) { + if(bubble.size() == 2) { + ++diploidBubblesCount; + } + } + if(diploidBubblesCount < 2) { + if(debug) { + cout << "Not phased because it has less than 2 diploid bubbles." << endl; + } + return; + } +#endif + + // Add edges of the phasing graph. 
+ for(uint64_t i0=0; i0<bubbleChain.size()-1; i0++) { + const PhasingGraph::vertex_descriptor pv0 = vertexTable[i0]; + if(pv0 == PhasingGraph::null_vertex()) { + continue; + } + + // Gather the next-to-last marker graph edge of each of the two chains + // of this bubble. + const Bubble& bubble0 = bubbleChain[i0]; + SHASTA_ASSERT(bubble0.size() == 2); + const Chain& chain00 = bubble0[0]; + const Chain& chain01 = bubble0[1]; + const array<MarkerGraphEdgeId, 2> edges0 = + {chain00[chain00.size()-2], chain01[chain01.size()-2]}; + + for(uint64_t i1=i0+1; i1<bubbleChain.size(); i1++) { + const PhasingGraph::vertex_descriptor pv1 = vertexTable[i1]; + if(pv1 == PhasingGraph::null_vertex()) { + continue; + } + + // Gather the second marker graph edge of each of the two chains + // of this bubble. + const Bubble& bubble1 = bubbleChain[i1]; + SHASTA_ASSERT(bubble1.size() == 2); + const Chain& chain10 = bubble1[0]; + const Chain& chain11 = bubble1[1]; + const array<MarkerGraphEdgeId, 2> edges1 = + {chain10[1], chain11[1]}; + + // Compute the tangle matrix. + TangleMatrix tangleMatrix; + if(n == 1) { + for(uint64_t j0=0; j0<2; j0++) { + for(uint64_t j1=0; j1<2; j1++) { + MarkerGraphEdgePairInfo info; + SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair( + edges0[j0], edges1[j1], info)); + tangleMatrix[j0][j1] = info.common; + } + } + } else { + computeTangleMatrix( + {&chain00, &chain01}, + {&chain10, &chain11}, + n, tangleMatrix); + } + + // Analyze the tangle matrix. + int64_t phase; + uint64_t minConcordant; + uint64_t maxDiscordant; + uint64_t total; + double logPInPhase; + double logPOutOfPhase; + tangleMatrix.analyze( + lowThreshold, + highThreshold, + phase, + minConcordant, + maxDiscordant, + total, + epsilon, + logPInPhase, + logPOutOfPhase); + + // If no common reads, stop the loop on i1.
+ if(total == 0) { + break; + } + + if(detailedDebug) { + cout << "Tangle matrix " << i0 << " " << i1 << ": " << + tangleMatrix[0][0] << " " << + tangleMatrix[0][1] << " " << + tangleMatrix[1][0] << " " << + tangleMatrix[1][1] << endl; + cout << "minConcordant " << minConcordant << endl; + cout << "maxDiscordant " << maxDiscordant << endl; + cout << "log[p(in-phase)/p(random)] = " << logPInPhase << + " dB, log[p(out-of-phase)/p(random)] = " << logPOutOfPhase << " dB." << endl; + } + + // If using the Bayesian model, redefine the phase based on logPInPhase and logPOutOfPhase. + if(useBayesianModel) { + if((logPInPhase > minLogP) and (logPInPhase - logPOutOfPhase) > minLogP) { + phase = +1; + } else if((logPOutOfPhase > minLogP) and (logPOutOfPhase - logPInPhase) > minLogP) { + phase = -1; + } else { + phase = 0; + } + } + + // If not ambiguous, add an edge to the PhasingGraph. + if(phase != 0) { + boost::add_edge(pv0, pv1, {phase, minConcordant, maxDiscordant, logPInPhase, logPOutOfPhase}, phasingGraph); + + if(detailedDebug) { + cout << " Added phasing graph edge " << + phasingGraph[pv0].positionInBubbleChain << " " << + phasingGraph[pv1].positionInBubbleChain << " with minConcordant " << + minConcordant << ", maxDiscordant " << maxDiscordant << endl; + } + } else { + if(detailedDebug) { + cout << " No phasing graph edge for " << + phasingGraph[pv0].positionInBubbleChain << " " << + phasingGraph[pv1].positionInBubbleChain << endl; + } + } + + } + } + + if(debug) { + const uint64_t vertexCount = num_vertices(phasingGraph); + const uint64_t edgeCount = num_edges(phasingGraph); + const double connectivity = 2. * double(edgeCount) / double(vertexCount); + cout << "The phasing graph has " << vertexCount << + " vertices and " << edgeCount << " edges." 
+ " Average connectivity " << connectivity << endl; + } + + phasingGraph.phase1(false, useBayesianModel); + + + + // Use the PhasedComponents in the PhasingGraph to create + // a new BubbleChain that will replace the existing one. + phaseBubbleChainUsingPhasedComponents(debug, ce, phasingGraph.phasedComponents, longBubbleThreshold); +} + + + +// Use PhasedComponents to create a new BubbleChain that will replace the existing one. +void AssemblyGraph::phaseBubbleChainUsingPhasedComponents( + bool debug, + edge_descriptor e, + const vector<shared_ptr<PhasedComponent> >& phasedComponents, + uint64_t longBubbleThreshold) +{ + AssemblyGraph& cGraph = *this; + BubbleChain& bubbleChain = cGraph[e]; + + BubbleChain newBubbleChain; + if(debug) { + cout << "Creating the new bubble chain for " << bubbleChainStringId(e) << endl; + } + + // Loop over the phased components. + for(uint64_t i=0; /* Check later */; i++) { + + // Bubbles in-between phased components, or before the first phased component, + // or after the last phased component. + { + const uint64_t beginPositionInBubbleChain = + (i == 0) ? 0 : phasedComponents[i-1]->maxPositionInBubbleChain + 1; + const uint64_t endPositionInBubbleChain = + (i == phasedComponents.size()) ? + bubbleChain.size() : + phasedComponents[i]->minPositionInBubbleChain; + + + if(debug) { + cout << "Adding unphased bubbles at positions [" << + beginPositionInBubbleChain << "," << endPositionInBubbleChain << ")" << endl; + } + + for(uint64_t i=beginPositionInBubbleChain; i<endPositionInBubbleChain; i++) { + const Bubble& bubble = bubbleChain[i]; + + // This unphased bubble will be copied verbatim to the new chain if it is + // haploid or if it is long. 
+ bool copyVerbatim = bubble.isHaploid(); + if(not copyVerbatim) { + uint64_t averageOffset; + uint64_t minOffset; + uint64_t maxOffset; +#if 0 + if(bubbleOffsetNoException(bubble, averageOffset, minOffset, maxOffset)) { + copyVerbatim = maxOffset >= longBubbleThreshold; + } else { + copyVerbatim = false; + } +#else + bubbleOffset(bubble, averageOffset, minOffset, maxOffset); + copyVerbatim = maxOffset >= longBubbleThreshold; +#endif + } + + if(copyVerbatim) { + newBubbleChain.push_back(bubble); + } else { + // Just add a simple haploid bubble with only the source + // and target MarkerGraphEdgeIds. + Bubble newBubble; + newBubble.resize(1); // Make it haploid + Chain& newChain = newBubble.front(); // Its only chain. + newChain.push_back(bubble.front().front()); // Source MarkerGraphEdgeId + newChain.push_back(bubble.front().back()); // Target MarkerGraphEdgeId + newBubbleChain.push_back(newBubble); + } + } + } + + + + // If we are past the last phased component, we are done. + if(i == phasedComponents.size()) { + break; + } + + // Add a diploid bubble for the i-th phased component. + const PhasedComponent& phasedComponent = *phasedComponents[i]; + const uint64_t minPositionInBubbleChain = phasedComponent.minPositionInBubbleChain; + const uint64_t maxPositionInBubbleChain = phasedComponent.maxPositionInBubbleChain; + if(debug) { + cout << "Adding phased bubbles at positions " << + minPositionInBubbleChain << "-" << maxPositionInBubbleChain << endl; + } + newBubbleChain.emplace_back(); + Bubble& newBubble = newBubbleChain.back(); + newBubble.resize(2); // Make it diploid. + Chain& newChain0 = newBubble[0]; // The first haplotype after phasing. + Chain& newChain1 = newBubble[1]; // The second haplotype after phasing. + + // Add the source MarkerGraphEdgeId. 
+ newChain0.push_back(bubbleChain[minPositionInBubbleChain].front().front()); + newChain1.push_back(bubbleChain[minPositionInBubbleChain].front().front()); + + // Add the internal MarkerGraphEdgeIds of all phased diploid bubbles in this PhasedComponent. + for(const auto& p: phasedComponent) { + const uint64_t positionInBubbleChain = p.first; + const int64_t phase = p.second; + SHASTA_ASSERT(phase==1 or phase==-1); + const Bubble& bubble = bubbleChain[positionInBubbleChain]; + SHASTA_ASSERT(bubble.isDiploid()); + const Chain& chain0 = (phase==1) ? bubble[0] : bubble[1]; + const Chain& chain1 = (phase==1) ? bubble[1] : bubble[0]; + copy(chain0.begin()+1, chain0.end()-1, back_inserter(newChain0)); + copy(chain1.begin()+1, chain1.end()-1, back_inserter(newChain1)); + } + + // Add the target MarkerGraphEdgeId. + newChain0.push_back(bubbleChain[maxPositionInBubbleChain].front().back()); + newChain1.push_back(bubbleChain[maxPositionInBubbleChain].front().back()); + } + + // Replace the old BubbleChain with the new one, leaving the id of the edge unchanged. + newBubbleChain.compress(); + bubbleChain = newBubbleChain; +} + + + +void AssemblyGraph::phaseBubbleChainUsingPhasingTable( + const string& debugOutputFileNamePrefix, + edge_descriptor e, + double phaseErrorThreshold, + double bubbleErrorThreshold, + uint64_t longBubbleThreshold) +{ + AssemblyGraph& cGraph = *this; + BubbleChain& bubbleChain = cGraph[e]; + + const bool debug = not debugOutputFileNamePrefix.empty(); + + cleanupBubbleChainUsingPhasingTable( + debug ? (debugOutputFileNamePrefix + "-PreCleanup") : "", + e, + phaseErrorThreshold, + bubbleErrorThreshold, + longBubbleThreshold); + + +#if 0 + // If this bubble chain has a single bubble, there is nothing to do. + // NOT TRUE, WE STILL MAY HAVE TO REMOVE SOME BUBBLES. + if(bubbleChain.size() == 1) { + if(debug) { + cout << "Skipped because it has only one bubble." << endl; + } + return; + } +#endif + + // Create the phasing table for this bubble chain. 
+ PhasingTable phasingTable(bubbleChain, assembler.markerGraph, phaseErrorThreshold); + + if(phasingTable.empty()) { + if(debug) { + cout << "Not phasing because the phasing table is empty." << endl; + } + return; + } +#if 0 + // WE STILL MAY HAVE TO REMOVE SOME BUBBLES. + if(phasingTable.bubbleCount() < 2) { + if(debug) { + cout << "Not phasing because the phasing table has less than 2 bubbles." << endl; + } + return; + } +#endif + + if(debug) { + const uint64_t totalCount = phasingTable.entryCount(); + const uint64_t ambiguousCount = phasingTable.ambiguousEntryCount(); + const uint64_t unambiguousCount = totalCount - ambiguousCount; + const uint64_t bubbleCount = phasingTable.bubbleCount(); + const uint64_t orientedReadCount = phasingTable.orientedReadCount(); + const double coverage = double(unambiguousCount) / double(bubbleCount); + + cout << "Phasing table summary for " << bubbleChainStringId(e) << ":" << endl; + cout << bubbleCount << " diploid bubbles." << endl; + cout << orientedReadCount << " oriented reads." << endl; + cout << unambiguousCount << " unambiguous entries." << endl; + cout << ambiguousCount << " ambiguous entries." << endl; + cout << "Average coverage " << std::round(coverage) << endl; + cout << "Average number of diploid bubbles seen by each oriented read " << + std::round(double(unambiguousCount)/double(orientedReadCount)) << endl; + } + + // Phasing of the phasing table. + phasingTable.greedyPhasing(); + if(debug) { + uint64_t consistentCount; + uint64_t inconsistentCount; + tie(consistentCount, inconsistentCount) = phasingTable.countConsistentEntries(); + + cout << "After greedy phasing, the phasing table has " << consistentCount << + " consistent entries and " << inconsistentCount << + " inconsistent entries (" << consistentCount + inconsistentCount << + " total)." 
<< endl; + + phasingTable.writePng(debugOutputFileNamePrefix + "-Consistency.png", + PhasingTable::ColoringMethod::byConsistency); + phasingTable.writeCsv(debugOutputFileNamePrefix); + phasingTable.writePng(debugOutputFileNamePrefix + "-RelativePhase.png", + PhasingTable::ColoringMethod::byRelativePhase); + phasingTable.writePng(debugOutputFileNamePrefix + "-DiscreteRelativePhase.png", + PhasingTable::ColoringMethod::byDiscreteRelativePhase); + } + + // Create the PhasedComponents. + phasingTable.constructPhasedComponents(debug); + + +#if 1 + // Split each PhasedComponent at locations where this is necessary. + // Check pairs of adjacent consecutive bubbles in the same phased component. + vector< shared_ptr<PhasedComponent> > splitComponents; + for(const auto& phasedComponentPointer: phasingTable.phasedComponents) { + const PhasedComponent& phasedComponent = *phasedComponentPointer; + if(phasedComponent.size() < 2) { + // A component with fewer than two bubbles has no adjacent pair to check. + // Keep it unchanged and move on to the next component: exiting the loop + // here would drop this component and all the remaining ones when + // splitComponents is swapped in below. + splitComponents.push_back(phasedComponentPointer); + continue; + } + if(debug) { + cout << "Checking for splitting a PhasedComponent of size " << phasedComponent.size() << endl; + } + vector<uint64_t> splitComponentsBegin(1, 0); + for(uint64_t i=1; i<phasedComponent.size(); i++) { + const auto& p0 = phasedComponent[i-1]; + const auto& p1 = phasedComponent[i]; + const uint64_t positionInBubbleChain0 = p0.first; + const uint64_t positionInBubbleChain1 = p1.first; + const int64_t phase0 = p0.second; + const int64_t phase1 = p1.second; + + const Bubble& bubble0 = bubbleChain[positionInBubbleChain0]; + const Bubble& bubble1 = bubbleChain[positionInBubbleChain1]; + SHASTA_ASSERT(bubble0.isDiploid()); + SHASTA_ASSERT(bubble1.isDiploid()); + + const Chain& chain00 = bubble0[0]; + const Chain& chain01 = bubble0[1]; + const Chain& chain10 = (phase0 == phase1) ? bubble1[0] : bubble1[1]; + const Chain& chain11 = (phase0 == phase1) ?
bubble1[1] : bubble1[0]; + + MarkerGraphEdgeId e00 = chain00.secondToLast(); + MarkerGraphEdgeId e01 = chain01.secondToLast(); + MarkerGraphEdgeId e10 = chain10.second(); + MarkerGraphEdgeId e11 = chain11.second(); + + MarkerGraphEdgePairInfo info; + SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(e00, e10, info)); + const uint64_t common0 = info.common; + SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(e01, e11, info)); + const uint64_t common1 = info.common; + + if(debug) { + cout << "Bubble pair: " << + positionInBubbleChain0 << " " << + positionInBubbleChain1 << + ": side 0 " << e00 << " " << e10 << " " << common0 << " " << + ", side 1 " << e01 << " " << e11 << " " << common1 << endl; + if(common0 == 0 or common1 == 0) { + cout << "No common oriented reads." << endl; + } + } + + if(common0 == 0 or common1 == 0) { + splitComponentsBegin.push_back(i); + } + } + splitComponentsBegin.push_back(phasedComponent.size()); + + + // Split this phased component, if necessary. + if(splitComponentsBegin.size() == 2) { + // No splitting necessary. + splitComponents.push_back(phasedComponentPointer); + if(debug) { + cout << "No splitting was necessary." << endl; + } + } else { + // Split at the split points. + for(uint64_t i=0; i<splitComponentsBegin.size()-1; i++) { + const uint64_t begin = splitComponentsBegin[i]; + const uint64_t end = splitComponentsBegin[i+1]; + shared_ptr<PhasedComponent> splitComponentPointer = make_shared<PhasedComponent>(); + copy(phasedComponent.begin() + begin, phasedComponent.begin() + end, + back_inserter(*splitComponentPointer)); + splitComponentPointer->computePositionRange(); + splitComponents.push_back(splitComponentPointer); + if(debug) { + cout << "Created a split component at " << begin << " to " << end-1 << " (inclusive)." << endl; + } + } + } + } + phasingTable.phasedComponents.swap(splitComponents); +#endif + + + + // Remove PhasedComponents consisting of only one short bubble. 
+ { + vector< shared_ptr<PhasedComponent> > newPhasedComponents; + for(const auto& phasedComponent: phasingTable.phasedComponents) { + bool keep = true; + if(phasedComponent->size() == 1) { + const uint64_t positionInBubbleChain = phasedComponent->front().first; + const Bubble& bubble = bubbleChain[positionInBubbleChain]; + + uint64_t averageOffset; + uint64_t minOffset; + uint64_t maxOffset; + bubbleOffset(bubble, averageOffset, minOffset, maxOffset); + if(maxOffset < longBubbleThreshold) { + keep = false; + } + } + if(keep) { + newPhasedComponents.push_back(phasedComponent); + } + } + phasingTable.phasedComponents.swap(newPhasedComponents); + } + + + + // Use the phased components to phase the BubbleChain. + phaseBubbleChainUsingPhasedComponents( + debug, + e, + phasingTable.phasedComponents, + longBubbleThreshold); + +} + + + +void AssemblyGraph::cleanupBubbleChainUsingPhasingTable( + const string& debugOutputFileNamePrefix, + edge_descriptor e, + double phaseErrorThreshold, + double bubbleErrorThreshold, + uint64_t longBubbleThreshold) +{ + + AssemblyGraph& cGraph = *this; + BubbleChain& bubbleChain = cGraph[e]; + + const bool debug = not debugOutputFileNamePrefix.empty(); + if(debug) { + cout << "Before bubble clean up, bubble chain " << + bubbleChainStringId(e) << " has " << cGraph[e].size() << " bubbles." << endl; + } + + // If this bubble chain has a single bubble, there is nothing to do. + if(bubbleChain.size() == 1) { + if(debug) { + cout << "Skipped because it has only one bubble." << endl; + } + return; + } + + // Create the phasing table for this bubble chain. 
+ PhasingTable phasingTable(bubbleChain, assembler.markerGraph, phaseErrorThreshold); + + if(phasingTable.empty()) { + return; + } + if(phasingTable.bubbleCount() < 2) { + return; + } + + if(debug) { + const uint64_t totalCount = phasingTable.entryCount(); + const uint64_t ambiguousCount = phasingTable.ambiguousEntryCount(); + const uint64_t unambiguousCount = totalCount - ambiguousCount; + const uint64_t bubbleCount = phasingTable.bubbleCount(); + const uint64_t orientedReadCount = phasingTable.orientedReadCount(); + const double coverage = double(unambiguousCount) / double(bubbleCount); + + cout << "Phasing table summary (for bubble cleanup) " << bubbleChainStringId(e) << ":" << endl; + cout << bubbleCount << " diploid bubbles." << endl; + cout << orientedReadCount << " oriented reads." << endl; + cout << unambiguousCount << " unambiguous entries." << endl; + cout << ambiguousCount << " ambiguous entries." << endl; + cout << "Average coverage " << std::round(coverage) << endl; + cout << "Average number of diploid bubbles seen by each oriented read " << + std::round(double(unambiguousCount)/double(orientedReadCount)) << endl; + } + + // Phasing of the phasing table. + phasingTable.greedyPhasing(); + if(debug) { + uint64_t consistentCount; + uint64_t inconsistentCount; + tie(consistentCount, inconsistentCount) = phasingTable.countConsistentEntries(); + + cout << "After greedy phasing, the phasing table (for bubble cleanup) has " << consistentCount << + " consistent entries and " << inconsistentCount << + " inconsistent entries (" << consistentCount + inconsistentCount << + " total)." 
<< endl; + + phasingTable.writePng(debugOutputFileNamePrefix + "-Consistency.png", + PhasingTable::ColoringMethod::byConsistency); + phasingTable.writeCsv(debugOutputFileNamePrefix); + phasingTable.writePng(debugOutputFileNamePrefix + "-RelativePhase.png", + PhasingTable::ColoringMethod::byRelativePhase); + phasingTable.writePng(debugOutputFileNamePrefix + "-DiscreteRelativePhase.png", + PhasingTable::ColoringMethod::byDiscreteRelativePhase); + } + + + // Use the PhasingTable to create a new BubbleChain that will replace the existing one. + // In the new bubble chain, we remove: + // - All diploid bubbles that have a high error rate in the PhasingTable, + // unless they are longer than longBubbleThreshold. + // - All bubbles with ploidy greater than 2, + // unless they are longer than longBubbleThreshold. + // Each bubble that is removed is replaced by a haploid bubble consisting + // of only the terminal MarkerGraphEdgeIds. + BubbleChain newBubbleChain; + for(uint64_t positionInBubbleChain = 0; positionInBubbleChain < bubbleChain.size(); + positionInBubbleChain++) { + const Bubble& bubble = bubbleChain[positionInBubbleChain]; + + // Decide whether this Bubble will be copied verbatim to the new bubble chain. + bool copyVerbatim = false; + if(bubble.isHaploid()) { + copyVerbatim = true; + if(debug) { + cout << "Bubble at position in bubble chain " << positionInBubbleChain << + " is haploid and will be kept." << endl; + } + } else if(bubble.isDiploid()) { + const double bubbleErrorRate = phasingTable.bubbleErrorRate(positionInBubbleChain); + if(debug) { + cout << "Bubble at phasing table index " << phasingTable.bubblesMap[positionInBubbleChain] << + " position in bubble chain " << positionInBubbleChain << + " has error rate " << bubbleErrorRate; + if(bubbleErrorRate <= bubbleErrorThreshold) { + cout << " and will be kept." << endl; + } else { + cout << " and will be removed." 
<< endl; + } + } + if(bubbleErrorRate <= bubbleErrorThreshold) { + copyVerbatim = true; + } + } else { + if(debug) { + cout << "Bubble at position in bubble chain " << positionInBubbleChain << + " has ploidy " << bubble.size() << " and will be removed." << endl; + } + } + if(not copyVerbatim) { + uint64_t averageOffset; + uint64_t minOffset; + uint64_t maxOffset; + bubbleOffset(bubble, averageOffset, minOffset, maxOffset); + copyVerbatim = maxOffset >= longBubbleThreshold; + } + + if(copyVerbatim) { + newBubbleChain.push_back(bubble); + if(debug) { + cout << "Bubble at position in bubble chain " << positionInBubbleChain << + " was copied to the new bubble chain." << endl; + } + } else { + // Just add a simple haploid bubble with only the source + // and target MarkerGraphEdgeIds. + Bubble newBubble; + newBubble.resize(1); // Make it haploid + Chain& newChain = newBubble.front(); // Its only chain. + newChain.push_back(bubble.front().front()); // Source MarkerGraphEdgeId + newChain.push_back(bubble.front().back()); // Target MarkerGraphEdgeId + newBubbleChain.push_back(newBubble); + if(debug) { + cout << "Bubble at position in bubble chain " << positionInBubbleChain << + " was replaced by a simple haploid bubble in the new bubble chain: " << + bubble.front().front() << " " << bubble.front().back() << endl; + } + } + } + bubbleChain = newBubbleChain; + + if(debug) { + cout << "After bubble clean up, bubble chain " << + bubbleChainStringId(e) << " has " << newBubbleChain.size() << + " bubbles of which " << + newBubbleChain.diploidBubbleCount() << " diploid." << endl; + const string csvFileName = debugOutputFileNamePrefix + "-ChainsDetails-PostBubbleCleanup.csv"; + ofstream csv(csvFileName); + cout << "For chain details after bubble cleanup, see " << csvFileName << endl; + writeChainDetailsCsv(csv, e, true); + } + + // Replace the old BubbleChain with the new one, leaving the id of the edge unchanged. 
+ bubbleChain.compress(); + if(debug) { + // Report the counts of bubbleChain, on which compress() was just called. + // newBubbleChain is the copy made before compression. + cout << "After bubble clean up and compression, bubble chain " << + bubbleChainStringId(e) << " has " << bubbleChain.size() << + " bubbles of which " << + bubbleChain.diploidBubbleCount() << " diploid." << endl; + const string csvFileName = debugOutputFileNamePrefix + + "-ChainsDetails-PostBubbleCleanupSAndCompress.csv"; + ofstream csv(csvFileName); + cout << "For chain details after bubble cleanup and compress, see " << csvFileName << endl; + writeChainDetailsCsv(csv, e, true); + } +} + + + +// Compute the tangle matrix between two incoming chains +// and two outgoing chains, taking into account up to +// n MarkergraphEdgeIds for each Chain. +void AssemblyGraph::computeTangleMatrix( + const array<const Chain*, 2> inChains, + const array<const Chain*, 2> outChains, + uint64_t n, + TangleMatrix& tangleMatrix) const +{ + // Gather the OrientedReadIds near the end of the inChains. + array<vector<OrientedReadId>, 2> allOrientedReadIdsIn; + for(uint64_t i=0; i<2; i++) { + gatherOrientedReadIdsAtEnd(*inChains[i], n, allOrientedReadIdsIn[i]); + + } + + // Gather the OrientedReadIds near the beginning of the outChains. + array<vector<OrientedReadId>, 2> allOrientedReadIdsOut; + for(uint64_t i=0; i<2; i++) { + gatherOrientedReadIdsAtBeginning(*outChains[i], n, allOrientedReadIdsOut[i]); + } + + // Discard OrientedReadIds that appear in both inChains. + array<vector<OrientedReadId>, 2> orientedReadIdsIn; + for(uint64_t i=0; i<2; i++) { + std::set_difference( + allOrientedReadIdsIn[i] .begin(), allOrientedReadIdsIn[i] .end(), + allOrientedReadIdsIn[1-i].begin(), allOrientedReadIdsIn[1-i].end(), + back_inserter(orientedReadIdsIn[i])); + } + + // Discard OrientedReadIds that appear in both outChains.
+ array<vector<OrientedReadId>, 2> orientedReadIdsOut; + for(uint64_t i=0; i<2; i++) { + std::set_difference( + allOrientedReadIdsOut[i] .begin(), allOrientedReadIdsOut[i] .end(), + allOrientedReadIdsOut[1-i].begin(), allOrientedReadIdsOut[1-i].end(), + back_inserter(orientedReadIdsOut[i])); + } + + // Now we can compute the tangle matrix. + vector<OrientedReadId> commonOrientedReads; + for(uint64_t i0=0; i0<2; i0++) { + for(uint64_t i1=0; i1<2; i1++) { + commonOrientedReads.clear(); + set_intersection( + orientedReadIdsIn[i0] .begin(), orientedReadIdsIn[i0] .end(), + orientedReadIdsOut[i1].begin(), orientedReadIdsOut[i1].end(), + back_inserter(commonOrientedReads)); + tangleMatrix[i0][i1] = commonOrientedReads.size(); + } + } +} + + + +// Gather OrientedReadIds from up to n MarkergraphEdgeIds +// near the end of a chain. +void AssemblyGraph::gatherOrientedReadIdsAtEnd( + const Chain& chain, + uint64_t n, + vector<OrientedReadId>& orientedReadIds) const +{ + + const uint64_t last = chain.size() - 2; // Exclude last MarkergraphEdgeId. + const uint64_t first = (last > (n-1)) ? last + 1 - n : 0; // Use up to n. + + SHASTA_ASSERT(first < chain.size()); + SHASTA_ASSERT(last < chain.size()); + + orientedReadIds.clear(); + for(uint64_t i=first; i<=last; i++) { + const MarkerGraphEdgeId markerGraphEdgeId = chain[i]; + const auto& markerIntervals = + assembler.markerGraph.edgeMarkerIntervals[markerGraphEdgeId]; + for(const MarkerInterval& markerInterval: markerIntervals) { + orientedReadIds.push_back(markerInterval.orientedReadId); + } + } + deduplicate(orientedReadIds); +} + + + +// Gather OrientedReadIds from up to n MarkergraphEdgeIds +// near the beginning of a chain. +void AssemblyGraph::gatherOrientedReadIdsAtBeginning( + const Chain& chain, + uint64_t n, + vector<OrientedReadId>& orientedReadIds) const +{ + + const uint64_t first = 1; // / Exclude first MarkergraphEdgeId. + const uint64_t last = (chain.size() > (n+1)) ? 
n : chain.size() - 1; + + SHASTA_ASSERT(first < chain.size()); + SHASTA_ASSERT(last < chain.size()); + + orientedReadIds.clear(); + for(uint64_t i=first; i<=last; i++) { + const MarkerGraphEdgeId markerGraphEdgeId = chain[i]; + const auto& markerIntervals = + assembler.markerGraph.edgeMarkerIntervals[markerGraphEdgeId]; + for(const MarkerInterval& markerInterval: markerIntervals) { + orientedReadIds.push_back(markerInterval.orientedReadId); + } + } + deduplicate(orientedReadIds); +} + + + +// To phase the PhasingGraph, we create an optimal spanning tree +// using edges in order of decreasing "significance". +void AssemblyGraph::PhasingGraph::phase(bool debug) +{ + PhasingGraph& phasingGraph = *this; + + // Gather edges by maxDiscordant and minConcordant. + // edgeTable[maxDiscordant][minConcordant] contains the + // edges with those values of maxDiscordant and minConcordant. + // This allows the code later ot process edges in order + // of increasing maxDiscordant and decreasing minConcordant. + vector< vector< vector<edge_descriptor> > > edgeTable; + BGL_FORALL_EDGES(pe, phasingGraph, PhasingGraph) { + const PhasingGraphEdge& edge = phasingGraph[pe]; + const uint64_t maxDiscordant = edge.maxDiscordant; + const uint64_t minConcordant = edge.minConcordant; + if(edgeTable.size() <= maxDiscordant) { + edgeTable.resize(maxDiscordant + 1); + } + vector< vector<edge_descriptor> >& v = edgeTable[maxDiscordant]; + if(v.size() <= minConcordant) { + v.resize(minConcordant + 1); + } + v[minConcordant].push_back(pe); + } + + // Map vertices to integers. + std::map<vertex_descriptor, uint64_t> vertexIndexMap; + uint64_t vertexIndex = 0; + BGL_FORALL_VERTICES(pv, phasingGraph, PhasingGraph) { + vertexIndexMap.insert({pv, vertexIndex++}); + } + const uint64_t vertexCount = vertexIndexMap.size(); + + + + // Compute optimal spanning tree and connected components. 
+ vector<uint64_t> rank(vertexCount); + vector<uint64_t> parent(vertexCount); + boost::disjoint_sets<uint64_t*, uint64_t*> disjointSets(&rank[0], &parent[0]); + for(uint64_t i=0; i<vertexCount; i++) { + disjointSets.make_set(i); + } + uint64_t spanningTreeEdgeCount = 0; + for(uint64_t maxDiscordant=0; maxDiscordant<edgeTable.size(); maxDiscordant++) { + const vector< vector<edge_descriptor> >& v = edgeTable[maxDiscordant]; + for(int64_t minConcordant=v.size()-1; minConcordant>=0; minConcordant--) { + const vector<edge_descriptor>& vv = v[minConcordant]; + if(false) { + cout << "Processing " << vv.size() << " phasing graph edges with maxDiscordant=" << + maxDiscordant << ", minConcordant=" << minConcordant << endl; + } + for(const edge_descriptor e: vv) { + PhasingGraphEdge& edge = phasingGraph[e]; + const vertex_descriptor pv0 = source(e, phasingGraph); + const vertex_descriptor pv1 = target(e, phasingGraph); + const uint64_t vertexIndex0 = vertexIndexMap[pv0]; + const uint64_t vertexIndex1 = vertexIndexMap[pv1]; + const uint64_t componentId0 = disjointSets.find_set(vertexIndex0); + const uint64_t componentId1 = disjointSets.find_set(vertexIndex1); + if(componentId0 != componentId1) { + disjointSets.union_set(vertexIndex0, vertexIndex1); + edge.isSpanningTreeEdge = true; + ++spanningTreeEdgeCount; + } + } + if(false) { + cout << "Found " << spanningTreeEdgeCount << " spanning tree edges so far." << endl; + } + } + } + + // Gather the vertices in each connected component. + vector< vector<vertex_descriptor> > components(vertexCount); + BGL_FORALL_VERTICES(pv, phasingGraph, PhasingGraph) { + const uint64_t componentId = disjointSets.find_set(vertexIndexMap[pv]); + components[componentId].push_back(pv); + } + + // Write a histogram of component sizes. 
+ if(debug) { + vector<uint64_t> histogram; + for(const vector<vertex_descriptor>& component: components) { + const uint64_t componentSize = component.size(); + if(histogram.size() <= componentSize) { + histogram.resize(componentSize + 1, 0); + } + ++histogram[componentSize]; + } + + cout << "Histogram of component sizes:" << endl; + cout << "Size,Frequency,Vertices" << endl; + for(uint64_t componentSize=1; componentSize<histogram.size(); componentSize++) { + const uint64_t frequency = histogram[componentSize]; + if(frequency) { + cout << componentSize << "," << frequency << "," << componentSize*frequency << endl; + } + } + } + + // Gather the non-trivial component and sort them by decreasing size. + vector< pair<uint64_t, uint64_t> > componentTable; // (componentId, componentSize) + for(uint64_t componentId=0; componentId<vertexCount; componentId++) { + const vector<vertex_descriptor>& component = components[componentId]; + if(component.size() > 1) { + componentTable.push_back({componentId, component.size()}); + } + } + sort(componentTable.begin(), componentTable.end(), + OrderPairsBySecondOnlyGreater<uint64_t, uint64_t>()); + + + + // Process the non-trivial components in order of decreasing size. + phasedComponents.clear(); + for(const pair<uint64_t, uint64_t>& p: componentTable) { + const uint64_t componentId = p.first; + const vector<vertex_descriptor>& component = components[componentId]; + SHASTA_ASSERT(component.size() == p.second); + if(debug) { + cout << "Processing a phasing component with " << component.size() << + " vertices." << endl; + } + + // Use a BFS on the spanning tree to phase the vertices in this component. + // Use the spanning tree to phase vertices in the largest component. + // It does not matter which vertex we start from. 
+ const vertex_descriptor vFirst = component.front(); + phasingGraph[vFirst].phase = +1; + std::queue<vertex_descriptor> q; + q.push(vFirst); + while(not q.empty()) { + const vertex_descriptor v0 = q.front(); + q.pop(); + BGL_FORALL_OUTEDGES(v0, e, phasingGraph, PhasingGraph) { + PhasingGraphEdge& edge = phasingGraph[e]; + if(not edge.isSpanningTreeEdge) { + continue; + } + const PhasingGraphVertex& vertex0 = phasingGraph[v0]; + const vertex_descriptor v1 = target(e, phasingGraph); + PhasingGraphVertex& vertex1 = phasingGraph[v1]; + if(vertex1.phase == 0) { + vertex1.phase = vertex0.phase; + if(edge.phase == -1) { + vertex1.phase = - vertex1.phase; + } + q.push(v1); + } + } + } + + // Count inconsistent edges in this component. + if(debug) { + uint64_t inconsistentCount = 0; + uint64_t totalCount = 0; + for(const vertex_descriptor v: component) { + BGL_FORALL_OUTEDGES(v, e, phasingGraph, PhasingGraph) { + totalCount++; + if(not isConsistent(e)) { + ++inconsistentCount; + } + } + } + // This counts edges twice. + inconsistentCount /= 2; + totalCount /= 2; + cout << inconsistentCount << " inconsistent edges in this component out of " << + totalCount << " total." << endl; + } + + + // Create the PhasedComponent corresponding to this component. + // Don't include any vertices that overlap previous PhasedComponent. 
+ shared_ptr<PhasedComponent> phasedComponentPointer = make_shared<PhasedComponent>(); + PhasedComponent& phasedComponent = *phasedComponentPointer; + for(const vertex_descriptor pv: component) { + const PhasingGraphVertex& vertex = phasingGraph[pv]; + const uint64_t positionInBubbleChain = vertex.positionInBubbleChain; + bool overlapsPrevious = false; + for(const auto& phasedComponent: phasedComponents) { + if( + positionInBubbleChain >= phasedComponent->minPositionInBubbleChain and + positionInBubbleChain <= phasedComponent->maxPositionInBubbleChain) { + overlapsPrevious = true; + break; + } + } + if(not overlapsPrevious) { + phasedComponent.push_back({vertex.positionInBubbleChain, vertex.phase}); + } + } + if(phasedComponent.size() < 2) { + if(debug) { + cout << "This component will be discarded due to overlap with previous components." << endl; + } + continue; + } + phasedComponent.sort(); + if(debug) { + cout << "Phasing range for this component " << phasedComponent.minPositionInBubbleChain << + " " << phasedComponent.maxPositionInBubbleChain << endl; + } + phasedComponents.push_back(phasedComponentPointer); + } + + // Sort the phased components in order of increasing position. + class SortHelper { + public: + bool operator()( + const shared_ptr<PhasedComponent>& p0, + const shared_ptr<PhasedComponent>& p1 + ) const + { + return p0->minPositionInBubbleChain < p1->minPositionInBubbleChain; + } + }; + sort(phasedComponents.begin(), phasedComponents.end(), SortHelper()); + + if(debug) { + cout << "Kept " << phasedComponents.size() << " phased components:" << endl; + for(const auto& phasedComponent: phasedComponents) { + cout << phasedComponent->size() << " diploid bubbles at positions " << + phasedComponent->minPositionInBubbleChain << "..." << + phasedComponent->maxPositionInBubbleChain << " in bubble chain." 
<< endl; + + } + phasingGraph.writeGraphviz("PhasingGraph.dot"); + } +} + + + +// Sort edges in order of decreasing significance: +// - If using the Bayesian model, logP. +// - Otherwise, minConcordant/maxDiscordant. +void AssemblyGraph::PhasingGraph::sortEdges( + vector<edge_descriptor>& sortedEdges, + bool useBayesianModel) const +{ + const PhasingGraph& phasingGraph = *this; + + if(useBayesianModel) { + + // Gather edges and their logP. + vector< pair<edge_descriptor, double> > edgeTable; + BGL_FORALL_EDGES(pe, phasingGraph, PhasingGraph) { + const PhasingGraphEdge& edge = phasingGraph[pe]; + edgeTable.push_back({pe, edge.logP()}); + } + + // Sort by decreasing logP. + sort(edgeTable.begin(), edgeTable.end(), + OrderPairsBySecondOnlyGreater<edge_descriptor, double>()); + sortedEdges.clear(); + for(const auto& p: edgeTable) { + sortedEdges.push_back(p.first); + } + + } else { + + // Gather edges by maxDiscordant and minConcordant. + // edgeTable[maxDiscordant][minConcordant] contains the + // edges with those values of maxDiscordant and minConcordant. + vector< vector< vector<edge_descriptor> > > edgeTable; + BGL_FORALL_EDGES(pe, phasingGraph, PhasingGraph) { + const PhasingGraphEdge& edge = phasingGraph[pe]; + const uint64_t maxDiscordant = edge.maxDiscordant; + const uint64_t minConcordant = edge.minConcordant; + if(edgeTable.size() <= maxDiscordant) { + edgeTable.resize(maxDiscordant + 1); + } + vector< vector<edge_descriptor> >& v = edgeTable[maxDiscordant]; + if(v.size() <= minConcordant) { + v.resize(minConcordant + 1); + } + v[minConcordant].push_back(pe); + } + + // The sorted edges are in order of increasing maxDiscordant + // and decreasing minConcordant. 
+ sortedEdges.clear(); + for(uint64_t maxDiscordant=0; maxDiscordant<edgeTable.size(); maxDiscordant++) { + const vector< vector<edge_descriptor> >& v = edgeTable[maxDiscordant]; + for(int64_t minConcordant=v.size()-1; minConcordant>=0; minConcordant--) { + const vector<edge_descriptor>& vv = v[minConcordant]; + for(const edge_descriptor e: vv) { + sortedEdges.push_back(e); + } + } + } + + } +} + + + +// To phase the PhasingGraph, we create an optimal spanning tree +// using edges in order of decreasing "significance". +// We do this iteratively. At each iteration we process the largest +// connected component of the surviving PhasingGraph. +void AssemblyGraph::PhasingGraph::phase1(bool debug, bool useBayesianModel) +{ + PhasingGraph& phasingGraph = *this; + phasedComponents.clear(); + + if(debug) { + cout << "Beginning phasing for a PhasingGraph with " << num_vertices(phasingGraph) << + " vertices." << endl; + } + + // Main iteration loop. + while(true) { + + // Clear the isSpanningTreeEdge flag of all edges. + BGL_FORALL_EDGES(pe, phasingGraph, PhasingGraph) { + phasingGraph[pe].isSpanningTreeEdge = false; + } + + // Sort edges in order of decreasing significance: + // - If using the Bayesian model, logP. + // - Otherwise, minConcordant/maxDiscordant. + vector<edge_descriptor> sortedEdges; + sortEdges(sortedEdges, useBayesianModel); + + // Map vertices to integers. + // This is needed for the computation of the spanning tree and + // connected components. + std::map<vertex_descriptor, uint64_t> vertexIndexMap; + uint64_t vertexIndex = 0; + BGL_FORALL_VERTICES(pv, phasingGraph, PhasingGraph) { + vertexIndexMap.insert({pv, vertexIndex++}); + } + const uint64_t vertexCount = vertexIndexMap.size(); + + if(debug) { + cout << "Beginning a new phasing iteration. The phasing graph has " << + vertexCount << " vertices left." << endl; + } + + + + // Compute optimal spanning tree and connected components. 
+ vector<uint64_t> rank(vertexCount); + vector<uint64_t> parent(vertexCount); + boost::disjoint_sets<uint64_t*, uint64_t*> disjointSets(&rank[0], &parent[0]); + for(uint64_t i=0; i<vertexCount; i++) { + disjointSets.make_set(i); + } + uint64_t spanningTreeEdgeCount = 0; + + for(const edge_descriptor e: sortedEdges) { + PhasingGraphEdge& edge = phasingGraph[e]; + const vertex_descriptor pv0 = source(e, phasingGraph); + const vertex_descriptor pv1 = target(e, phasingGraph); + const uint64_t vertexIndex0 = vertexIndexMap[pv0]; + const uint64_t vertexIndex1 = vertexIndexMap[pv1]; + const uint64_t componentId0 = disjointSets.find_set(vertexIndex0); + const uint64_t componentId1 = disjointSets.find_set(vertexIndex1); + if(componentId0 != componentId1) { + disjointSets.union_set(vertexIndex0, vertexIndex1); + edge.isSpanningTreeEdge = true; + ++spanningTreeEdgeCount; + } + } + + // Gather the vertices in each connected component. + vector< vector<vertex_descriptor> > components(vertexCount); + BGL_FORALL_VERTICES(pv, phasingGraph, PhasingGraph) { + const uint64_t componentId = disjointSets.find_set(vertexIndexMap[pv]); + components[componentId].push_back(pv); + } + + // Find the largest connected component. + uint64_t largestComponentId = invalid<uint64_t>; + uint64_t largestComponentSize = 0; + for(uint64_t componentId=0; componentId<vertexCount; componentId++) { + const uint64_t componentSize = components[componentId].size(); + if(componentSize > largestComponentSize) { + largestComponentSize = componentSize; + largestComponentId = componentId; + } + } + + // If the largest component has less than two vertices, we are done. + if(largestComponentSize < 2) { + if(debug) { + cout << "Phasing terminates because only trivial connected components were found." << endl; + } + break; + } + + // Access the largest connected component, which we will be working on + // for the rest of this iteration. 
+ const vector<vertex_descriptor>& component = components[largestComponentId]; + SHASTA_ASSERT(component.size() == largestComponentSize); + if(debug) { + cout << "The largest component of the current PhasingGraph has " << + largestComponentSize << " vertices." << endl; + } + + // Use a BFS on the spanning tree to phase the vertices in this component. + // It does not matter which vertex we start from. + const vertex_descriptor vFirst = component.front(); + phasingGraph[vFirst].phase = +1; + std::queue<vertex_descriptor> q; + q.push(vFirst); + while(not q.empty()) { + const vertex_descriptor v0 = q.front(); + q.pop(); + BGL_FORALL_OUTEDGES(v0, e, phasingGraph, PhasingGraph) { + PhasingGraphEdge& edge = phasingGraph[e]; + if(not edge.isSpanningTreeEdge) { + continue; + } + const PhasingGraphVertex& vertex0 = phasingGraph[v0]; + const vertex_descriptor v1 = target(e, phasingGraph); + PhasingGraphVertex& vertex1 = phasingGraph[v1]; + if(vertex1.phase == 0) { + vertex1.phase = vertex0.phase; + if(edge.phase == -1) { + vertex1.phase = - vertex1.phase; + } + q.push(v1); + } + } + } + + // Count inconsistent edges in this component. + if(debug) { + uint64_t inconsistentCount = 0; + uint64_t totalCount = 0; + for(const vertex_descriptor v: component) { + BGL_FORALL_OUTEDGES(v, e, phasingGraph, PhasingGraph) { + totalCount++; + if(not isConsistent(e)) { + ++inconsistentCount; + } + } + } + // This counts edges twice. + inconsistentCount /= 2; + totalCount /= 2; + cout << inconsistentCount << " inconsistent edges in this component out of " << + totalCount << " total." << endl; + } + + // All vertices in this component have been phased. + // However, when creating the PhasedComponent, we have to make sure that adjacent + // phased vertices have common reads. + // To guarantee this, we find a longest path in this component, in order of increasing + // positionInBubbleChain. Only vertices in this longest path are then included in the + // PhasedComponent. 
+ + // To find this longest path, we use an algorithm similar to the one in longestPath.cpp, + // using the topological ordering induced by positionInBubbleChain. + + // Table of the vertices in order of increasing positionInBubbleChain. + vector< pair<vertex_descriptor, uint64_t> > vertexTable; + for(const vertex_descriptor v: component) { + vertexTable.push_back({v, phasingGraph[v].positionInBubbleChain}); + } + sort(vertexTable.begin(), vertexTable.end(), OrderPairsBySecondOnly<vertex_descriptor, uint64_t>()); + + // The length of the longest path ending at each vertex. + std::map<vertex_descriptor, uint64_t> lengthMap; + for(const vertex_descriptor v: component) { + lengthMap.insert(make_pair(v, 0)); + } + + // Process the vertices in order of increasing positionInBubbleChain. + for(const auto& p: vertexTable) { + const vertex_descriptor v0 = p.first; + const uint64_t positionInBubbleChain0 = phasingGraph[v0].positionInBubbleChain; + + uint64_t maximumLength = 0; + BGL_FORALL_OUTEDGES_T(v0, e, phasingGraph, PhasingGraph) { + const vertex_descriptor v1 = target(e, phasingGraph); + const uint64_t positionInBubbleChain1 = phasingGraph[v1].positionInBubbleChain; + + if(positionInBubbleChain1 < positionInBubbleChain0) { + maximumLength = max(maximumLength, lengthMap[v1]); + } + } + lengthMap[v0] = maximumLength + 1; + } + + // Find the vertex with the longest length. + // This will be the end of the longest path. + vertex_descriptor v = PhasingGraph::null_vertex(); + uint64_t maximumLength = 0; + for(const auto& p: lengthMap) { + if(p.second > maximumLength) { + v = p.first; + maximumLength = p.second; + } + } + + // Constuct the path, moving backward from here. 
+ vector<vertex_descriptor> longestPath; + longestPath.push_back(v); + while(true) { + vertex_descriptor vPrevious = PhasingGraph::null_vertex(); + uint64_t maximumLength = 0; + BGL_FORALL_OUTEDGES(v, e, phasingGraph, PhasingGraph) { + const vertex_descriptor v0 = target(e, phasingGraph); + if(phasingGraph[v0].positionInBubbleChain < phasingGraph[v].positionInBubbleChain) { + const uint64_t length = lengthMap[v0]; + if(length > maximumLength) { + vPrevious = v0; + maximumLength = length; + } + } + } + if(vPrevious == PhasingGraph::null_vertex()) { + break; + } + v = vPrevious; + longestPath.push_back(v); + + } + std::reverse(longestPath.begin(), longestPath.end()); + + if(debug) { + cout << "The longest path contains " << longestPath.size() << " vertices." << endl; + } + + + + // If the longest path is non-trivial, use it to create a new PhasedComponent. + if(longestPath.size() > 1) { + if(debug) { + cout << "Creating a new PhasedComponent." << endl; + } + shared_ptr<PhasedComponent> phasedComponentPointer = make_shared<PhasedComponent>(); + phasedComponents.push_back(phasedComponentPointer); + PhasedComponent& phasedComponent = *phasedComponentPointer; + + for(const vertex_descriptor v: longestPath) { + const PhasingGraphVertex& vertex = phasingGraph[v]; + phasedComponent.push_back({vertex.positionInBubbleChain, vertex.phase}); + } + phasedComponent.minPositionInBubbleChain = phasingGraph[longestPath.front()].positionInBubbleChain; + phasedComponent.maxPositionInBubbleChain = phasingGraph[longestPath.back()].positionInBubbleChain; + if(debug) { + cout << "Phasing range for this component " << phasedComponent.minPositionInBubbleChain << + " " << phasedComponent.maxPositionInBubbleChain << endl; + } + + // Now remove from the PhasingGraph all vertices of this component + // plus any vertices with a positionInBubbleChain + // that overlaps this phased component. 
+ vector<vertex_descriptor> verticesToBeRemoved = component; + BGL_FORALL_VERTICES(v, phasingGraph, PhasingGraph) { + const uint64_t positionInBubbleChain = phasingGraph[v].positionInBubbleChain; + if( positionInBubbleChain >= phasedComponent.minPositionInBubbleChain and + positionInBubbleChain <= phasedComponent.maxPositionInBubbleChain) { + verticesToBeRemoved.push_back(v); + } + } + deduplicate(verticesToBeRemoved); + for(const vertex_descriptor v: verticesToBeRemoved) { + clear_vertex(v, phasingGraph); + remove_vertex(v, phasingGraph); + } + } else { + + // Now remove from the PhasingGraph all vertices of this component. + for(const vertex_descriptor v: component) { + clear_vertex(v, phasingGraph); + remove_vertex(v, phasingGraph); + } + } + } + + + + // Sort the phased components in order of increasing position. + class SortHelper { + public: + bool operator()( + const shared_ptr<PhasedComponent>& p0, + const shared_ptr<PhasedComponent>& p1 + ) const + { + return p0->minPositionInBubbleChain < p1->minPositionInBubbleChain; + } + }; + sort(phasedComponents.begin(), phasedComponents.end(), SortHelper()); + + if(debug) { + cout << phasedComponents.size() << " phased components:" << endl; + for(const auto& phasedComponent: phasedComponents) { + cout << phasedComponent->size() << " diploid bubbles at positions " << + phasedComponent->minPositionInBubbleChain << "..." << + phasedComponent->maxPositionInBubbleChain << " in bubble chain." 
<< endl;
+
+        }
+        // phasingGraph.writeGraphviz("PhasingGraph.dot");
+    }
+}
+
+
+
+// Return true if the phases of the two vertices of edge e agree with
+// the phase of the edge:
+// - Edge phase +1: the two vertices must have the same phase.
+// - Edge phase -1: the two vertices must have opposite phases.
+// The edge and both of its vertices must have phase +1 or -1 (asserted),
+// so this can only be called for edges between phased vertices.
+bool AssemblyGraph::PhasingGraph::isConsistent(edge_descriptor e) const
+{
+    const PhasingGraph& phasingGraph = *this;
+    const vertex_descriptor v0 = source(e, phasingGraph);
+    const vertex_descriptor v1 = target(e, phasingGraph);
+    const int64_t phase0 = phasingGraph[v0].phase;
+    const int64_t phase1 = phasingGraph[v1].phase;
+    const int64_t phase = phasingGraph[e].phase;
+
+    SHASTA_ASSERT(phase0==+1 or phase0==-1);
+    SHASTA_ASSERT(phase1==+1 or phase1==-1);
+    SHASTA_ASSERT(phase==+1 or phase==-1);
+
+    if(phase == +1) {
+        return phase0 == phase1;
+    } else {
+        return phase0 != phase1;
+    }
+}
+
+
+
+// Write the PhasingGraph to a file in Graphviz format, as an undirected graph.
+// Each edge is written using the positionInBubbleChain of its two vertices.
+// Spanning tree edges are written in green.
+// Other edges are written in red if isConsistent returns false for them,
+// and without a color otherwise.
+void AssemblyGraph::PhasingGraph::writeGraphviz(const string& fileName) const
+{
+    const PhasingGraph& phasingGraph = *this;
+
+    ofstream dot(fileName);
+    dot << "graph PhasingGraph {\n";
+
+    BGL_FORALL_EDGES(e, phasingGraph, PhasingGraph) {
+        const vertex_descriptor v0 = source(e, phasingGraph);
+        const vertex_descriptor v1 = target(e, phasingGraph);
+        dot <<
+            phasingGraph[v0].positionInBubbleChain << "--" <<
+            phasingGraph[v1].positionInBubbleChain;
+        if(phasingGraph[e].isSpanningTreeEdge) {
+            dot << " [color=green]";
+        } else if(not isConsistent(e)) {
+            dot << " [color=red]";
+        }
+        dot << ";\n";
+    }
+
+    dot << "}\n";
+}
+
+
+
+// Analyze this 2 by 2 tangle matrix for phasing.
+// Each matrix element is classified as low (<=lowThreshold),
+// high (>=highThreshold), or ambiguous (anything in between).
+// On return:
+// - phase is +1 if both diagonal elements are high and both off-diagonal
+//   elements are low (in phase), -1 if both off-diagonal elements are high
+//   and both diagonal elements are low (out of phase), and 0 otherwise.
+// - minConcordant is the smaller of the two high elements and
+//   maxDiscordant is the larger of the two low elements.
+//   Both are 0 if phase is 0.
+// - total is the sum of all four matrix elements.
+// - logPin and logPout are the values returned by
+//   diploidBayesianPhase(m, epsilon). They are computed in all cases.
+void AssemblyGraph::TangleMatrix::analyze(
+    uint64_t lowThreshold,
+    uint64_t highThreshold,
+    int64_t& phase,
+    uint64_t& minConcordant,
+    uint64_t& maxDiscordant,
+    uint64_t& total,
+    double epsilon,
+    double& logPin, // log[P(in-phase)/P(random)] in decibels
+    double& logPout // log[P(out-of-phase)/P(random)] in decibels
+    ) const
+{
+    const TangleMatrix& m = *this;
+
+    // Classify matrix elements:
+    // 0 = low (<=lowThreshold)
+    // 1 = ambiguous (>lowThreshold, <highThreshold)
+    // 2 = high (>=highThreshold)
+    array< array<uint64_t, 2>, 2> c;
+    total = 0;
+    for(uint64_t i=0; i<2; i++) {
+        for(uint64_t j=0; j<2; j++) {
+            const uint64_t matrixElement = m[i][j];
+            total += matrixElement;
+            uint64_t& classification = c[i][j];
+            if(matrixElement <= lowThreshold) {
+                classification = 0;
+            } else if(matrixElement >= highThreshold) {
+                classification = 2;
+            } else {
+                classification = 1;
+            }
+        }
+    }
+
+    // Check if this tangle matrix is unambiguously in phase.
+    if(c[0][0]==2 and c[1][1]==2 and c[0][1]==0 and c[1][0]==0) {
+        phase = +1;
+        minConcordant = min(m[0][0], m[1][1]);
+        maxDiscordant = max(m[0][1], m[1][0]);
+    }
+
+    // Check if this tangle matrix is unambiguously out of phase.
+    // In this case the discordant elements are the two diagonal elements,
+    // so both of them must contribute to maxDiscordant.
+    else if(c[0][1]==2 and c[1][0]==2 and c[0][0]==0 and c[1][1]==0) {
+        phase = -1;
+        minConcordant = min(m[0][1], m[1][0]);
+        maxDiscordant = max(m[0][0], m[1][1]);
+    }
+
+    // Otherwise, it is ambiguous.
+    else {
+        phase = 0;
+        minConcordant = 0;
+        maxDiscordant = 0;
+    }
+
+    tie(logPin, logPout) = diploidBayesianPhase(m, epsilon);
+}
+
+
+
+// Collapse consecutive haploid bubbles of a BubbleChain.
+bool BubbleChain::compress()
+{
+    BubbleChain& bubbleChain = *this;
+    BubbleChain newBubbleChain;
+
+    // If this bubble chain consists of a single bubble, there is nothing to compress.
+    if(size() == 1) {
+        return false;
+    }
+
+    // Look for pairs of consecutive haploid bubbles.
+    // If none found, return.
+    bool found = false;
+    for(uint64_t i1=1; i1<size(); i1++) {
+        const uint64_t i0 = i1 - 1;
+        const Bubble& bubble0 = bubbleChain[i0];
+        const Bubble& bubble1 = bubbleChain[i1];
+        if(bubble0.isHaploid() and bubble1.isHaploid()) {
+            found = true;
+            break;
+        }
+    }
+    if(not found) {
+        return false;
+    }
+
+
+
+    // Find sets of consecutive haploid bubbles.
+    for(uint64_t i=0; i<size(); i++) {
+        const Bubble& bubble = bubbleChain[i];
+
+        if(bubble.isHaploid()) {
+
+            // This bubble is haploid.
+            // If the last bubble of the new bubble chain is haploid, append
+            // the chain of this bubble to the chain of that bubble.
+            // Otherwise append this bubble to the new bubble chain.
+ if(not newBubbleChain.empty() and newBubbleChain.back().isHaploid()) { + const Chain& chain = bubble.front(); + Chain& newChain = newBubbleChain.back().front(); + copy(chain.begin()+1, chain.end(), back_inserter(newChain)); + } else { + newBubbleChain.push_back(bubble); + } + } else { + + // This bubble is not haploid. Just append it to the last bubble. + newBubbleChain.push_back(bubble); + } + + } + + // Replace it with the new one. + bubbleChain = newBubbleChain; + + return true; +} + + + +void AssemblyGraph::assembleChain( + Chain& chain, + uint64_t chainTerminalCommonThreshold) +{ + chain.stepSequences.resize(chain.size() - 1); + + // Do all the assembly steps. + for(uint64_t positionInChain=0; positionInChain<chain.size()-1; positionInChain++) { + runAssemblyStep(chain, positionInChain, chainTerminalCommonThreshold); + } + + combineStepSequences(chain); + chain.wasAssembled = true; +} + + + +// Multithreaded version of sequence assembly. +// This only assembles the chains that have the shouldBeAssembled flag set. +void AssemblyGraph::assembleChainsMultithreaded( + uint64_t chainTerminalCommonThreshold, + uint64_t threadCount) +{ + AssemblyGraph& assemblyGraph = *this; + + // Store the argument so the threads can see it. + assembleChainsMultithreadedData.chainTerminalCommonThreshold = chainTerminalCommonThreshold; + + // Gather AssemblySteps for all the Chains. + auto& assemblySteps = assembleChainsMultithreadedData.assemblySteps; + assemblySteps.clear(); + + // Loop over BubbleChains. + AssemblyStep assemblyStep; + BGL_FORALL_EDGES(e, assemblyGraph, AssemblyGraph) { + assemblyStep.e = e; + BubbleChain& bubbleChain = assemblyGraph[e]; + + // Loop over Bubbles in this BubbleChain. + for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) { + assemblyStep.positionInBubbleChain = positionInBubbleChain; + Bubble& bubble = bubbleChain[positionInBubbleChain]; + + // Loop over Chains in this Bubble. 
+ for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) { + assemblyStep.indexInBubble = indexInBubble; + Chain& chain = bubble[indexInBubble]; + SHASTA_ASSERT(chain.size() >= 2); + + // If this Chain is not marked to be assembled, skip it. + if(not chain.shouldBeAssembled) { + continue; + } + + // Prepare the vectors where the threads will store + // the internal sequence assembled for each AssemblyStep. + // Each of these vectors will be modified by only one thread. + chain.stepSequences.resize(chain.size() - 1); + + // Loop over pairs of consecutive vertices in this Chain. + for(uint64_t positionInChain=0; positionInChain<chain.size()-1; positionInChain++) { + assemblyStep.positionInChain = positionInChain; + + // Compute the offset. + const MarkerGraphEdgeId edgeIdA = chain[positionInChain]; + const MarkerGraphEdgeId edgeIdB = chain[positionInChain + 1]; + MarkerGraphEdgePairInfo info; + SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair( + edgeIdA, edgeIdB, info)); + assemblyStep.offsetInBases = info.offsetInBases; + + // Store this assembly step. + assemblySteps.push_back(assemblyStep); + } + } + } + } + + // For better load balancing, sort them by decreasing offset. + sort(assemblySteps.begin(), assemblySteps.end()); + + + + // Assemble the steps in parallel. + setupLoadBalancing(assemblySteps.size(), 1); + performanceLog << timestamp << "Sequence assembly begins." << endl; + runThreads(&AssemblyGraph::assembleChainsMultithreadedTheadFunction, threadCount); + performanceLog << timestamp << "Sequence assembly ends." << endl; + + + + // Now that all the AssemblySteps have been computed, the stepSequences + // of each Chain have been filled in. + // Combine those with the marker graph edge sequences to obtain the + // complete sequence of each chain. + // This can be parallelized. 
+ BGL_FORALL_EDGES(e, assemblyGraph, AssemblyGraph) { + assemblyStep.e = e; + BubbleChain& bubbleChain = assemblyGraph[e]; + + // Loop over Bubbles in this BubbleChain. + for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) { + assemblyStep.positionInBubbleChain = positionInBubbleChain; + Bubble& bubble = bubbleChain[positionInBubbleChain]; + + // Loop over Chains in this Bubble. + for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) { + assemblyStep.indexInBubble = indexInBubble; + Chain& chain = bubble[indexInBubble]; + if(chain.shouldBeAssembled) { + combineStepSequences(chain); + chain.wasAssembled = true; + } + } + } + } +} + + + +// This sets the shouldBeAssembled flag for all chains, then +// calls assembleChainsMultithreaded. +void AssemblyGraph::assembleAllChainsMultithreaded( + uint64_t chainTerminalCommonThreshold, + uint64_t threadCount) +{ + AssemblyGraph& assemblyGraph = *this; + + // Loop over all bubble chains. + BGL_FORALL_EDGES(e, assemblyGraph, AssemblyGraph) { + BubbleChain& bubbleChain = assemblyGraph[e]; + + // Loop over Bubbles in this BubbleChain. + for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) { + Bubble& bubble = bubbleChain[positionInBubbleChain]; + + // Loop over Chains in this Bubble. + for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) { + Chain& chain = bubble[indexInBubble]; + chain.shouldBeAssembled = true; + } + } + } + + assembleChainsMultithreaded(chainTerminalCommonThreshold, threadCount); +} + + + +// This clears the shouldBeAssembled flag from all Chains. +void AssemblyGraph::clearAllShouldBeAssembledFlags() +{ + AssemblyGraph& assemblyGraph = *this; + + // Loop over all bubble chains. + BGL_FORALL_EDGES(e, assemblyGraph, AssemblyGraph) { + BubbleChain& bubbleChain = assemblyGraph[e]; + + // Loop over Bubbles in this BubbleChain. 
+ for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) { + Bubble& bubble = bubbleChain[positionInBubbleChain]; + + // Loop over Chains in this Bubble. + for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) { + Chain& chain = bubble[indexInBubble]; + chain.shouldBeAssembled = false; + } + } + } + +} + + + +// Combine stepSequences of a Chain with the marker graph edge sequences to obtain the +// complete sequence of the chain. +// The loop reads chain[0] unconditionally, so the chain must not be empty, +// and chain.stepSequences is indexed once for every pair of consecutive edges. +void AssemblyGraph::combineStepSequences(Chain& chain) +{ + chain.sequence.clear(); + for(uint64_t positionInChain=0; /* Check later */ ; positionInChain++) { + + // Add the sequence for the marker graph primary edge. + const MarkerGraphEdgeId edgeId = chain[positionInChain]; + const auto edgeSequence = assembler.markerGraph.edgeSequence[edgeId]; + copy(edgeSequence.begin(), edgeSequence.end(), back_inserter(chain.sequence)); + + // If this was the last primary edge for the chain, we are done. + if(positionInChain == chain.size() - 1) { + break; + } + + // Add assembled sequence between this marker graph primary edge and the next in the chain. + const vector<Base>& stepSequence = chain.stepSequences[positionInChain].sequence; + copy(stepSequence.begin(), stepSequence.end(), back_inserter(chain.sequence)); + + } +} + + + +// This writes the details of sequence assembly for all Chains in the AssemblyGraph. +void AssemblyGraph::writeAssemblyDetails() const +{ + const AssemblyGraph& assemblyGraph = *this; + + // Open the csv file and write the header. + ofstream csv("AssemblyDetails-" + to_string(componentId) + ".csv"); + csv << "Chain,Component,Bubble chain,Position in bubble chain,Index in bubble," + "Position in chain,Type,Marker graph edge id," + "Assembly status,Length,Sequence begin,Sequence end,Coverage,Common\n"; + + // Loop over all bubble chains.
+ BGL_FORALL_EDGES(e, assemblyGraph, AssemblyGraph) { + const BubbleChain& bubbleChain = assemblyGraph[e]; + + // Loop over Bubbles in this BubbleChain. + for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) { + const Bubble& bubble = bubbleChain[positionInBubbleChain]; + + // Loop over Chains in this Bubble. + for(uint64_t indexInBubble=0; indexInBubble<bubble.size(); indexInBubble++) { + const Chain& chain = bubble[indexInBubble]; + SHASTA_ASSERT(chain.wasAssembled); + SHASTA_ASSERT(chain.stepSequences.size() == chain.size() - 1); + const string chainString = chainStringId(e, positionInBubbleChain, indexInBubble); + + // Loop over positions in this Chain. + uint64_t positionInSequence = 0; + for(uint64_t positionInChain=0; /* Check later */ ; positionInChain++) { + + // Write one line to csv with information about the sequence + // contributed by this marker graph primary edge. + { + const MarkerGraphEdgeId edgeId = chain[positionInChain]; + const uint64_t coverage = assembler.markerGraph.edgeMarkerIntervals[edgeId].size(); + const uint64_t edgeSequenceLength = assembler.markerGraph.edgeSequence[edgeId].size(); + const uint64_t beginInSequence = positionInSequence; + const uint64_t endInSequence = positionInSequence + edgeSequenceLength; + csv << chainString << ","; + csv << componentId << ","; + csv << assemblyGraph[e].id << ","; + csv << positionInBubbleChain << ","; + csv << indexInBubble << ","; + csv << positionInChain << ","; + csv << "E,"; + csv << edgeId << ",,"; + csv << edgeSequenceLength << ","; + csv << beginInSequence << ","; + csv << endInSequence << ","; + csv << coverage << ","; + csv << ","; + csv << "\n"; + positionInSequence = endInSequence; + } + + + // If this was the last primary edge for the chain, we are done.
+ if(positionInChain == chain.size() - 1) { + SHASTA_ASSERT(positionInSequence == chain.sequence.size()); + break; + } + + // Write one line to csv with information about the sequence + // contributed by the assembly step between this marker graph primary edge + // and the next in the chain. + { + const MarkerGraphEdgeId edgeId = chain[positionInChain]; + const MarkerGraphEdgeId nextEdgeId = chain[positionInChain + 1]; + const uint64_t commonCount = assembler.countCommonOrientedReadsUnsafe( + edgeId, nextEdgeId); + const auto& stepSequence = chain.stepSequences[positionInChain]; + const uint64_t stepSequenceLength = stepSequence.sequence.size(); + const bool success = stepSequence.success; + const uint64_t beginInSequence = positionInSequence; + const uint64_t endInSequence = positionInSequence + stepSequenceLength; + csv << chainString << ","; + csv << componentId << ","; + csv << assemblyGraph[e].id << ","; + csv << positionInBubbleChain << ","; + csv << indexInBubble << ","; + csv << ","; + csv << "S,"; + csv << ","; + csv << (success ? "Success," : "Failure,"); + csv << stepSequenceLength << ","; + csv << beginInSequence << ","; + csv << endInSequence << ","; + csv << ","; + csv << commonCount << ","; + csv << "\n"; + positionInSequence = endInSequence; + } + + } + } + } + } +} + + + +// Thread function for assembleChainsMultithreaded: runs every AssemblyStep +// in each batch obtained from getNextBatch. +void AssemblyGraph::assembleChainsMultithreadedTheadFunction(uint64_t threadId) +{ + const uint64_t chainTerminalCommonThreshold = assembleChainsMultithreadedData.chainTerminalCommonThreshold; + + // Loop over all batches assigned to this thread. + uint64_t begin, end; + while(getNextBatch(begin, end)) { + + // Loop over all assembly steps assigned to this batch.
+ for(uint64_t i=begin; i!=end; ++i) { + const auto& assemblyStep = assembleChainsMultithreadedData.assemblySteps[i]; + runAssemblyStep(chainTerminalCommonThreshold, assemblyStep); + } + } +} + + + +void AssemblyGraph::runAssemblyStep( + uint64_t chainTerminalCommonThreshold, + const AssemblyStep& assemblyStep) +{ + AssemblyGraph& assemblyGraph = *this; + + // Get the BubbleChain. + BubbleChain& bubbleChain = assemblyGraph[assemblyStep.e]; + + // Get the Bubble. + Bubble& bubble = bubbleChain[assemblyStep.positionInBubbleChain]; + + // Get the Chain. + Chain& chain = bubble[assemblyStep.indexInBubble]; + SHASTA_ASSERT(chain.size() >= 2); + + // Do it. + runAssemblyStep(chain, assemblyStep.positionInChain, chainTerminalCommonThreshold); +} + + + +void AssemblyGraph::runAssemblyStep( + Chain& chain, + uint64_t positionInChain, + uint64_t chainTerminalCommonThreshold) +{ + + // Find the MarkerGraphEdgeIds for this local assembly. + const MarkerGraphEdgeId edgeIdA = chain[positionInChain]; + const MarkerGraphEdgeId edgeIdB = chain[positionInChain + 1]; + + // Suppress html output from LocalAssembly. + ostream html(0); + + + + // Figure out if we should use the oriented reads on edgeIdA and edgeIdB. + bool useA = true; + bool useB = true; + // For chains of length 2, we leave useA and useB set to true. + // For the usual case of longer chains, there is more checking. + if(chain.size() != 2) { + + // If we are at the beginning or end of the chain, we need to check + // the number of common oriented reads. + MarkerGraphEdgePairInfo info; + if((positionInChain == 0) or (positionInChain == chain.size() - 2)) { + SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(edgeIdA, edgeIdB, info)); + } + + // If this is the first step of the Chain, we want to set useA to false + // to avoid using reads that don't belong. But we only do it + // if this leaves us with enough reads to assemble. 
+ if(positionInChain == 0) { + if(info.common >= chainTerminalCommonThreshold) { + useA = false; + } + } + + // If this is the last step of the Chain, we want to set useB to false + // to avoid using reads that don't belong. But we only do it + // if this leaves us with enough reads to assemble. + else if(positionInChain == chain.size() - 2) { + if(info.common >= chainTerminalCommonThreshold) { + useB = false; + } + } + } + + + + // Do the local assembly between these two MarkerGraphEdgeIds. + auto& stepSequence = chain.stepSequences[positionInChain]; + try { + LocalAssembly localAssembly(assembler, edgeIdA, edgeIdB, 0, html, options.localAssemblyOptions, useA, useB); + localAssembly.getSecondarySequence(stepSequence.sequence); + stepSequence.success = true; + } catch (...) { + // The local assembly failed. + // The sequence is empty and the success flag is false. + stepSequence.sequence.clear(); + stepSequence.success = false; + std::lock_guard<std::mutex> lock(mutex); + cout << "Error occurred in local assembly between marker graph edges " << + edgeIdA << " and " << edgeIdB << endl; + throw; + } +} + + + +// Make a copy of an edge, truncating it at its end by removing the last MarkerGraphEdgeId. +// Return the target vertex of the newly created edge. +// The last bubble of the bubble chain of the given edge must be haploid. +// If the bubble chain consists of just a single haploid bubble with a chain of length 2, +// no new edge is created, and this simply returns the source vertex of the given edge. +AssemblyGraph::vertex_descriptor + AssemblyGraph::cloneAndTruncateAtEnd(edge_descriptor ce) +{ + AssemblyGraph& cGraph = *this; + const AssemblyGraphEdge& edge = cGraph[ce]; + const vertex_descriptor cv0 = source(ce, cGraph); + const BubbleChain& bubbleChain = cGraph[ce]; + + // Sanity checks. 
+ SHASTA_ASSERT(not bubbleChain.empty()); + SHASTA_ASSERT(bubbleChain.lastBubble().isHaploid()); + + + + // Case where the bubble chain consists of a single bubble, which must be haploid, + // that is, consist of a single chain. + if(bubbleChain.size() == 1) { + const Bubble& bubble = bubbleChain.lastBubble(); + SHASTA_ASSERT(bubble.isHaploid()); + const Chain& chain = bubble.front(); + SHASTA_ASSERT(chain.size() > 1); + + // If the Chain has length 2, we can't truncate it. + // So we don't create a new edge, and instead just return cv0. + // Detangling code will connect there, as prescribed by the tangle matrix. + if(chain.size() == 2) { + return cv0; + } + + // Create the new edge, without adding it to the graph for now. + AssemblyGraphEdge newEdge = edge; + newEdge.id = nextEdgeId++; + BubbleChain& newBubbleChain = newEdge; + SHASTA_ASSERT(newBubbleChain.size() == 1); + Bubble& newBubble = newBubbleChain.lastBubble(); + SHASTA_ASSERT(newBubble.isHaploid()); + Chain& newChain = newBubble.front(); + SHASTA_ASSERT(chain.size() > 2); + newChain.pop_back(); // Remove the last MarkerGraphEdgeId. + + // Add it to the graph. + // It will be dangling at its end. + // Detangling code will later connect it as prescribed by the tangle matrix. + const vertex_descriptor cv2 = createVertex(newBubbleChain.lastMarkerGraphEdgeId()); + add_edge(cv0, cv2, newEdge, cGraph); + return cv2; + } + + + + // Case where the bubble chain consists of more than one bubble. + else { + const Bubble& lastBubble = bubbleChain.lastBubble(); + SHASTA_ASSERT(lastBubble.isHaploid()); + const Chain& lastChain = lastBubble.front(); + SHASTA_ASSERT(lastChain.size() > 1); + + // Create the new edge, without adding it to the graph for now.
+ AssemblyGraphEdge newEdge = edge; + newEdge.id = nextEdgeId++; + BubbleChain& newBubbleChain = newEdge; + SHASTA_ASSERT(newBubbleChain.size() > 1); + Bubble& newLastBubble = newBubbleChain.lastBubble(); + SHASTA_ASSERT(newLastBubble.isHaploid()); + Chain& newLastChain = newLastBubble.front(); + + // If the last chain has length 2, just remove the last bubble from newBubbleChain. + // Otherwise, remove the last MarkerGraphEdgeId from the lastChain. + if(newLastChain.size() == 2) { + newBubbleChain.pop_back(); + } else { + newLastChain.pop_back(); + } + + // Add it to the graph. + // It will be dangling at its end. + // Detangling code will later connect it as prescribed by the tangle matrix. + const vertex_descriptor cv2 = createVertex(newBubbleChain.lastMarkerGraphEdgeId()); + add_edge(cv0, cv2, newEdge, cGraph); + return cv2; + } + +} + + + + + +// Make a copy of an edge, truncating it at its beginning by removing the first MarkerGraphEdgeId. +// Return the source vertex of the newly created edge. +// The first bubble of the bubble chain of the given edge must be haploid. +// If the bubble chain consists of just a single haploid bubble with a chain of length 2, +// no new edge is created, and this simply returns the target vertex of the given edge. +AssemblyGraph::vertex_descriptor + AssemblyGraph::cloneAndTruncateAtBeginning(edge_descriptor ce) +{ + AssemblyGraph& cGraph = *this; + const AssemblyGraphEdge& edge = cGraph[ce]; + const vertex_descriptor cv1 = target(ce, cGraph); + const BubbleChain& bubbleChain = cGraph[ce]; + + // Sanity checks. + SHASTA_ASSERT(not bubbleChain.empty()); + SHASTA_ASSERT(bubbleChain.firstBubble().isHaploid()); + + + + // Case where the bubble chain consists of a single bubble, which must be haploid, + // that is, consist of a single chain.
+ if(bubbleChain.size() == 1) { + const Bubble& bubble = bubbleChain.firstBubble(); + SHASTA_ASSERT(bubble.isHaploid()); + const Chain& chain = bubble.front(); + SHASTA_ASSERT(chain.size() > 1); + + // If the Chain has length 2, we can't truncate it. + // So we don't create a new edge, and instead just return cv1. + // Detangling code will connect there, as prescribed by the tangle matrix. + if(chain.size() == 2) { + return cv1; + } + + // Create the new edge, without adding it to the graph for now. + AssemblyGraphEdge newEdge = edge; + newEdge.id = nextEdgeId++; + BubbleChain& newBubbleChain = newEdge; + SHASTA_ASSERT(newBubbleChain.size() == 1); + Bubble& newBubble = newBubbleChain.firstBubble(); + SHASTA_ASSERT(newBubble.isHaploid()); + Chain& newChain = newBubble.front(); + SHASTA_ASSERT(chain.size() > 2); + newChain.erase(newChain.begin()); // Remove the first MarkerGraphEdgeId. + + // Add it to the graph. + // It will be dangling at its beginning. + // Detangling code will later connect it as prescribed by the tangle matrix. + const vertex_descriptor cv2 = createVertex(newBubbleChain.firstMarkerGraphEdgeId()); + add_edge(cv2, cv1, newEdge, cGraph); + return cv2; + } + + + + // Case where the bubble chain consists of more than one bubble. + else { + const Bubble& firstBubble = bubbleChain.firstBubble(); + SHASTA_ASSERT(firstBubble.isHaploid()); + const Chain& firstChain = firstBubble.front(); + SHASTA_ASSERT(firstChain.size() > 1); + + // Create the new edge, without adding it to the graph for now. + AssemblyGraphEdge newEdge = edge; + newEdge.id = nextEdgeId++; + BubbleChain& newBubbleChain = newEdge; + SHASTA_ASSERT(newBubbleChain.size() > 1); + Bubble& newFirstBubble = newBubbleChain.firstBubble(); + SHASTA_ASSERT(newFirstBubble.isHaploid()); + Chain& newFirstChain = newFirstBubble.front(); + + // If the first chain has length 2, just remove the first bubble from newBubbleChain. + // Otherwise, remove the first MarkerGraphEdgeId from the first chain.
+ if(newFirstChain.size() == 2) { + newBubbleChain.erase(newBubbleChain.begin()); + } else { + newFirstChain.erase(newFirstChain.begin()); + } + + // Add it to the graph. + // It will be dangling at its beginning. + // Detangling code will later connect it as prescribed by the tangle matrix. + const vertex_descriptor cv2 = createVertex(newBubbleChain.firstMarkerGraphEdgeId()); + add_edge(cv2, cv1, newEdge, cGraph); + return cv2; + } + +} + + +// Create a new edge connecting cv0 and cv1. +// The new edge will consist of a simple BubbleChain with a single +// haploid Bubble with a Chain of length 2. +AssemblyGraph::edge_descriptor AssemblyGraph::connect(vertex_descriptor cv0, vertex_descriptor cv1) +{ + AssemblyGraph& cGraph = *this; + + edge_descriptor ceNew; + tie(ceNew, ignore) = add_edge(cv0, cv1, cGraph); + AssemblyGraphEdge& newEdge = cGraph[ceNew]; + newEdge.id = nextEdgeId++; + BubbleChain& newBubbleChain = newEdge; + + // The new BubbleChain consists of a single Bubble. + newBubbleChain.resize(1); + Bubble& bubble = newBubbleChain.front(); + + // The new Bubble is haploid, that is, consists of a single Chain. + bubble.resize(1); + + // The new Chain consists of just the two MarkerGraphEdgeIds + // corresponding to cv0 and cv1. + Chain& chain = bubble.front(); + chain.push_back(cGraph[cv0].edgeId); + chain.push_back(cGraph[cv1].edgeId); + + return ceNew; + +} + + + +// Write this AssemblyGraph to fileName as a Boost binary archive. +// NOTE(review): the stream is opened without std::ios::binary; Boost.Serialization +// recommends binary mode for binary archives - confirm whether non-POSIX platforms matter. +void AssemblyGraph::save(const string& fileName) const +{ + ofstream file(fileName); + boost::archive::binary_oarchive archive(file); + archive << *this; +} + + + +// Read this AssemblyGraph from a Boost binary archive stored in fileName. +// NOTE(review): same text-mode stream caveat as in save - confirm. +void AssemblyGraph::load(const string& fileName) +{ + ifstream file(fileName); + boost::archive::binary_iarchive archive(file); + archive >> *this; +} + + + +// Optimize chains before assembly, to remove assembly steps with +// less than minCommon reads.
+// This calls optimizeChain on every Chain of every Bubble of every BubbleChain, +// forwarding debug, minCommon and k unchanged. +void AssemblyGraph::optimizeChains( + bool debug, + uint64_t minCommon, + uint64_t k) +{ + AssemblyGraph& cGraph = *this; + + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + BubbleChain& bubbleChain = cGraph[ce]; + + for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) { + Bubble& bubble = bubbleChain[positionInBubbleChain]; + const uint64_t ploidy = bubble.size(); + + for(uint64_t indexInBubble=0; indexInBubble<ploidy; indexInBubble++) { + Chain& chain = bubble[indexInBubble]; + SHASTA_ASSERT(chain.size() >= 2); + + if(debug) { + cout << "Optimizing chain " << chainStringId(ce, positionInBubbleChain, indexInBubble) << endl; + } + optimizeChain(debug, chain, minCommon, k); + } + } + } + +} + + + +// Optimize a chain before assembly, to remove assembly steps with +// less than minCommon reads. +// For each step with fewer than minCommon common oriented reads, candidate skip edges +// are tried from up to k chain positions before the step to up to k positions after it. +// The chain must have at least 2 entries (asserted below). +void AssemblyGraph::optimizeChain( + bool debug, + Chain& chain, + uint64_t minCommon, + uint64_t k) +{ + if(debug) { + cout << "Optimizing a chain of length " << chain.size() << endl; + } + SHASTA_ASSERT(chain.size() >= 2); + + + // A directed graph describing the initial and final chains. + // Each vertex stores a MarkerGraphEdgeId. + // Each edge stores the number of common oriented reads.
+ class ChainGraphVertex { + public: + MarkerGraphEdgeId edgeId; + uint64_t immediateDominator = invalid<uint64_t>; + }; + class ChainGraphEdge { + public: + uint64_t commonCount; + bool keep = false; + }; + using ChainGraphBaseClass = boost::adjacency_list< + boost::listS, + boost::vecS, + boost::bidirectionalS, + ChainGraphVertex, + ChainGraphEdge>; + class ChainGraph : public ChainGraphBaseClass { + public: + }; + ChainGraph chainGraph; + + class PathInspector { + public: + PathInspector(ChainGraph& chainGraph, bool debug) : chainGraph(chainGraph), debug(debug) {} + ChainGraph& chainGraph; + bool debug; + using Path = vector<ChainGraph::edge_descriptor>; + Path bestPath; + uint64_t bestPathMinCommonCount = 0; + void operator()(const Path& path) + { + // Compute the minimum number of common oriented reads over edges of this path. + uint64_t minCommonCount = invalid<uint64_t>; + for(const ChainGraph::edge_descriptor e: path) { + minCommonCount = min(minCommonCount, chainGraph[e].commonCount); + } + + if(debug) { + cout << "Path with minCommonCount " << minCommonCount << ":"; + for(const ChainGraph::edge_descriptor e: path) { + cout << " " << source(e, chainGraph); + } + cout << " " << target(path.back(), chainGraph) << "\n"; + } + + // A Path is better if it has a higher minCommonCount or + // it has the same minCommonCount and is longer. + // + if( (minCommonCount > bestPathMinCommonCount) or + (minCommonCount == bestPathMinCommonCount and path.size() > bestPath.size())) { + bestPath = path; + bestPathMinCommonCount = minCommonCount; + } + } + + }; + + // Construct the initial ChainGraph. + + // Add the vertices. + // We are using vecS as the second template argument for ChainGraph, + // so positions in the chain are also vertex descriptors in the ChainGraph. + for(const MarkerGraphEdgeId edgeId: chain) { + add_vertex({edgeId}, chainGraph); + } + + // Add the edges that correspond to the initial Chain. 
+ for(uint64_t i1=1; i1<chain.size(); i1++) { + const uint64_t i0 = i1 - 1; + const MarkerGraphEdgeId edgeId0 = chainGraph[i0].edgeId; + const MarkerGraphEdgeId edgeId1 = chainGraph[i1].edgeId; + MarkerGraphEdgePairInfo info; + SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(edgeId0, edgeId1, info)); + add_edge(i0, i1, {info.common}, chainGraph); + } + + + + // Add edges that skip around any edges with less than minCommon common oriented reads. + uint64_t totalAddedEdgesCount = 0; + uint64_t totalRemovedEdgesCount = 0; + for(uint64_t i1=1; i1<chain.size(); i1++) { + const uint64_t i0 = i1 - 1; + ChainGraph::edge_descriptor e; + bool edgeWasFound = false; + tie(e, edgeWasFound) = edge(i0, i1, chainGraph); + SHASTA_ASSERT(edgeWasFound); + + // If this edge has enough common reads, don't do anything. + if(chainGraph[e].commonCount >= minCommon) { + continue; + } + + if(debug) { + cout << i0 << "->" << i1 << " " << chainGraph[i0].edgeId << "->" << chainGraph[i1].edgeId << + " has " << chainGraph[e].commonCount << " common oriented reads, adding edges to skip it." << endl; + } + + // Loop over pairs of predecessors of v0 and successors of v1. + uint64_t addedEdgesCount = 0; + const uint64_t j0First = (k < i0) ? (i0 - k) : 0; + const uint64_t j0Last = i0; + const uint64_t j1First = i1; + const uint64_t j1Last = min(i1 + k, chain.size() - 1); + for(uint64_t j0=j0First; j0<=j0Last; j0++) { + for(uint64_t j1=j1First; j1<=j1Last; j1++) { + if(j0==i0 and j1 == i1) { + // We already have the edge between v0 and v1. + continue; + } + MarkerGraphEdgePairInfo info; + SHASTA_ASSERT(assembler.analyzeMarkerGraphEdgePair(chainGraph[j0].edgeId, chainGraph[j1].edgeId, info)); + + // If the number of common reads is better than for e, add this edge. 
+ if(info.common > chainGraph[e].commonCount) { + add_edge(j0, j1, {info.common}, chainGraph); + ++addedEdgesCount; + if(debug) { + cout << " Added " << j0 << "->" << j1 << " " << chainGraph[j0].edgeId << "->" << chainGraph[j1].edgeId << + " with " << info.common << " common oriented reads." << endl; + } + } else { + if(debug) { + cout << "Found " << j0 << "->" << j1 << " " << chainGraph[j0].edgeId << "->" << chainGraph[j1].edgeId << + " with " << info.common << " common oriented reads." << endl; + + } + } + } + } + totalAddedEdgesCount += addedEdgesCount; + + // If we added any edges skipping e, we can remove e. + if(addedEdgesCount > 0) { + if(debug) { + cout << "Removed " << i0 << "->" << i1 << " " << chainGraph[i0].edgeId << "->" << chainGraph[i1].edgeId << + " with " << chainGraph[e].commonCount << " common oriented reads." << endl; + } + // DON'T REMOVE THE EDGE. THIS IS NECESSARY TO MAKE SURE WE + // STILL HAVE A PATH FROM THE ENTRANCE TO THE EXIT. + // boost::remove_edge(e, chainGraph); + // ++totalRemovedEdgesCount; + } else { + if(debug) { + cout << "Did not find any suitable replacement edges." << endl; + } + } + } + + + // If we did not add or remove any edges, leave this Chain alone. + if(totalAddedEdgesCount == 0) { + SHASTA_ASSERT(totalRemovedEdgesCount == 0); + if(debug) { + cout << "No edges were added or removed, so this Chain will be left unchanged." << endl; + } + return; + } + + if(debug) { + cout << "This chain will be optimized." << endl; + } + + + + // To find the optimized chain, we want to do path enumeration on the ChainGraph, + // looking for paths that only use edges with large numbers of common oriented reads. + // Specifically, we use as the new chain the path that maximizes the minimum + // number of common oriented reads encountered on edges along the path. + // For efficiency of the path enumeration, we compute a dominator tree + // for the ChainGraph, with entrance at the beginning of the chain. 
+ // The unique path on that tree from the entrance to the exit + // divides the graph in segments, and we can do path enumeration on one segment at a time. + shasta::lengauer_tarjan_dominator_tree(chainGraph, 0, + boost::get(&ChainGraphVertex::immediateDominator, chainGraph)); + + // The unique path on the dominator tree from the entrance to the exit. + vector<ChainGraph::vertex_descriptor> dominatorTreePath; + ChainGraph::vertex_descriptor v = chain.size() - 1; + while(true) { + dominatorTreePath.push_back(v); + if(v == 0) { + break; + } + v = chainGraph[v].immediateDominator; + if(v == invalid<uint64_t>) { + cout << "Assertion failure at " << v << endl; + } + SHASTA_ASSERT(v != invalid<uint64_t>); + } + if(debug) { + cout << "Dominator tree path length " << dominatorTreePath.size() << endl; + } + reverse(dominatorTreePath.begin(), dominatorTreePath.end()); + + if(false) { + cout << "Dominator tree path:" << endl; + for(uint64_t i=0; i<dominatorTreePath.size(); i++) { + const uint64_t v = dominatorTreePath[i]; + cout << i << "," << v << "," << chainGraph[v].edgeId << "\n"; + } + } + + + + // The dominator tree path divides the graph in segments, + // and we can do path enumeration on one segment at a time. + // For each segment we find the best path and mark the edges on that + // best path as to be kept in the final chain. + for(uint64_t i1=1; i1<dominatorTreePath.size(); i1++) { + const uint64_t i0 = i1 - 1; + const ChainGraph::vertex_descriptor v0 = dominatorTreePath[i0]; + const ChainGraph::vertex_descriptor v1 = dominatorTreePath[i1]; + + // Fast handling of the most common case. + if(v1 == v0+1 and out_degree(v0, chainGraph)==1 and in_degree(v1, chainGraph)==1) { + ChainGraph::edge_descriptor e; + bool edgeWasFound = true; + tie(e, edgeWasFound) = edge(v0, v1, chainGraph); + if(edgeWasFound) { + chainGraph[e].keep = true; + continue; + } + } + + // If getting here, we have to do path enumeration. 
+ if(debug) { + cout << "Starting path enumeration between " << v0 << " " << v1 << endl; + } + + // Enumerate paths starting at v0 and ending at v1. + PathInspector pathInspector(chainGraph, debug); + enumeratePathsBetween(chainGraph, v0, v1, pathInspector); + + if(debug) { + if(debug) { + cout << "The best path has minCommonCount " << pathInspector.bestPathMinCommonCount << ":"; + for(const ChainGraph::edge_descriptor e: pathInspector.bestPath) { + cout << " " << source(e, chainGraph); + } + cout << " " << target(pathInspector.bestPath.back(), chainGraph) << "\n"; + } + } + + // Mark as to be kept all edges on the best path. + for(const ChainGraph::edge_descriptor e: pathInspector.bestPath) { + chainGraph[e].keep = true; + } + } + + + // Remove all edges not marked to be kept. + vector<ChainGraph::edge_descriptor> edgesToBeRemoved; + BGL_FORALL_EDGES(e, chainGraph, ChainGraph) { + if(not chainGraph[e].keep) { + edgesToBeRemoved.push_back(e); + } + } + for(const ChainGraph::edge_descriptor e: edgesToBeRemoved) { + boost::remove_edge(e, chainGraph); + } + + // The remaining edges should form a path in the ChainGraph + // which defines the optimized Chain. + SHASTA_ASSERT(in_degree(0, chainGraph) == 0); + SHASTA_ASSERT(out_degree(0, chainGraph) == 1); + SHASTA_ASSERT(in_degree(chain.size()-1, chainGraph) == 1); + SHASTA_ASSERT(out_degree(chain.size()-1, chainGraph) == 0); + for(uint64_t i=1; i<chain.size()-1; i++) { + const uint64_t inDegree = in_degree(i, chainGraph); + const uint64_t outDegree = out_degree(i, chainGraph); + SHASTA_ASSERT( + (inDegree==1 and outDegree==1) or // In the new chain. + (inDegree==0 and outDegree==0) // Now isolated. + ); + } + + // Find the path from the entrance to the exit in the update ChainGraph. + vector<uint64_t> newPath; + v = 0; + while(true) { + newPath.push_back(v); + if(v == chain.size()-1) { + break; + } + + // Move forward. 
+ SHASTA_ASSERT(out_degree(v, chainGraph) == 1); + ChainGraph::out_edge_iterator it; + tie(it, ignore) = out_edges(v, chainGraph); + const ChainGraph::edge_descriptor e = *it; + v = target(e, chainGraph); + } + + // Sanity check that the path is moving forward. + for(uint64_t i=1; i<newPath.size(); i++) { + SHASTA_ASSERT(newPath[i] > newPath[i-1]); + } + + // Construct the new Chain. + chain.clear(); + chain.sequence.clear(); + for(const uint64_t v: newPath) { + chain.push_back(chainGraph[v].edgeId); + } + +} + + + +bool AssemblyGraph::removeSelfComplementaryEdges() +{ + AssemblyGraph& cGraph = *this; + + vector<edge_descriptor> edgesToBeRemoved; + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + const vertex_descriptor v0 = source(ce, cGraph); + const vertex_descriptor v1 = target(ce, cGraph); + const MarkerGraphEdgeId edgeId0 = cGraph[v0].edgeId; + const MarkerGraphEdgeId edgeId1 = cGraph[v1].edgeId; + + if(assembler.markerGraph.reverseComplementEdge[edgeId0] == edgeId1) { + SHASTA_ASSERT(assembler.markerGraph.reverseComplementEdge[edgeId1] == edgeId0); + edgesToBeRemoved.push_back(ce); + } + } + + for(const edge_descriptor ce: edgesToBeRemoved) { + boost::remove_edge(ce, cGraph); + } + + return not edgesToBeRemoved.empty(); +} + + + +// Split terminal haploid bubbles out of bubble chains, to facilitate detangling. +void AssemblyGraph::splitTerminalHaploidBubbles() +{ + AssemblyGraph& cGraph = *this; + + vector<edge_descriptor> allEdges; + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + allEdges.push_back(ce); + } + + for(const edge_descriptor e: allEdges) { + splitTerminalHaploidBubbles(e); + } +} + + + +void AssemblyGraph::splitTerminalHaploidBubbles(edge_descriptor ce) +{ + AssemblyGraph& cGraph = *this; + BubbleChain& bubbleChain = cGraph[ce]; + + // Skip trivial bubble chains consisting of a single bubble. + if(bubbleChain.size() < 2) { + return; + } + + // Access the first and last bubble in the bubble chain. 
+ // We already checked that the bubble chain has at least two bubbles, + // so these two are distinct. + const Bubble& firstBubble = bubbleChain.front(); + const Bubble& lastBubble = bubbleChain.back(); + + // Skip bubble chains consisting of two haploid bubbles. + // After compress() is called, there should be none of these. + if(bubbleChain.size() == 2 and firstBubble.isHaploid() and lastBubble.isHaploid()) { + return; + } + + // Figure out if we need to split the first or last bubble, or both. + bool splitFirstBubble = false; + bool splitLastBubble = false; + if(firstBubble.isHaploid()) { + splitFirstBubble = true; + } + if(lastBubble.isHaploid()) { + splitLastBubble = true; + } + if(splitFirstBubble and splitLastBubble) { + SHASTA_ASSERT(bubbleChain.size() > 2); + } + + // If there is nothing to do, we are done. + if(not (splitFirstBubble or splitLastBubble)) { + return; + } + + // The source and target vertices of the edge we are splitting. + const vertex_descriptor cv0 = source(ce, cGraph); + const vertex_descriptor cv1 = target(ce, cGraph); + vertex_descriptor cv2 = null_vertex(); + vertex_descriptor cv3 = null_vertex(); + + + + // Create a new edge with just the first bubble, if necessary. + if(splitFirstBubble) { + + // Get the target vertex for the new edge. + const Chain& firstChain = firstBubble.front(); + const MarkerGraphEdgeId markerGraphEdgeId2 = firstChain.back(); + cv2 = createVertex(markerGraphEdgeId2); + + // Add the new edge. + edge_descriptor eNew; + tie(eNew, ignore) = add_edge(cv0, cv2, cGraph); + AssemblyGraphEdge& newEdge = cGraph[eNew]; + newEdge.id = nextEdgeId++; + + // Copy the first bubble to the new edge. + newEdge.push_back(firstBubble); + + } + + + + // Create a new edge with just the last bubble, if necessary. + if(splitLastBubble) { + + // Get the source vertex for the new edge. 
+ const Chain& lastChain = lastBubble.front(); + const MarkerGraphEdgeId markerGraphEdgeId3 = lastChain.front(); + cv3 = createVertex(markerGraphEdgeId3); + + // Add the new edge. + edge_descriptor eNew; + tie(eNew, ignore) = add_edge(cv3, cv1, cGraph); + AssemblyGraphEdge& newEdge = cGraph[eNew]; + newEdge.id = nextEdgeId++; + + // Copy the last bubble to the new edge. + newEdge.push_back(lastBubble); + + } + + + + // Create a new edge for the rest of the bubble chain. + edge_descriptor eNew; + tie(eNew, ignore) = add_edge( + splitFirstBubble ? cv2 : cv0, + splitLastBubble ? cv3 : cv1, + cGraph); + AssemblyGraphEdge& newEdge = cGraph[eNew]; + newEdge.id = nextEdgeId++; + + // Copy the rest of the bubble chain to the new edge. + auto it0 = bubbleChain.begin(); + auto it1 = bubbleChain.end(); + if(splitFirstBubble) { + ++it0; + } + if(splitLastBubble) { + --it1; + } + copy(it0, it1, back_inserter(newEdge)); + + + // Now we can remove the old BubbleChain we just split. + boost::remove_edge(ce, cGraph); + +} + + + +// Bubble cleanup (all bubbles), with the purpose of eliminating most bubbles caused by errors. +uint64_t AssemblyGraph::cleanupBubbles( + bool debug, + uint64_t maxOffset, + uint64_t chainTerminalCommonThreshold, + uint64_t threadCount) +{ + AssemblyGraph& graph = *this; + performanceLog << timestamp << "AssemblyGraph::cleanupBubbles begins." << endl; + + + + // First, assemble sequence for all the chains of diploid bubbles with a small offset. + clearAllShouldBeAssembledFlags(); + BGL_FORALL_EDGES(e, graph, AssemblyGraph) { + BubbleChain& bubbleChain = graph[e]; + for(Bubble& bubble: bubbleChain) { + + // If this bubble is not diploid, skip it. + if(bubble.size() != 2) { + continue; + } + + // The bubble is diploid. Compute its maxOffset. 
+ uint64_t averageOffset; + uint64_t minOffset; + uint64_t bubbleMaxOffset; + const uint64_t offsetWasComputed = bubbleOffsetNoException( + bubble, averageOffset, minOffset, bubbleMaxOffset); + + // If the offset is large or could not be computed, we don't need to + // assemble this bubble. + if((not offsetWasComputed) or bubbleMaxOffset>maxOffset) { + continue; + } + + // We need to assemble the Chains of this bubble. + for(Chain& chain: bubble) { + chain.shouldBeAssembled = true; + } + } + } + assembleChainsMultithreaded(chainTerminalCommonThreshold, threadCount); + performanceLog << timestamp << "Sequence assembly for AssemblyGraph::cleanupBubbles ends." << endl; + + + + uint64_t removedCount = 0; + BGL_FORALL_EDGES(ce, graph, AssemblyGraph) { + removedCount += cleanupBubbles(debug, ce, maxOffset, chainTerminalCommonThreshold); + } + + performanceLog << timestamp << "AssemblyGraph::cleanupBubbles ends." << endl; + return removedCount; +} + + + +// Bubble cleanup for a bubble chain, with the purpose of eliminating most bubbles caused by errors. +uint64_t AssemblyGraph::cleanupBubbles(bool debug, edge_descriptor ce, + uint64_t maxOffset, uint64_t chainTerminalCommonThreshold) +{ + AssemblyGraph& cGraph = *this; + BubbleChain& bubbleChain = cGraph[ce]; + BubbleChain newBubbleChain; + + uint64_t removedCount = 0; + for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) { + Bubble& bubble = bubbleChain[positionInBubbleChain]; + + if(debug) { + cout << "cleanupBubbles working on Bubble " << bubbleStringId(ce, positionInBubbleChain) << + " ploidy " << bubble.size() << endl; + cout << "Entrance " << bubble.front().front() << ", exit " << bubble.front().back() << endl; + } + + bool keepBubble = false; + + if(bubble.isHaploid()) { + + // The bubble is haploid. Keep it. + keepBubble = true; + + if(debug) { + cout << "Keeping this bubble because it is haploid." << endl; + } + + } else { + + // The bubble is not haploid. 
Compute its maxOffset. + uint64_t averageOffset; + uint64_t minOffset; + uint64_t bubbleMaxOffset; + const bool offsetWasComputed = bubbleOffsetNoException(bubble, averageOffset, minOffset, bubbleMaxOffset); + + if((not offsetWasComputed) or bubbleMaxOffset>maxOffset) { + + // The bubble is not haploid but the offset is large. Keep it. + keepBubble = true; + + if(debug) { + cout << "Keeping this bubble because it is not haploid but its offset is large." << endl; + } + + } else { + + // The bubble is not haploid and has a small offset. + + if(bubble.size() > 2) { + + // The bubble has a small offset and ploidy greater than 2. Remove it. + keepBubble = false; + + if(debug) { + cout << "Removing this bubble because it has a small offset and ploidy greater than 2." << endl; + } + + } else { + + // The bubble has a small offset and ploidy 2. + // Check that we assembled the sequence of its two sides. + for(Chain& chain: bubble) { + SHASTA_ASSERT(chain.wasAssembled); + } + + if(debug) { + for(uint64_t indexInBubble=0; indexInBubble<2; indexInBubble++) { + const auto& sequence = bubble[indexInBubble].sequence; + cout << ">" << chainStringId(ce, positionInBubbleChain, indexInBubble) << + " " << sequence.size() << "\n"; + copy(sequence.begin(), sequence.end(), ostream_iterator<shasta::Base>(cout)); + cout << "\n"; + } + } + if(bubble[0].sequence == bubble[1].sequence) { + keepBubble = false; + if(debug) { + cout << "The two sides have identical sequence." << endl; + } + } else { + + // Figure out if they differ by a copy number of short periodicity. + const uint64_t period = isCopyNumberDifference(bubble[0].sequence, bubble[1].sequence, 4); + if(debug) { + cout << "Period " << period << "\n"; + } + keepBubble = (period == 0); + } + } + } + + + } + + if(keepBubble) { + newBubbleChain.push_back(bubble); + if(debug) { + cout << "Kept this bubble." 
<< endl; + } + } else { + // Remove the bubble and replace it with a haploid bubble + // consisting of only the terminal MarkerGraphEdgeIds. + Chain newChain; + newChain.push_back(bubble.front().front()); + newChain.push_back(bubble.front().back()); + Bubble newBubble; + newBubble.push_back(newChain); + newBubbleChain.push_back(newBubble); + ++removedCount; + if(debug) { + cout << "Removed this bubble." << endl; + } + } + } + + bubbleChain.swap(newBubbleChain); + return removedCount; +} + + + +// This finds squares of the form: +// A->B +// A->B' +// B->A' +// B'->A' +// where a prime sign indicates reverse complementing. +// It then removes one of two pairs of self-complementary edges: +// A->B and B'->A' +// or +// A->B' and B->A' +// The pair to be removed is selected in such a way that its removal +// does not introduce any dead ends. +// The code uses the following names: +// A0 = A +// A1 = A' +// B0 = B +// B1 = B' +void AssemblyGraph::removeSelfComplementarySquares() +{ + AssemblyGraph& cGraph = *this; + const bool debug = true; + + vector< pair<edge_descriptor, vertex_descriptor> > outEdgesA0; + + + // Do this iteratively. + while(true) { + + + // Loop over all possible choices for A0. + bool done = false; + BGL_FORALL_VERTICES(A0, cGraph, AssemblyGraph) { + + // Gather the children of A. + outEdgesA0.clear(); + BGL_FORALL_OUTEDGES(A0, ce, cGraph, AssemblyGraph) { + outEdgesA0.push_back({ce, target(ce, cGraph)}); + } + + // Look for a reverse complementary pair (B0, B1) + // with edges B0->A1 and B1->A1. + for(uint64_t i1=0; i1<outEdgesA0.size(); i1++) { + const vertex_descriptor B1 = outEdgesA0[i1].second; + const uint64_t edgeIdB1 = cGraph[B1].edgeId; + const uint64_t edgeIdB0 = assembler.markerGraph.reverseComplementEdge[edgeIdB1]; + for(uint64_t i0=0; i0<i1; i0++) { + const vertex_descriptor B0 = outEdgesA0[i0].second; + if(cGraph[B0].edgeId == edgeIdB0) { + + // We found it. + + // Look for the edges B0->A1 and B1->A1.
+ const uint64_t edgeIdA0 = cGraph[A0].edgeId; + const uint64_t edgeIdA1 = assembler.markerGraph.reverseComplementEdge[edgeIdA0]; + + edge_descriptor B0A1; + vertex_descriptor A10 = null_vertex(); + BGL_FORALL_OUTEDGES(B0, ce, cGraph, AssemblyGraph) { + const vertex_descriptor v = target(ce, cGraph); + if(cGraph[v].edgeId == edgeIdA1) { + B0A1 = ce; + A10 = v; + break; + } + } + if(A10 == null_vertex()) { + continue; + } + + edge_descriptor B1A1; + vertex_descriptor A11 = null_vertex(); + BGL_FORALL_OUTEDGES(B1, ce, cGraph, AssemblyGraph) { + const vertex_descriptor v = target(ce, cGraph); + if(cGraph[v].edgeId == edgeIdA1) { + B1A1 = ce; + A11 = v; + break; + } + } + if(A11 == null_vertex()) { + continue; + } + + if(A10 != A11) { + continue; + } + const vertex_descriptor A1 = A10; + + // We found a self-complementary square. + const edge_descriptor A0B0 = outEdgesA0[i0].first; + const edge_descriptor A0B1 = outEdgesA0[i1].first; + + if(debug) { + cout << "Found a self-complementary square:\n" << + cGraph[A0].edgeId << " " << + cGraph[B0].edgeId << " " << + cGraph[B1].edgeId << " " << + cGraph[A1].edgeId << "\n" << + bubbleChainStringId(A0B0) << " " << + bubbleChainStringId(A0B1) << " " << + bubbleChainStringId(B0A1) << " " << + bubbleChainStringId(B1A1) << "\n"; + } + + // Remove two of the edges in the square, + // making sure to not introduce dead ends. + if(out_degree(A0, cGraph) > 1 and in_degree(A1, cGraph) > 1) { + if(in_degree (B0, cGraph) > 1 and out_degree(B1, cGraph) > 1) { + boost::remove_edge(A0B0, cGraph); + boost::remove_edge(B1A1, cGraph); + done = true; + } else if(in_degree(B1, cGraph) > 1 and out_degree(B0, cGraph) > 1) { + boost::remove_edge(A0B1, cGraph); + boost::remove_edge(B0A1, cGraph); + done = true; + } + } + + if(done) { + break; + } + } + + if(done) { + break; + } + + } + if(done) { + break; + } + } + if(done) { + break; + } + } + + // If nothing happened, stop the outer iteration. 
+ if(not done) { + break; + } + } +} diff --git a/src/mode3-AssemblyGraph.hpp b/src/mode3-AssemblyGraph.hpp new file mode 100644 index 0000000..2468451 --- /dev/null +++ b/src/mode3-AssemblyGraph.hpp @@ -0,0 +1,886 @@ +#pragma once + +// Shasta +#include "Base.hpp" +#include "invalid.hpp" +#include "mode3-PhasedComponent.hpp" +#include "MultithreadedObject.hpp" +#include "shastaTypes.hpp" +#include "SHASTA_ASSERT.hpp" + +// Boost libraries. +#include <boost/graph/adjacency_list.hpp> +#include <boost/serialization/vector.hpp> + +// Standard library +#include "algorithm.hpp" +#include "array.hpp" +#include <map> +#include "memory.hpp" +#include "fstream.hpp" +#include "string.hpp" +#include "utility.hpp" +#include "vector.hpp" + + + +namespace shasta { + namespace mode3 { + + // Each edge of the CompressedPathGraph describes a BubbleChain. + + // A Chain is a sequence of MarkerGraphEdgeIds. + class Chain; + + // A Bubble is a set of Chains that begin and end at the same MarkerGraphEdgeId. + // It can consist of one or more Chains. + class Bubble; + + // A BubbleChain is a sequence of Bubbles. + class BubbleChain; + + class AssemblyGraph; + class AssemblyGraphVertex; + class AssemblyGraphEdge; + using AssemblyGraphBaseClass = boost::adjacency_list< + boost::listS, + boost::listS, + boost::bidirectionalS, + AssemblyGraphVertex, + AssemblyGraphEdge>; + + class PrimaryGraph; + } + class Assembler; + class Mode3AssemblyOptions; + class OrientedReadId; +} + + + +// A Chain is a sequence of MarkerGraphEdgeIds. +class shasta::mode3::Chain : public vector<MarkerGraphEdgeId> { +public: + + // Flag used to indicate that this Chain needs to be assembled. + // Used by assembleChainsMultithreaded. + bool shouldBeAssembled = false; + bool wasAssembled = false; + + // Assembled sequence, including the sequence of the first and + // last primary marker graph edges. 
+ vector<Base> sequence; + + // The internal sequence assembled between consecutive pairs + // of MarkerGraphEdgeIds in the chain. + // If a local assembly fails, the success flag remains false and the sequence remains empty. + class StepSequence { + public: + vector<Base> sequence; + bool success = false; + }; + vector<StepSequence> stepSequences; + + + MarkerGraphEdgeId second() const + { + SHASTA_ASSERT(size() > 1); + return (*this)[1]; + } + MarkerGraphEdgeId secondToLast() const + { + SHASTA_ASSERT(size() > 1); + return (*this)[size() - 2]; + } + + template<class Archive> void serialize(Archive & ar, const unsigned int version) + { + ar & boost::serialization::base_object< vector<MarkerGraphEdgeId> >(*this); + } +}; + + + +class shasta::mode3::Bubble : public vector<Chain> { +public: + bool isHaploid() const + { + return size() == 1; + } + bool isDiploid() const + { + return size() == 2; + } + bool isGeneral() const + { + return size() > 2; + } + + // Remove duplicate chains. + void deduplicate(); + + template<class Archive> void serialize(Archive & ar, const unsigned int version) + { + ar & boost::serialization::base_object< vector<Chain> >(*this); + } +}; + + + +class shasta::mode3::BubbleChain : public vector<Bubble> { +public: + const Bubble& firstBubble() const + { + SHASTA_ASSERT(not empty()); + return front(); + } + Bubble& firstBubble() + { + SHASTA_ASSERT(not empty()); + return front(); + } + const Bubble& lastBubble() const + { + SHASTA_ASSERT(not empty()); + return back(); + } + Bubble& lastBubble() + { + SHASTA_ASSERT(not empty()); + return back(); + } + + uint64_t diploidBubbleCount() const + { + uint64_t n = 0; + for(const Bubble& bubble: *this) { + if(bubble.isDiploid()) { + ++n; + } + } + return n; + } + + // This returns true if this superbubble consists of a single haploid bubble. 
+ bool isSimpleChain() const + { + return size() == 1 and firstBubble().isHaploid(); + } + Chain& getOnlyChain() + { + SHASTA_ASSERT(isSimpleChain()); + return firstBubble().front(); + } + + // Collapse consecutive haploid bubbles. + bool compress(); + + MarkerGraphEdgeId firstMarkerGraphEdgeId() const + { + SHASTA_ASSERT(not empty()); + const Bubble& firstBubble = front(); + const MarkerGraphEdgeId markerGraphEdgeId = firstBubble.front().front(); + for(const Chain& chain: firstBubble) { + SHASTA_ASSERT(chain.front() == markerGraphEdgeId); + } + return markerGraphEdgeId; + } + + MarkerGraphEdgeId lastMarkerGraphEdgeId() const + { + SHASTA_ASSERT(not empty()); + const Bubble& lastBubble = back(); + const MarkerGraphEdgeId markerGraphEdgeId = lastBubble.front().back(); + for(const Chain& chain: lastBubble) { + SHASTA_ASSERT(chain.back() == markerGraphEdgeId); + } + return markerGraphEdgeId; + } + + // Return the total length of this bubble chain. + uint64_t totalLength() const; + + + template<class Archive> void serialize(Archive & ar, const unsigned int version) + { + ar & boost::serialization::base_object< vector<Bubble> >(*this); + } + +}; + + + +class shasta::mode3::AssemblyGraphVertex { +public: + MarkerGraphEdgeId edgeId; + + // Numbering of vertices consecutively starting at zero. + // This is computed by numberVertices, and becomes + // invalid as soon as a vertex is added or removed. + uint64_t index = invalid<uint64_t>; + + // The id of the Superbubble this vertex belongs to, if any. + // Stored by class Superbubbles.
+ uint64_t superbubbleId = invalid<uint64_t>; + + template<class Archive> void serialize(Archive & ar, const unsigned int version) + { + ar & edgeId; + } +}; + + + +class shasta::mode3::AssemblyGraphEdge : public BubbleChain { +public: + uint64_t id = invalid<uint64_t>; + + template<class Archive> void serialize(Archive & ar, const unsigned int version) + { + ar & boost::serialization::base_object<BubbleChain>(*this); + ar & id; + } +}; + + + +class shasta::mode3::AssemblyGraph: + public AssemblyGraphBaseClass, + public MultithreadedObject<shasta::mode3::AssemblyGraph> { +public: + + // Create from a connected component of the PrimaryGraph, then call run. + AssemblyGraph( + const PrimaryGraph&, + uint64_t componentId, + const Assembler&, + uint64_t threadCount, + const Mode3AssemblyOptions& options, + bool assembleSequence, + bool debug); + + // Load it from a binary archive, then call run. + AssemblyGraph( + const string& fileName, + const Assembler&, + uint64_t threadCount, + const Mode3AssemblyOptions& options, + bool assembleSequence, + bool debug); + +private: + + // Hide Base defined by the base class. + using Base = shasta::Base; + + // Information stored by the constructor. + uint64_t componentId; + const Assembler& assembler; + const Mode3AssemblyOptions& options; + + friend class boost::serialization::access; + template<class Archive> void serialize(Archive & ar, const unsigned int version) + { + ar & boost::serialization::base_object<AssemblyGraphBaseClass>(*this); + ar & componentId; + ar & nextEdgeId; + } + void save(const string& fileName) const; + void load(const string& fileName); + + void run( + uint64_t threadCount, + bool assembleSequence, + bool debug); + + + + // Initial creation from the PrimaryGraph. + // Each linear chain of edges in the PrimaryGraph after transitive reduction generates + // an AssemblyGraphEdge (BubbleChain) consisting of a single haploid bubble. 
+ void create(const PrimaryGraph&, bool debug); + uint64_t nextEdgeId = 0; + void renumberEdges(); + + // Return the vertex corresponding to a given MarkerGraphEdgeId, + // creating it if it is not in the given vertexMap. + // This is only used in create(). + vertex_descriptor getVertex( + MarkerGraphEdgeId, + std::map<MarkerGraphEdgeId, vertex_descriptor>& vertexMap + ); + + // Create a new vertex with a given MarkerGraphEdgeId. + vertex_descriptor createVertex(MarkerGraphEdgeId); + + void removeVertex(vertex_descriptor); + + // Compute vertexIndex for every vertex. + // This numbers vertices consecutively starting at zero. + // This numbering becomes invalid as soon as a vertex is added or removed. + void numberVertices(); + void clearVertexNumbering(); + + // Create a new edge connecting cv0 and cv1. + // The new edge will consist of a simple BubbleChain with a single + // haploid Bubble with a Chain of length 2. + edge_descriptor connect(vertex_descriptor cv0, vertex_descriptor cv1); + + // Compress parallel edges into bubbles, where possible. + bool compressParallelEdges(); + + // Compress linear sequences of edges (BubbleChains) into longer BubbleChains. + bool compressSequentialEdges(); + + // Call compress on all BubbleChains to merge adjacent haploid bubbles. + bool compressBubbleChains(); + + // Call compressParallelEdges, compressSequentialEdges, and compressBubbleChains + // iteratively until nothing changes. + bool compress(); + + // This does the opposite of compress. All bubble chains that + // consist of more than one simple haploid bubble are expanded into one + // edge for each edge of each bubble. + // For optimal results it is best to call compressBubbleChains before expand. + void expand(); + + // Compute the tangle matrix given in-edges and out-edges. + // The last bubble of each in-edge and the first bubble + // of each out-edge must be haploid. 
+ void computeTangleMatrix( + const vector<edge_descriptor>& inEdges, + const vector<edge_descriptor>& outEdges, + vector< vector<uint64_t> >& tangleMatrix, + bool setToZeroForComplementaryPairs + ) const; + + // Low level primitives used in detangling. + // See the implementation for details. + vertex_descriptor cloneAndTruncateAtEnd(edge_descriptor); + vertex_descriptor cloneAndTruncateAtBeginning(edge_descriptor); + + // Vertex detangling. + // bool detangleVerticesStrict(bool debug); + // bool detangleVertexStrict(vertex_descriptor, bool debug); + bool detangleVertices(bool debug, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP); + bool detangleVertex( + vertex_descriptor, + bool debug, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP); + + // Vertex detangling that can deal with non-haploid bubbles adjacent to the + // vertex to be detangled. + bool detangleVerticesGeneral(bool debug, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP); + bool detangleVertexGeneral( + vertex_descriptor, + bool debug, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP); + + // Split the first/last bubble of a bubble chain. + // Used by detangleVertexGeneral to eliminate + // non-haploid bubble adjacent to a vertex to be detangled. + void splitBubbleChainAtBeginning(edge_descriptor); + void splitBubbleChainAtEnd(edge_descriptor); + + + // Edge detangling. 
+ bool detangleEdges( + bool debug, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP); + bool detangleEdge( + bool debug, + std::map<uint64_t, edge_descriptor>& edgeMap, + std::map<uint64_t, edge_descriptor>::iterator&, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP); + bool detangleEdgesGeneral( + bool debug, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP); + bool detangleEdgeGeneral( + bool debug, + std::map<uint64_t, edge_descriptor>& edgeMap, + std::map<uint64_t, edge_descriptor>::iterator&, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP); + bool detangleEdgesWithSearch( + bool debug, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh); + bool detangleEdgeWithSearch( + bool debug, + std::map<uint64_t, edge_descriptor>& edgeMap, + std::map<uint64_t, edge_descriptor>::iterator&, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh); + + bool removeSelfComplementaryEdges(); + + // Special treatment to detangle back edges that were too long + // to be handled by detangleEdges. + bool detangleBackEdges( + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh); + bool detangleBackEdge( + std::map<uint64_t, edge_descriptor>& edgeMap, + std::map<uint64_t, edge_descriptor>::iterator&, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh); + + // Bubble cleanup, with the purpose of eliminating most bubbles caused by errors. + // See the code for details of what this does. 
+ uint64_t cleanupBubbles( + bool debug, + uint64_t maxOffset, + uint64_t chainTerminalCommonThreshold, + uint64_t threadCount); + uint64_t cleanupBubbles( + bool debug, + edge_descriptor ce, + uint64_t maxOffset, + uint64_t chainTerminalCommonThreshold); + + + + // Find short superbubbles in the AssemblyGraph. + class Superbubble : public vector<vertex_descriptor> { + public: + vector<vertex_descriptor> entrances; + vector<vertex_descriptor> exits; + + // Fill in the superbubble given a single entrance and exit. + void fillInFromEntranceAndExit(const AssemblyGraph&); + }; + class Superbubbles { + public: + + // This computes connected components using only edges with length up to maxOffset1. + Superbubbles( + AssemblyGraph&, + uint64_t maxOffset1 + ); + + // This uses dominator trees. + // It only finds superbubbles with one entrance and one exit. + Superbubbles(AssemblyGraph&); + + ~Superbubbles(); + + // Return the number of superbubbles. + uint64_t size() const + { + return superbubbles.size(); + } + + // Return the vertices in the specified superbubble. + Superbubble& getSuperbubble(uint64_t superBubbleId) + { + return superbubbles[superBubbleId]; + } + const Superbubble& getSuperbubble(uint64_t superBubbleId) const + { + return superbubbles[superBubbleId]; + } + + // Figure out if a vertex is in the specified superbubble. + bool isInSuperbubble(uint64_t superbubbleId, vertex_descriptor cv) const + { + return cGraph[cv].superbubbleId == superbubbleId; + } + + private: + + AssemblyGraph& cGraph; + + // The superbubbles are the connected components with size at least 2, + // computed using only the edges with offset up to maxOffset1. + vector<Superbubble> superbubbles; + }; + + + + + // Remove short superbubbles with one entry and one exit.
+ bool removeShortSuperbubbles( + bool debug, + uint64_t maxOffset1, // Used to define superbubbles + uint64_t maxOffset2 // Compared against the offset between entry and exit + ); + + // Detangle short superbubbles with any number of entrances and exits. + bool detangleShortSuperbubbles( + bool debug, + uint64_t maxOffset1, // Used to define superbubbles + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP); + bool detangleShortSuperbubble( + bool debug, + const Superbubbles&, + uint64_t superbubbleId, + uint64_t detangleToleranceLow, + uint64_t detangleToleranceHigh, + bool useBayesianModel, + double epsilon, + double minLogP); + + + // Cleanup/simplify superbubbles that are likely to be caused by errors, + // completely or in part. + void cleanupSuperbubbles( + bool debug, + uint64_t maxOffset1, // Used to define superbubbles + uint64_t maxOffset2, // Compared against the offset between entry and exit + uint64_t chainTerminalCommonThreshold); + void cleanupSuperbubble( + bool debug, + const Superbubbles&, + uint64_t superbubbleId, + uint64_t maxOffset2, // Compared against the offset between entry and exit + uint64_t chainTerminalCommonThreshold, + std::set<vertex_descriptor>& previousSuperbubblesVertices); + + // This version of superbubble cleanup uses dominator trees to define superbubbles, + // instead of computing connected components using edges of length up to maxOffset1. + void cleanupSuperbubbles( + bool debug, + uint64_t maxOffset2, // Compared against the offset between entry and exit + uint64_t chainTerminalCommonThreshold); + + // Split terminal haploid bubbles out of bubble chains, to facilitate detangling. + void splitTerminalHaploidBubbles(); + void splitTerminalHaploidBubbles(edge_descriptor); + + void removeSelfComplementarySquares(); + + // Phasing of bubble chains using the PhasingGraph.
+ void phaseBubbleChainsUsingPhasingGraph( + bool debug, + uint64_t n, // Maximum number of Chain MarkerGraphEdgeIds to use when computing tangle matrices. + uint64_t lowThreshold, + uint64_t highThreshold, + bool useBayesianModel, + double epsilon, + double minLogP, + uint64_t longBubbleThreshold); + void phaseBubbleChainUsingPhasingGraph( + edge_descriptor e, + uint64_t n, // Maximum number of Chain MarkerGraphEdgeIds to use when computing tangle matrices. + uint64_t lowThreshold, + uint64_t highThreshold, + bool useBayesianModel, + double epsilon, + double minLogP, + uint64_t longBubbleThreshold, + bool debug); + void phaseBubbleChainUsingPhasedComponents( + bool debug, + edge_descriptor e, + const vector< shared_ptr<PhasedComponent> >&, + uint64_t longBubbleThreshold); + + // In the phasing graph, each vertex corresponds to a diploid bubble + // in the BubbleChain being phased. + class TangleMatrix : public array< array<uint64_t, 2>, 2> { + public: + void analyze( + uint64_t lowThreshold, + uint64_t highThreshold, + int64_t& phase, + uint64_t& minConcordant, + uint64_t& maxDiscordant, + uint64_t& total, + double epsilon, + double& logPin, // log[P(in-phase)/P(random)] in decibels + double& logPout // log[P(out-of-phase)/P(random)] in decibels + ) const; + }; + + + // Compute the tangle matrix between two incoming chains + // and two outgoing chains, taking into account up to + // n MarkergraphEdgeIds for each Chain. + void computeTangleMatrix( + const array<const Chain*, 2> inChains, + const array<const Chain*, 2> outChains, + uint64_t n, + TangleMatrix&) const; + + // Gather OrientedReadIds from up to n MarkergraphEdgeIds + // near the beginning or end of a chain. 
+ void gatherOrientedReadIdsAtBeginning( + const Chain&, + uint64_t n, + vector<OrientedReadId>&) const; + void gatherOrientedReadIdsAtEnd( + const Chain&, + uint64_t n, + vector<OrientedReadId>&) const; + + + + class PhasingGraphVertex { + public: + uint64_t positionInBubbleChain; + int64_t phase = 0; // +1 or -1 for phased vertices, 0 otherwise + }; + + class PhasingGraphEdge { + public: + int64_t phase; // +1 (in phase) or -1 (out of phase) + + // Tangle matrix metrics. + // If phase = +1, minConcordant = min(m00, m11), maxDiscordant = max(m01, m10). + // If phase = -1, minConcordant = min(m01, m10), maxDiscordant = max(m00, m11). + uint64_t minConcordant; + uint64_t maxDiscordant; + double logPInPhase; + double logPOutOfPhase; + double logP() const + { + return max(max(logPInPhase, logPOutOfPhase), fabs(logPInPhase - logPOutOfPhase)); + } + +#if 0 + bool sortByCounts(const PhasingGraphEdge& that) const + { + if(maxDiscordant < that.maxDiscordant) { + return true; + } + if(maxDiscordant > that.maxDiscordant) { + return false; + } + return minConcordant > that.minConcordant; + } + bool sortByProbabilities(const PhasingGraphEdge& that) const + { + return logP() > that.logP(); + } +#endif + bool isSpanningTreeEdge = false; + }; + using PhasingGraphBaseClass = boost::adjacency_list< + boost::listS, + boost::listS, + boost::undirectedS, + PhasingGraphVertex, + PhasingGraphEdge>; + class PhasingGraph : public PhasingGraphBaseClass { + public: + void phase(bool debug); + void phase1(bool debug, bool useBayesianModel); + bool isConsistent(edge_descriptor) const; + void writeGraphviz(const string& fileName) const; + vector< shared_ptr<PhasedComponent> > phasedComponents; + + // Sort edges in order of decreasing significance: + // - If using the Bayesian model, logP. + // - Otherwise, minConcordant/maxDiscordant. + void sortEdges(vector<edge_descriptor>& sortedEdges, bool useBayesianModel) const; + }; + + + + // Phasing of bubble chains using the PhasingTable. 
+ void phaseBubbleChainsUsingPhasingTable( + const string& debugOutputFileNamePrefix, + double phaseErrorThreshold, + double bubbleErrorThreshold, + uint64_t longBubbleThreshold); + void phaseBubbleChainUsingPhasingTable( + const string& debugOutputFileNamePrefix, + edge_descriptor e, + double phaseErrorThreshold, + double bubbleErrorThreshold, + uint64_t longBubbleThreshold); + void cleanupBubbleChainUsingPhasingTable( + const string& debugOutputFileNamePrefix, + edge_descriptor e, + double phaseErrorThreshold, + double bubbleErrorThreshold, + uint64_t longBubbleThreshold); + + + + // Optimize chains before assembly, to remove assembly steps with + // fewer than minCommon reads. + void optimizeChains( + bool debug, + uint64_t minCommon, + uint64_t k + ); + void optimizeChain( + bool debug, + Chain&, + uint64_t minCommon, + uint64_t k + ); + + // Assemble sequence for a single Chain. + void assembleChain( + Chain&, + uint64_t chainTerminalCommonThreshold); + + // Multithreaded version of sequence assembly. + // This only assembles the chains that have the shouldBeAssembled flag set. + void assembleChainsMultithreaded( + uint64_t chainTerminalCommonThreshold, + uint64_t threadCount); + // This sets the shouldBeAssembled flag for all chains, then + // calls assembleChainsMultithreaded. + void assembleAllChainsMultithreaded( + uint64_t chainTerminalCommonThreshold, + uint64_t threadCount); + // This clears the shouldBeAssembled flag from all Chains. + void clearAllShouldBeAssembledFlags(); + + void assembleChainsMultithreadedTheadFunction(uint64_t threadId); + void combineStepSequences(Chain&); + class AssemblyStep { + public: + edge_descriptor e; // This identifies the BubbleChain. + uint64_t positionInBubbleChain; // This identifies the Bubble. + uint64_t indexInBubble; // This identifies the Chain. + uint64_t positionInChain; + uint64_t offsetInBases; + + // For better load balancing, order them by decreasing offsetInBases.
+ bool operator<(const AssemblyStep& that) const + { + return offsetInBases > that.offsetInBases; + } + }; + void runAssemblyStep( + uint64_t chainTerminalCommonThreshold, + const AssemblyStep&); + void runAssemblyStep( + Chain& chain, + uint64_t positionInChain, + uint64_t chainTerminalCommonThreshold); + class AssembleChainsMultithreadedData { + public: + uint64_t chainTerminalCommonThreshold; + vector<AssemblyStep> assemblySteps; + }; + AssembleChainsMultithreadedData assembleChainsMultithreadedData; + + + + // Get the lengths of Chains assembled sequence for each Chain P-value. + // On return, chainLengths[pValue] contains the lengths of all + // Chains with that pValue, sorted in decreasing order. + // This can be used for N50 statistics. +public: + void getChainLengthsByPValue(vector< vector<uint64_t> >& chainLengths) const; + + // Get the lengths of all non-trivial bubble chains. + void getBubbleChainLengths(vector<uint64_t>&) const; + + // Given a vector of lengths in decreasing order, compute the total length and N50. + static pair<uint64_t, uint64_t> n50(const vector<uint64_t>&); +private: + + // Output. 
+ void write(const string& name, bool writeSequence = false) const; + void writeCsv(const string& fileNamePrefix) const; +public: + void writeCsvSummary(ostream&) const; +private: + void writeBubbleChainsCsv(const string& fileNamePrefix) const; + void writeBubbleChainsPhasingTables(const string& fileNamePrefix, double phaseErrorThreshold) const; + void writeBubblesCsv(const string& fileNamePrefix) const; + void writeChainsCsv(const string& fileNamePrefix) const; + void writeChainsDetailsCsv(const string& fileNamePrefix) const; + void writeChainDetailsCsv(ostream&, edge_descriptor, bool writeHeader) const; + void writeGraphviz(const string& fileNamePrefix, bool labels) const; + void writeGfa(const string& fileNamePrefix) const; + void writeGfaExpanded( + const string& fileNamePrefix, + bool includeSequence, + bool useSequenceLength) const; + void writeGfaExpanded( + ostream&, + bool includeSequence, + bool useSequenceLength) const; + void writeAssemblyDetails() const; +public: + void writeGfaSegmentsExpanded( + ostream&, + bool includeSequence, + bool useSequenceLength) const; + void writeGfaLinksExpanded(ostream&) const; + static void writeGfaHeader(ostream&); + void writeFastaExpanded(ostream&) const; +private: + void writeFastaExpanded(const string& fileNamePrefix) const; + void writeSnapshot(uint64_t& snapshotNumber) const; + + string bubbleChainStringId(edge_descriptor) const; + string bubbleStringId(edge_descriptor, uint64_t positionInBubbleChain) const; + string chainStringId(edge_descriptor, uint64_t positionInBubbleChain, uint64_t indexInBubble) const; + + + // Return average coverage for the internal MarkerGraphEdgeIds of a Chain. + // For chain of length 2, this returns 0. + double primaryCoverage(const Chain&) const; + + // This returns a "P-value" for a Chain defined as follows: + // If the Chain is the only chain of a BubbleChain, the P-value is 0. + // Otherwise, the P-value is the ploidy of the Bubble that the Chain belongs to. 
+ uint64_t chainPValue(edge_descriptor, uint64_t positionInBubbleChain, uint64_t indexInBubble) const; + + uint64_t chainOffset(const Chain&) const; + void bubbleOffset( + const Bubble&, + uint64_t& averageOffset, + uint64_t& minOffset, + uint64_t& maxOffset + ) const; + bool bubbleOffsetNoException( + const Bubble&, + uint64_t& averageOffset, + uint64_t& minOffset, + uint64_t& maxOffset + ) const; + void bubbleChainOffset( + const BubbleChain&, + uint64_t& averageOffset, + uint64_t& minOffset, + uint64_t& maxOffset + ) const; +}; + diff --git a/src/mode3-AssemblyPath.cpp b/src/mode3-AssemblyPath.cpp deleted file mode 100644 index 4ac9dc2..0000000 --- a/src/mode3-AssemblyPath.cpp +++ /dev/null @@ -1,1269 +0,0 @@ -// Shasta. -#include "mode3-AssemblyPath.hpp" -#include "mode3-SegmentPairInformation.hpp" -#include "assembleMarkerGraphPath.hpp" -#include "ConsensusCaller.hpp" -#include "deduplicate.hpp" -#include "html.hpp" -#include "Marker.hpp" -#include "MarkerGraph.hpp" -#include "Reads.hpp" -#include "mode3.hpp" -#include "timestamp.hpp" -using namespace shasta; -using namespace mode3; - -// Spoa. -#include "spoa/spoa.hpp" - -// Seqan. -#include <seqan/align.h> - -// Standard library. -#include "fstream.hpp" - - - -// Assemble sequence for an AssemblyPath. -void AssemblyPath::assemble(const AssemblyGraph& assemblyGraph) -{ - const bool debug = false; - if(debug) { - cout << timestamp << "AssemblyPath::assemble begins." << endl; - } - - // Assemble each segment on the path. - assembleSegments(assemblyGraph); - - // Assemble links in this assembly path. - initializeLinks(assemblyGraph); - assembleLinks(assemblyGraph); - - if(debug) { - writeSegmentSequences(); - writeLinkSequences(assemblyGraph); - } - - assemble(); - - if(debug) { - cout << timestamp << "AssemblyPath::assemble ends." << endl; - } -} - -// Initialize the links. -// This only resizes the links vector and fills in the id and isTrivial -// fields of each link. 
-void AssemblyPath::initializeLinks(const AssemblyGraph& assemblyGraph) -{ - SHASTA_ASSERT(segments.size() > 1); - links.resize(segments.size()-1); - - // Fill in the id and isTrivial fields of each link. - for(uint64_t position0=0; position0<links.size(); position0++) { - const uint64_t position1 = position0 + 1; - - // Access the source and target segments of this link. - // We will process the link between segmentId0 and segmentId1. - AssemblyPathSegment& segment0 = segments[position0]; - AssemblyPathSegment& segment1 = segments[position1]; - - // Fill in the id and isTrivial fields. - AssemblyPathLink& assemblyPathLink = links[position0]; - assemblyPathLink.id = assemblyGraph.findLink(segment0.id, segment1.id); - const AssemblyGraph::Link& link = assemblyGraph.links[assemblyPathLink.id]; - assemblyPathLink.isTrivial = link.segmentsAreAdjacent; - - SHASTA_ASSERT(segment0.id == link.segmentId0); - SHASTA_ASSERT(segment1.id == link.segmentId1); - } - - - // Fill in the previousPrimarySegmentId field of each link. - SHASTA_ASSERT(segments.front().isPrimary); - uint64_t lastPrimarySegmentSeen = invalid<uint64_t>; - for(uint64_t position=0; position<links.size(); position++) { - const AssemblyPathSegment& segment = segments[position]; - if(segment.isPrimary) { - lastPrimarySegmentSeen = segment.id; - } - links[position].previousPrimarySegmentId = lastPrimarySegmentSeen; - } - - - - // Fill in the nextPrimarySegmentId field of each link. - SHASTA_ASSERT(segments.back().isPrimary); - lastPrimarySegmentSeen = invalid<uint64_t>; - for(uint64_t position = links.size() - 1; /* Check later */; position--) { - const AssemblyPathSegment& segment = segments[position + 1]; - if(segment.isPrimary) { - lastPrimarySegmentSeen = segment.id; - } - links[position].nextPrimarySegmentId = lastPrimarySegmentSeen; - - if(position == 0) { - break; - } - } -} - - - -// Assemble links in this assembly path. 
-void AssemblyPath::assembleLinks(const AssemblyGraph& assemblyGraph) -{ - const bool debug = false; - - SHASTA_ASSERT((assemblyGraph.k % 2) == 0); - - // Don't skip any bases at the beginning of the first - // segment and at the end of the last segment. - segments.front().leftTrim = 0; - segments.back().rightTrim = 0; - - ofstream html; - if(debug) { - html.open("Msa.html"); - } - - // Loop over links in the path. - links.resize(segments.size()-1); - for(uint64_t position0=0; position0<links.size(); position0++) { - assembleLinkAtPosition(assemblyGraph, position0, html); - } -} - - - -void AssemblyPath::assembleLinkAtPosition( - const AssemblyGraph& assemblyGraph, - uint64_t position0, - ostream& html) -{ - const bool debug = false; - - AssemblyPathLink& link = links[position0]; - const uint64_t position1 = position0 + 1; - - // Access the source and target segments of this link. - // We will process the link between segmentId0 and segmentId1. - AssemblyPathSegment& segment0 = segments[position0]; - AssemblyPathSegment& segment1 = segments[position1]; - - if(debug) { - cout << "Assembling link " << link.id << " " << segment0.id << "->" << segment1.id << - " at position " << position0 << " in the assembly path." << endl; - } - - if(link.isTrivial) { - - // The two segments are consecutive in the marker graph. - // This is a trivial link because the two segments share a terminal - // marker graph vertex. - // Just trim from the assembly the last k/2 RLE bases of segmentId0 - // and the first k/2 RLE bases of segmentId1. 
- assembleTrivialLink(segment0, segment1, link, assemblyGraph.k); - - } else { - - assembleNonTrivialLink( - assemblyGraph, - segment0, - segment1, - link, - html); - } -} - - - -void AssemblyPath::assembleNonTrivialLink( - const AssemblyGraph& assemblyGraph, - AssemblyPathSegment& segment0, - AssemblyPathSegment& segment1, - AssemblyPathLink& link, - ostream& html) -{ - const bool debug = false; - - - // First, find: - // - The position in segmentId0 of the leftmost transition. - // - The position in segmentId1 of the rightmost transition. - uint64_t minEdgePosition0 = assemblyGraph.markerGraphPaths[segment0.id].size(); - uint64_t maxEdgePosition1 = 0; - for(const auto& p: assemblyGraph.transitions[link.id]) { - const OrientedReadId orientedReadId = p.first; - - // If not in previousPrimarySegmentId or nextPrimarySegmentId, skip it. - if(not( - assemblyGraph.segmentContainsOrientedRead(link.previousPrimarySegmentId, orientedReadId) - or - assemblyGraph.segmentContainsOrientedRead(link.nextPrimarySegmentId, orientedReadId) - )) { - continue; - } - - // Access the transition from segmentId0 to segmentId1 for this oriented read. - const Transition& transition = p.second; - - minEdgePosition0 = min(minEdgePosition0, uint64_t(transition[0].position)); - maxEdgePosition1 = max(maxEdgePosition1, uint64_t(transition[1].position)); - } - - // When getting here: - // - minEdgePosition0 is the leftmost position of the transitions in path0. - // - maxEdgePosition1 is the rightmost position of the transitions in path1. - // These positions are edge positions in markerGraphPath0 and markerGraphPath1. - // We will do a multiple sequence alignment of the oriented reads, - // using the sequence of segmentId0 to extend to the left all reads to minEdgePosition0, - // and using the sequence of segmentId1 to extend to the right all reads to maxEdgePosition1, - - // Get the corresponding vertex positions in segmentId0 and segmentId1. 
- const uint64_t minVertexPosition0 = minEdgePosition0 + 1; - const uint64_t maxVertexPosition1 = maxEdgePosition1; - - // To compute an MSA anchored at both sides,we will extend the - // sequence of each read to the left/right using the sequence of - // adjacent segments. - const AssembledSegment& assembledSegment0 = segment0.assembledSegment; - SHASTA_ASSERT(not assembledSegment0.runLengthSequence.empty()); - const AssembledSegment& assembledSegment1 = segment1.assembledSegment; - SHASTA_ASSERT(not assembledSegment1.runLengthSequence.empty()); - - - // Now extract the portion of each oriented read sequence that - // will be used to assemble this link. - vector<OrientedReadId> orientedReadIdsForAssembly; - vector< vector<Base> > orientedReadsSequencesForAssembly; - vector< vector<uint32_t> > orientedReadsRepeatCountsForAssembly; - for(const auto& p: assemblyGraph.transitions[link.id]) { - const OrientedReadId orientedReadId = p.first; - - // If not in previousPrimarySegmentId or nextPrimarySegmentId, skip it. - if(not( - assemblyGraph.segmentContainsOrientedRead(link.previousPrimarySegmentId, orientedReadId) - or - assemblyGraph.segmentContainsOrientedRead(link.nextPrimarySegmentId, orientedReadId) - )) { - continue; - } - - // Access the transition from segmentId0 to segmentId1 for this oriented read. - const Transition& transition = p.second; - - // Get the ordinals of the last appearance of this oriented - // read on segmentId0 and the first on segmentId1, - // and the corresponding markers. - const uint32_t ordinal0 = transition[0].ordinals[1]; - const uint32_t ordinal1 = transition[1].ordinals[0]; - const CompressedMarker& marker0 = assemblyGraph.markers[orientedReadId.getValue()][ordinal0]; - const CompressedMarker& marker1 = assemblyGraph.markers[orientedReadId.getValue()][ordinal1]; - - // Get the positions of these markers on the oriented read. - // If using RLE, these are RLE positions. 
- const uint32_t position0 = marker0.position; - const uint32_t position1 = marker1.position; - - // Extract the sequence between these markers (including the markers). - vector<Base> orientedReadSequence; - vector<uint8_t> orientedReadRepeatCounts; - if(assemblyGraph.readRepresentation == 1) { - // RLE. - for(uint64_t position=position0; position<position1+assemblyGraph.k; position++) { - Base b; - uint8_t r; - tie(b, r) = assemblyGraph.reads.getOrientedReadBaseAndRepeatCount(orientedReadId, uint32_t(position)); - orientedReadSequence.push_back(b); - orientedReadRepeatCounts.push_back(r); - } - } else { - // Raw sequence. - for(uint64_t position=position0; position<position1+assemblyGraph.k; position++) { - const Base b = assemblyGraph.reads.getOrientedReadBase(orientedReadId, uint32_t(position)); - orientedReadSequence.push_back(b); - orientedReadRepeatCounts.push_back(uint8_t(1)); - } - } - - // We need to extend the sequence of this read to the left, - // using segmentId0 sequence, up to minVertexPosition0, - // so the portions of all reads we will be using for the MSA - // all begin in the same place. - vector<Base> leftSequence; - vector<uint32_t> leftRepeatCounts; - const uint64_t vertexPosition0 = transition[0].position + 1; // Add 1 to get vertex position. 
- const uint64_t begin0 = assembledSegment0.vertexOffsets[minVertexPosition0]; - const uint64_t end0 = assembledSegment0.vertexOffsets[vertexPosition0]; - for(uint64_t position=begin0; position!=end0; position++) { - leftSequence.push_back(assembledSegment0.runLengthSequence[position]); - leftRepeatCounts.push_back(assembledSegment0.repeatCounts[position]); - } - - vector<Base> rightSequence; - vector<uint32_t> rightRepeatCounts; - const uint64_t vertexPosition1 = transition[1].position; - const uint64_t begin1 = assembledSegment1.vertexOffsets[vertexPosition1] + assemblyGraph.k; - const uint64_t end1 = assembledSegment1.vertexOffsets[maxVertexPosition1] + assemblyGraph.k; - for(uint64_t position=begin1; position!=end1; position++) { - rightSequence.push_back(assembledSegment1.runLengthSequence[position]); - rightRepeatCounts.push_back(assembledSegment1.repeatCounts[position]); - } - - // Construct the extended sequence for this oriented read, - // to be used in the MSA. - vector<Base> orientedReadExtendedSequence; - vector<uint32_t> orientedReadExtendedRepeatCounts; - const auto addToExtendedSequence = back_inserter(orientedReadExtendedSequence); - copy(leftSequence, addToExtendedSequence); - copy(orientedReadSequence, addToExtendedSequence); - copy(rightSequence, addToExtendedSequence); - const auto addToRepeatCounts = back_inserter(orientedReadExtendedRepeatCounts); - copy(leftRepeatCounts, addToRepeatCounts); - copy(orientedReadRepeatCounts, addToRepeatCounts); - copy(rightRepeatCounts, addToRepeatCounts); - - orientedReadIdsForAssembly.push_back(orientedReadId); - orientedReadsSequencesForAssembly.push_back(orientedReadExtendedSequence); - orientedReadsRepeatCountsForAssembly.push_back(orientedReadExtendedRepeatCounts); - - if(debug) { - copy(orientedReadExtendedSequence, ostream_iterator<Base>(cout)); - cout << " " << orientedReadId << endl; - } - } - - // Store coverage for this link. 
- link.coverage = orientedReadIdsForAssembly.size(); - - // Compute the consensus sequence for the link. - if(html) { - html << "<h2>Link " << link.id << "</h2>\n"; - } - computeLinkConsensusUsingSpoa( - orientedReadIdsForAssembly, - orientedReadsSequencesForAssembly, - orientedReadsRepeatCountsForAssembly, - assemblyGraph.readRepresentation, - assemblyGraph.consensusCaller, - debug, - html, - link.msaRleSequence, - link.msaRepeatCounts - ); - SHASTA_ASSERT(link.msaRleSequence.size() == link.msaRepeatCounts.size()); - - if(debug) { - cout << "Consensus RLE sequence length before trimming " << link.msaRleSequence.size() << endl; - cout << "Portion of segment on left involved in the MSA begins at position " << - assembledSegment0.vertexOffsets[minVertexPosition0] << endl; - cout << "Portion of segment on right involved in the MSA ends at position " << - assembledSegment1.vertexOffsets[maxVertexPosition1] + assemblyGraph.k << endl; - } - - // Count the number of identical (RLE) bases at the beginning of the - // link consensus sequence and of the segmentId0 sequence portion - // involved in assembling this link. - uint64_t identicalOnLeft = 0; - const uint64_t begin0 = assembledSegment0.vertexOffsets[minVertexPosition0]; - const uint64_t end0 = assembledSegment0.runLengthSequence.size(); - for(uint64_t i=begin0; (i!=end0 and (i-begin0)<link.msaRleSequence.size()); i++) { - if(link.msaRleSequence[i-begin0] == assembledSegment0.runLengthSequence[i]) { - // cout << "*** " << begin0 << " " << end0 << " " << i << endl; - ++identicalOnLeft; - } else { - break; - } - } - if(debug) { - cout << "Identical on left: " << identicalOnLeft << endl; - } - - // Count the number of identical (RLE) bases at the end of the - // link consensus sequence and the beginning of segmentId1 . 
- uint64_t identicalOnRight = 0; - const uint64_t end1 = assembledSegment1.vertexOffsets[maxVertexPosition1] + assemblyGraph.k; - for(uint64_t i=end1-1; ; i--) { - const uint64_t j = link.msaRleSequence.size() - (end1 - i); - if(link.msaRleSequence[j] == assembledSegment1.runLengthSequence[i]) { - // cout << "*** " << i << " " << assembledSegment1.runLengthSequence[i] << " " << - // j << " " << consensusRleSequence[j] << endl; - ++identicalOnRight; - } else { - break; - } - if(i == 0) { - break; - } - if(j == 0) { - break; - } - } - identicalOnRight = min(identicalOnRight, link.msaRleSequence.size()-identicalOnLeft); - if(debug) { - cout << "Identical on right: " << identicalOnRight << endl; - } - - // Trim these identical bases from the link consensus sequence. - link.leftTrim = identicalOnLeft; - link.rightTrim = identicalOnRight; - - // Compute and store the number of bases to be trimmed at the end of segmentId0 - // and at the beginning of segmentId1. - segment0.rightTrim = - assembledSegment0.runLengthSequence.size() - - assembledSegment0.vertexOffsets[minVertexPosition0] - - identicalOnLeft; - segment1.leftTrim = - assembledSegment1.vertexOffsets[maxVertexPosition1] + assemblyGraph.k - - identicalOnRight; -} - - - -void AssemblyPath::assembleTrivialLink( - AssemblyPathSegment& segment0, - AssemblyPathSegment& segment1, - AssemblyPathLink& link, - uint64_t k) -{ - SHASTA_ASSERT(link.isTrivial); - SHASTA_ASSERT(link.msaRleSequence.empty()); - SHASTA_ASSERT(link.msaRepeatCounts.empty()); - SHASTA_ASSERT(link.leftTrim == 0); - SHASTA_ASSERT(link.rightTrim == 0); - - // Just trim k/2 bases from the adjacent segments, - // because they are adjacent in the marker graph. - segment0.rightTrim = k/2; - segment1.leftTrim = k/2; -} - - - -void AssemblyPath::clear() -{ - segments.clear(); - links.clear(); -} - - - -// Assemble each segment on the path. 
-void AssemblyPath::assembleSegments(const AssemblyGraph& assemblyGraph) -{ - for(uint64_t i=0; i<segments.size(); i++) { - AssemblyPathSegment& segment = segments[i]; - assembleMarkerGraphPath( - assemblyGraph.readRepresentation, - assemblyGraph.k, - assemblyGraph.markers, - assemblyGraph.markerGraph, - assemblyGraph.markerGraphPaths[segment.id], - false, - segment.assembledSegment); - } -} - - - -void AssemblyPath::writeSegmentSequences() -{ - ofstream fasta("PathSegmentsSequence.fasta"); - ofstream txt("PathSegmentsRleSequence.txt"); - - for(uint64_t i=0; i<segments.size(); i++) { - const AssemblyPathSegment& segment = segments[i]; - const uint64_t segmentId = segment.id; - const AssembledSegment& assembledSegment = segment.assembledSegment; - - if(segment.leftTrim + segment.rightTrim > assembledSegment.runLengthSequence.size()) { - continue; - } - - // Write the trimmed RLE sequence to txt. - const auto trimmedRleSequence = segment.trimmedRleSequence(); - const auto trimmedRepeatCounts = segment.trimmedRepeatCounts(); - txt << "S" << i << " " << segmentId << "\n"; - copy( trimmedRleSequence, ostream_iterator<Base>(txt)); - txt << "\n"; - for(const uint32_t r: trimmedRepeatCounts) { - txt << repeatCountCharacter(r); - } - txt << "\n"; - - // Write the trimmed raw sequence to fasta. 
- vector<Base> trimmedRawSequence; - segment.getTrimmedRawSequence(trimmedRawSequence); - fasta << - ">S" << i << - " segment " << segmentId << - ", length " << trimmedRawSequence.size() << "\n"; - copy(trimmedRawSequence, ostream_iterator<Base>(fasta)); - fasta << "\n"; - } -} - - - -void AssemblyPath::writeLinkSequences(const AssemblyGraph& assemblyGraph) -{ - ofstream fasta("PathLinksSequence.fasta"); - ofstream txt("PathLinksRleSequence.txt"); - - for(uint64_t i=0; i<segments.size()-1; i++) { - const uint64_t segmentId0 = segments[i].id; - const uint64_t segmentId1 = segments[i+1].id; - const uint64_t linkId = assemblyGraph.findLink(segmentId0, segmentId1); - const span<const Base> rleSequence = links[i].trimmedRleSequence(); - const span<const uint32_t> repeatCounts = links[i].trimmedRepeatCounts(); - SHASTA_ASSERT(rleSequence.size() == repeatCounts.size()); - if(rleSequence.empty()) { - continue; - } - - fasta << - ">L" << i << - " link " << linkId << " " << segmentId0 << "->"<< segmentId1 << "\n"; - for(uint64_t j=0; j<rleSequence.size(); j++) { - const Base b = rleSequence[j]; - const uint64_t repeatCount = repeatCounts[j]; - for(uint64_t k=0; k<repeatCount; k++) { - fasta << b; - } - } - fasta << "\n"; - - txt << "L" << i << - " link " << linkId << " " << segmentId0 << "->"<< segmentId1 << "\n"; - copy(rleSequence, ostream_iterator<Base>(txt)); - txt << "\n"; - for(const uint32_t r: repeatCounts) { - txt << repeatCountCharacter(r); - } - txt << "\n"; - } -} - - - -// Compute consensus sequence for Link, given sequences of -// the oriented reads, which must all be anchored on both sides.// Lower level version. 
-void AssemblyPath::computeLinkConsensusUsingSpoa( - const vector<OrientedReadId> orientedReadIds, - const vector< vector<Base> > rleSequences, - const vector< vector<uint32_t> > repeatCounts, - uint64_t readRepresentation, - const ConsensusCaller& consensusCaller, - bool debug, - ostream& html, - vector<Base>& consensusRleSequence, - vector<uint32_t>& consensusRepeatCounts - ) -{ - SHASTA_ASSERT(rleSequences.size() == orientedReadIds.size()); - SHASTA_ASSERT(repeatCounts.size() == orientedReadIds.size()); - - // Create the spoa alignment engine and elignment graph. - const spoa::AlignmentType alignmentType = spoa::AlignmentType::kNW; - const int8_t match = 1; - const int8_t mismatch = -1; - const int8_t gap = -1; - auto spoaAlignmentEngine = spoa::AlignmentEngine::Create(alignmentType, match, mismatch, gap); - spoa::Graph spoaAlignmentGraph; - - // Add the oriented read sequences to the alignment. - string sequenceString; - for(const vector<Base>& sequence: rleSequences) { - - // Add it to the alignment. - sequenceString.clear(); - for(const Base base: sequence) { - sequenceString += base.character(); - } - auto alignment = spoaAlignmentEngine->Align(sequenceString, spoaAlignmentGraph); - spoaAlignmentGraph.AddAlignment(alignment, sequenceString); - } - - // Compute the multiple sequence alignment. - const vector<string> msa = spoaAlignmentGraph.GenerateMultipleSequenceAlignment(); - const string consensus = spoaAlignmentGraph.GenerateConsensus(); - const uint64_t msaLength = msa.front().size(); - if(debug) { - cout << "Multiple sequence alignment has length " << msaLength << ":" << endl; - for(const string& s: msa) { - cout << s << endl; - } - } - - - // Compute coverage for each base at each position of the MSA. - // Use position 4 for gaps. 
- vector<Coverage> coverage(msaLength); - for(uint64_t i=0; i<orientedReadIds.size(); i++) { - const OrientedReadId orientedReadId = orientedReadIds[i]; - const vector<Base>& rleSequence = rleSequences[i]; - const vector<uint32_t>& repeatCount = repeatCounts[i]; - const string& msaString = msa[i]; - - // Here: - // rPosition = position in rle sequence of oriented read. - // aPosition = position in alignment - uint64_t rPosition = 0; - for(uint64_t aPosition=0; aPosition<msaLength; aPosition++) { - const AlignedBase alignedBase = AlignedBase::fromCharacter(msaString[aPosition]); - if(alignedBase.isGap()) { - coverage[aPosition].addRead(alignedBase, orientedReadId.getStrand(), 0); - } else { - SHASTA_ASSERT(AlignedBase(rleSequence[rPosition]) == alignedBase); - if(readRepresentation == 1) { - coverage[aPosition].addRead( - alignedBase, - orientedReadId.getStrand(), - repeatCount[rPosition]); - } else { - coverage[aPosition].addRead( - alignedBase, - orientedReadId.getStrand(), - 1); - } - ++rPosition; - } - } - SHASTA_ASSERT(rPosition == rleSequence.size()); - } - - - - // Compute consensus base and repeat count at every position in the alignment. - vector<AlignedBase> msaConsensusSequence(msaLength); - vector<uint32_t> msaConsensusRepeatCount(msaLength); - vector<uint64_t> msaConsensusDiscordantCount(msaLength); - for(uint64_t aPosition=0; aPosition<msaLength; aPosition++) { - const Coverage& c = coverage[aPosition]; - const Consensus consensus = consensusCaller(c); - msaConsensusSequence[aPosition] = consensus.base; - msaConsensusRepeatCount[aPosition] = uint32_t(consensus.repeatCount); - - // Compute discordant count at this position of the alignment. - msaConsensusDiscordantCount[aPosition] = 0; - for(uint64_t b=0; b<5; b++) { - if(b != consensus.base.value) { - msaConsensusDiscordantCount[aPosition] += c.coverage(AlignedBase::fromInteger(b)); - } - } - } - - - - // Fill in the output arguments. 
- // These are the same as msaConsensusSequence and msaConsensusRepeatCount, - // but with the gap bases removed. - consensusRleSequence.clear(); - consensusRepeatCounts.clear(); - for(uint64_t aPosition=0; aPosition<msaLength; aPosition++) { - const AlignedBase alignedBase = msaConsensusSequence[aPosition]; - if(not alignedBase.isGap()) { - consensusRleSequence.push_back(Base(alignedBase)); - consensusRepeatCounts.push_back(msaConsensusRepeatCount[aPosition]); - } - } - - - - // Html output of the alignment. - if(html) { - html << "Coverage " << rleSequences.size() << "<br>\n"; - html << "Alignment length " << msaLength << "<br>\n"; - html << "<div style='font-family:monospace;white-space:nowrap;'>\n"; - for(uint64_t i=0; i<orientedReadIds.size(); i++) { - const OrientedReadId orientedReadId = orientedReadIds[i]; - const string& msaString = msa[i]; - - for(const char c: msaString) { - const AlignedBase alignedBase = AlignedBase::fromCharacter(c); - if(alignedBase.isGap()) { - html << alignedBase; - } else { - html << "<span style='background-color:" << alignedBase.htmlColor() << - "'>" << alignedBase << "</span>"; - } - } - - // If using the RLE representation, also write the - // repeat count at each position. - if(readRepresentation == 1) { - const vector<Base>& rleSequence = rleSequences[i]; - const vector<uint32_t>& repeatCount = repeatCounts[i]; - - // Here: - // rPosition = position in RLE sequence of oriented read. 
- // aPosition = position in alignment - uint64_t rPosition = 0; - html << " "; - for(uint64_t aPosition=0; aPosition<msaLength; aPosition++) { - const AlignedBase alignedBase = AlignedBase::fromCharacter(msaString[aPosition]); - if(alignedBase.isGap()) { - html << alignedBase; - } else { - SHASTA_ASSERT(AlignedBase(rleSequence[rPosition]) == alignedBase); - const uint64_t r = repeatCount[rPosition]; - html << "<span style='background-color:" << alignedBase.htmlColor() << - "'>"; - if(r < 10) { - html << r; - } else { - html << "*"; - } - html << "</span>"; - ++rPosition; - } - } - SHASTA_ASSERT(rPosition == rleSequence.size()); - } - - html << " " << orientedReadId << "<br>\n"; - } - - - - // Also write the consensus. - html << "<br>\n"; - for(uint64_t aPosition=0; aPosition<msaLength; aPosition++) { - const AlignedBase alignedBase = msaConsensusSequence[aPosition]; - if(alignedBase.isGap()) { - html << alignedBase; - } else { - html << "<span style='background-color:" << alignedBase.htmlColor() << - "'>" << alignedBase << "</span>"; - } - } - if(readRepresentation == 1) { - html << " "; - for(uint64_t aPosition=0; aPosition<msaLength; aPosition++) { - const AlignedBase alignedBase = msaConsensusSequence[aPosition]; - if(alignedBase.isGap()) { - html << alignedBase; - } else { - const uint64_t r = msaConsensusRepeatCount[aPosition]; - html << "<span style='background-color:" << alignedBase.htmlColor() << - "'>"; - if(r < 10) { - html << r; - } else { - html << "*"; - } - html << "</span>"; - } - } - } - html << " Consensus<br>\n"; - - // Write the discordant count. - for(uint64_t aPosition=0; aPosition<msaLength; aPosition++) { - const uint64_t d = msaConsensusDiscordantCount[aPosition]; - const double errorRate = double(d) / double(orientedReadIds.size()); - int hue; - if(errorRate < .01) { - hue = 120; // Q>=20, green. - } else { - const double Q = -10. * log10(errorRate); - hue = int(std::round(6. * Q)); // 60 at Q=10 (yellow), 120 at Q=20 (green). 
- } - const string color = "hsl(" + to_string(hue) + ",100%, 70%)"; - html << "<span style='background-color:" << color << "'>"; - html << repeatCountCharacter(uint32_t(d)); - html << "</span>"; - } - html << " "; - for(uint64_t aPosition=0; aPosition<msaLength; aPosition++) { - html << " "; - } - html << " Discordant<br>\n"; - - html << "</div>\n"; - - - - html << "<h3>Consensus</h3>"; - html << "<div style='font-family:monospace;white-space:nowrap;'>\n"; - for(const Base b: consensusRleSequence) { - html << b; - } - html << "<br>\n"; - for(const uint64_t r: consensusRepeatCounts) { - if(r < 10) { - html << r; - } else { - html << "*"; - } - } - html << "<br>\n"; - html << "<br>\n"; - for(uint64_t i=0; i<consensusRleSequence.size(); i++) { - const Base b = consensusRleSequence[i]; - const uint64_t r = consensusRepeatCounts[i]; - for(uint64_t j=0; j<r; j++) { - html << b; - } - } - html << "<br>\n"; - html << "</div>\n"; - } -} - - - -// Final assembly of segments and links sequence into the path sequence. -void AssemblyPath::assemble() -{ - - rleSequence.clear(); - repeatCounts.clear(); - rawSequence.clear(); - - // Assemble RLE sequence. - for(uint64_t i=0; i<segments.size(); i++) { - AssemblyPathSegment& segment = segments[i]; - const AssembledSegment& assembledSegment = segment.assembledSegment; - segment.rlePosition = rleSequence.size(); - segment.rawPosition = rawSequence.size(); - - - if(segment.leftTrim + segment.rightTrim > assembledSegment.runLengthSequence.size()) { - // The left and right trim of this segment overlap. - // To handle this case, just take the excess number of bases out of the sequence - // we already assembled. - // This is not a great solution, but better than nothing. 
- const uint64_t excessTrim = - (segment.leftTrim + segment.rightTrim) - assembledSegment.runLengthSequence.size(); - SHASTA_ASSERT(excessTrim <= rleSequence.size()); - SHASTA_ASSERT(repeatCounts.size() == rleSequence.size()); - - // Compute the excess trim in the raw sequence. - uint64_t excessTrimRaw = 0; - for(uint64_t i=0; i<excessTrim; i++) { - excessTrimRaw += repeatCounts[repeatCounts.size() - 1 - i]; - } - SHASTA_ASSERT(excessTrimRaw <= rawSequence.size()); - - // Remove the excess trim from the sequence we already assembled. - rleSequence.resize(rleSequence.size() - excessTrim); - repeatCounts.resize(repeatCounts.size() - excessTrim); - rawSequence.resize(rawSequence.size() - excessTrimRaw); - } else { - - // This is the normal case. - - // Add the RLE sequence of this segment. - const auto segmentTrimmedRleSequence = segment.trimmedRleSequence(); - const auto segmentTrimmedRepeatCounts = segment.trimmedRepeatCounts(); - copy(segmentTrimmedRleSequence, back_inserter(rleSequence)); - copy(segmentTrimmedRepeatCounts, back_inserter(repeatCounts)); - - // Add the raw sequence of this segment. - for(uint64_t i=0; i<segmentTrimmedRleSequence.size(); i++) { - const Base b = segmentTrimmedRleSequence[i]; - const uint64_t r = segmentTrimmedRepeatCounts[i]; - for(uint64_t k=0; k<r; k++) { - rawSequence.push_back(b); - } - } - } - - - - // Add the sequence of the link following this segment. - if(i != segments.size() - 1) { - AssemblyPathLink& link = links[i]; - link.rlePosition = rleSequence.size(); - link.rawPosition = rawSequence.size(); - - // Add the RLE sequence of this link. - const auto trimmedRleSequence = link.trimmedRleSequence(); - const auto trimmedRepeatCounts = link.trimmedRepeatCounts(); - copy(trimmedRleSequence, back_inserter(rleSequence)); - copy(trimmedRepeatCounts, back_inserter(repeatCounts)); - - // Add the raw sequence of this link. 
- for(uint64_t i=0; i<trimmedRleSequence.size(); i++) { - const Base b = trimmedRleSequence[i]; - const uint64_t r = trimmedRepeatCounts[i]; - for(uint64_t k=0; k<r; k++) { - rawSequence.push_back(b); - } - } - } - } - SHASTA_ASSERT(rleSequence.size() == repeatCounts.size()); - -#if 0 - // For now, write it out. - ofstream fasta("PathSequence.fasta"); - fasta << ">Path" << endl; - copy(rawSequence, ostream_iterator<Base>(fasta)); - fasta << "\n"; -#endif -} - - - - -AssemblyPathSegment::AssemblyPathSegment( - uint64_t id, - bool isPrimary) : - id(id), - isPrimary(isPrimary) - {} - - - -span<const Base> AssemblyPathSegment::trimmedRleSequence() const -{ - const auto begin = assembledSegment.runLengthSequence.begin() + leftTrim; - const auto end = assembledSegment.runLengthSequence.end() - rightTrim; - SHASTA_ASSERT(begin <= end); - return span<const Base>(begin, end); -} - - - -span<const uint32_t> AssemblyPathSegment::trimmedRepeatCounts() const -{ - const auto begin = assembledSegment.repeatCounts.begin() + leftTrim; - const auto end = assembledSegment.repeatCounts.end() - rightTrim; - SHASTA_ASSERT(begin <= end); - return span<const uint32_t>(begin, end); -} - - - -span<const Base> AssemblyPathLink::trimmedRleSequence() const -{ - const auto begin = msaRleSequence.begin() + leftTrim; - const auto end = msaRleSequence.end() - rightTrim; - SHASTA_ASSERT(begin <= end); - return span<const Base>(begin, end); -} - - - -span<const uint32_t> AssemblyPathLink::trimmedRepeatCounts() const -{ - const auto begin = msaRepeatCounts.begin() + leftTrim; - const auto end = msaRepeatCounts.end() - rightTrim; - SHASTA_ASSERT(begin <= end); - return span<const uint32_t>(begin, end); -} - - - -void AssemblyPathSegment::getTrimmedRawSequence(vector<Base>& trimmedRawSequence) const -{ - - // Get the trimed RLE sequence and repeat counts. 
- const span<const Base> trimmedRleSequenceSpan = trimmedRleSequence(); - const span<const uint32_t> trimmedRepeatCountsSpan = trimmedRepeatCounts(); - SHASTA_ASSERT(trimmedRleSequenceSpan.size() == trimmedRepeatCountsSpan.size()); - - // Construct the raw sequence. - trimmedRawSequence.clear(); - for(uint64_t i=0; i<trimmedRleSequenceSpan.size(); i++) { - const Base b = trimmedRleSequenceSpan[i]; - const uint32_t r = trimmedRepeatCountsSpan[i]; - for(uint64_t k=0; k<r; k++) { - trimmedRawSequence.push_back(b); - } - } -} - - - -// Return a character to represent a repeat count -// when writing out RLE sequence. -char AssemblyPath::repeatCountCharacter(uint32_t r) { - if(r < 10) { - return '0' + char(r); - } else if(r < 36) { - return 'A' + char(r - 10); - } else { - return '*'; - } -} - - -void AssemblyPath::writeHtml(ostream& html, const AssemblyGraph& assemblyGraph) const -{ - SHASTA_ASSERT(segments.size() > 1); - SHASTA_ASSERT(links.size() == segments.size() - 1); - - writeHtmlSummary(html); - writeSequenceDialog(html); - writeHtmlDetail(html, assemblyGraph); -} - - - -void AssemblyPath::writeHtmlSummary(ostream& html) const -{ - html << - "<table>" << - "<tr><th class=left>First segment id<td class=centered>" << segments.front().id << - "<tr><th class=left>Last segment id<td class=centered>" << segments.back().id << - "<tr><th class=left>Number of segments<td class=centered>" << segments.size() << - "<tr><th class=left>Number of links<td class=centered>" << segments.size() - 1 << - "<tr><th class=left>Length of RLE sequence assembled<td class=centered>" << rleSequence.size() << - "<tr><th class=left>Length of raw sequence assembled<td class=centered>" << rawSequence.size() << - "</table>"; -} - - - -void AssemblyPath::writeHtmlDetail(ostream& html, const AssemblyGraph& assemblyGraph) const -{ - // Table legend. - html << - "<p>Hover on table headers for information on the meaning of each column."; - - - - // Table header. 
- html << - "<p>" - "<table style='table-layout:fixed;font-family:monospace;font-size:9'>" - "<tr>" - - "<th title ='S (segment) or L (link). " - "Primary segments have a light blue background. " - "Trivial links have a grey background.'>" - "<span class=rotated>Type" - - "<th title='Segment or link id'>" - "<span class=rotated>Id" - - "<th title='The number of oriented reads contributing to assembly of this segment or link. " - "This is not the same as average coverage on marker graph vertices or edges.'>" - "<span class=rotated>Coverage" - - "<th title='The id of the previous primary segment.'>" - "<span class=rotated>Previous<br>primary segment" - - "<th title='The id of the next primary segment.'>" - "<span class=rotated>Next<br>primary segment" - - "<th title='The fraction of oriented reads that appear on the " - "previous primary segment and are long enough to appear on this segment, but do not.'>" - "<span class=rotated>Unexplained fraction<br>on previous<br>primary segment" - - "<th title='The fraction of oriented reads that appear on the " - "next primary segment and are long enough to appear on this segment, but do not.'>" - "<span class=rotated>Unexplained fraction<br>on next<br>primary segment" - - "<th title='The position of the trimmed raw sequence of this segment or link " - "in the raw assembled sequence of the path.'>" - "<span class=rotated>Raw<br>position" - - "<th title='The complete raw sequence for this segment or link. " - "The red portion is trimmed out and not used for assembly.'>" - "Raw sequence" - - "<th title='Assembly details for non-trivial links.'>" - "<span class=rotated>Detail"; - - - - // Main body of the table. - // There is one row for each segment and one row for each link. - for(uint64_t position=0; position<segments.size(); position++) { - const AssemblyPathSegment& segment = segments[position]; - - // If not a primary segment, evaluate this segment against - // the previous and next primary segment. 
- AssemblyGraph::SegmentOrientedReadInformation info; - AssemblyGraph::SegmentOrientedReadInformation previousInfo; - AssemblyGraph::SegmentOrientedReadInformation nextInfo; - SegmentPairInformation previousSegmentPairInfo; - SegmentPairInformation nextSegmentPairInfo; - if(not segment.isPrimary) { - assemblyGraph.getOrientedReadsOnSegment(segment.id, info); - assemblyGraph.getOrientedReadsOnSegment(segment.previousPrimarySegmentId, previousInfo); - assemblyGraph.getOrientedReadsOnSegment(segment.nextPrimarySegmentId, nextInfo); - assemblyGraph.analyzeSegmentPair( - segment.previousPrimarySegmentId, segment.id, - previousInfo, info, - assemblyGraph.markers, previousSegmentPairInfo); - assemblyGraph.analyzeSegmentPair( - segment.nextPrimarySegmentId, segment.id, - nextInfo, info, - assemblyGraph.markers, nextSegmentPairInfo); - } - - // Write a row for the segment at this position. - const AssembledSegment& assembledSegment = segment.assembledSegment; - html << "<tr"; - if(segment.isPrimary) { - html << " style='background-color:LightCyan' title='Primary segment'"; - } else { - html << " title='Secondary segment'"; - } - html << - ">" - "<td class=centered>S" - "<td class=centered>" << segment.id << - "<td>" << assemblyGraph.coverage(segment.id); - if(segment.isPrimary) { - html << "<td><td><td><td>"; - } else { - const auto oldPrecision = html.precision(2); - const auto oldFlags = html.setf(std::ios_base::fixed, std::ios_base::floatfield); - html << - "<td class=centered>" << segment.previousPrimarySegmentId << - "<td class=centered>" << segment.nextPrimarySegmentId << - "<td class=centered>" << previousSegmentPairInfo.unexplainedFraction(0) << - "<td class=centered>" << nextSegmentPairInfo.unexplainedFraction(0); - html.precision(oldPrecision); - html.flags(oldFlags); - } - html << "<td class=centered>" << segment.rawPosition; - - - - // Raw sequence for this segment. 
- html << "<td class=centered style='max-width:500px;word-wrap:break-word'>"; - if(segment.leftTrim + segment.rightTrim > assembledSegment.runLengthSequence.size()) { - - // Exceptional case where the left and right trim overlap. - html << "<span style='background-color:LightCoral'>"; - for(uint64_t i=0; i<assembledSegment.runLengthSequence.size(); i++) { - const Base b = assembledSegment.runLengthSequence[i]; - const uint32_t r = assembledSegment.repeatCounts[i]; - if(i == assembledSegment.runLengthSequence.size() - segment.rightTrim) { - html << "</span><span style='background-color:Fuchsia'>"; - } - for(uint32_t k=0; k<r; k++) { - html << b; - } - if(i == segment.leftTrim - 1) { - html << "</span><span style='background-color:LightCoral'>"; - } - } - html << "</span><td>"; - - } else { - - // Normal case. - html << "<span style='background-color:LightCoral'>"; - for(uint64_t i=0; i<assembledSegment.runLengthSequence.size(); i++) { - const Base b = assembledSegment.runLengthSequence[i]; - const uint32_t r = assembledSegment.repeatCounts[i]; - if(i == segment.leftTrim) { - html << "</span>"; - } - for(uint32_t k=0; k<r; k++) { - html << b; - } - if(i == assembledSegment.runLengthSequence.size() -1 - segment.rightTrim) { - html << "<span style='background-color:LightCoral'>"; - } - - } - html << "</span><td>"; - } - - - - // Write a row for the link. 
- if(position == links.size()) { - break; - } - const AssemblyPathLink& link = links[position]; - html << "<tr"; - if(link.isTrivial) { - html << " style='background-color:LightGray' title='Trivial link'"; - } else { - html << " title='Non-trivial link'"; - } - html << - "><td class=centered>L" << - "<td class=centered>" << link.id << - "<td class=centered>"; - - if(not link.isTrivial) { - html << link.coverage; - } - - html << - "<td class=centered>" << link.previousPrimarySegmentId << - "<td class=centered>" << link.nextPrimarySegmentId << - "<td><td><td class=centered>"; - - if(not link.isTrivial) { - html << link.rawPosition; - } - - // Raw sequence for this link. - html << "<td class=centered style='max-width:300px;word-wrap:break-word'>"; - html << "<span style='background-color:LightCoral'>"; - for(uint64_t i=0; i<link.msaRleSequence.size(); i++) { - const Base b = link.msaRleSequence[i]; - const uint32_t r = link.msaRepeatCounts[i]; - if(i == link.leftTrim) { - html << "</span>"; - } - for(uint32_t k=0; k<r; k++) { - html << b; - } - if(i == link.msaRleSequence.size() -1 - link.rightTrim) { - html << "<span style='background-color:LightCoral'>"; - } - } - html << "</span>"; - - html << "<td class=centered>"; - if(not link.isTrivial) { - html << "<a href='exploreMode3LinkAssembly?linkId=" << link.id << - "&previousPrimarySegmentId=" << link.previousPrimarySegmentId << - "&nextPrimarySegmentId=" << link.nextPrimarySegmentId << - "'>Detail</a>"; - } - } - - // End the table. - html << "</table>"; - -} - - - -// This writes out a dialog that permit displaying -// selected portions of the path assembled sequence. 
-void AssemblyPath::writeSequenceDialog(ostream& html) const -{ - html << "<script>var assembledSequence = '"; - copy(rawSequence, ostream_iterator<Base>(html)); - html << "';</script>"; - - html << R"zzz( -<form onsubmit="displaySequence(); return false;"> -<br><input type=submit value='Display assembled sequence'> - in the position range <input type=text id=begin> - to <input type=text id=end> -</form> -<script> -function displaySequence() -{ - var beginString = document.getElementById('begin').value; - var endString = document.getElementById('end').value; - var begin = parseInt(beginString); - var end = parseInt(endString); - if((end < begin) || (end > assembledSequence.length)) { - document.getElementById("assembledSequence").innerText = ""; - } else { - document.getElementById("assembledSequence").innerText = assembledSequence.substring(begin, end); - } -} -</script> -<p id=assembledSequence style='font-family:monospace;font-size:9pt;word-wrap:break-word;'> - )zzz"; -} diff --git a/src/mode3-AssemblyPath.hpp b/src/mode3-AssemblyPath.hpp deleted file mode 100644 index 5e4d023..0000000 --- a/src/mode3-AssemblyPath.hpp +++ /dev/null @@ -1,206 +0,0 @@ -#ifndef SHASTA_MODE3_ASSEMBLY_PATH_HPP -#define SHASTA_MODE3_ASSEMBLY_PATH_HPP - -// Shasta. -#include "AssembledSegment.hpp" -#include "invalid.hpp" - -// Standard library. -#include "cstdint.hpp" -#include "span.hpp" -#include "utility.hpp" -#include "vector.hpp" - -namespace shasta { - namespace mode3 { - class AssemblyPath; - class AssemblyPathLink; - class AssemblyPathSegment; - - class AssemblyGraph; - class Transition; - } - - class Base; - class ConsensusCaller; - class OrientedReadId; -} - - - -// A segment in an AssemblyPath. -class shasta::mode3::AssemblyPathSegment { -public: - - // The id of this segment, in the AssemblyGraph. - uint64_t id; - - // Each primary segment in the path has high Jaccard similarity - // with the previous primary segment. 
- // The first and last segment are always primary segments. - bool isPrimary; - - // For secondary segments only (isPrimary is false) we store the - // segment id of the previous and next primary segment. - uint64_t previousPrimarySegmentId = invalid<uint64_t>; - uint64_t nextPrimarySegmentId = invalid<uint64_t>; - - // The AssembledSegment contains the sequence for this segment - // plus information on how the sequence was extracted from the - // marker graph. - // The sequence includes the first and last marker graph vertex - // of this segment. - AssembledSegment assembledSegment; - - // For assembly of the path sequence, we don't use the entire - // sequence of the AssembledSegment. - // We trim some bases at each end to avoid overlap - // with adjacent segments and links. - // When a segment is adjacent to a non-trivial link, - // we give priority to link sequence over segment sequence. - // The reason is that sequence assembled from links - // is generally more accurate because it is assembled - // using only a restricted set of - // oriented reads that are believed to originate from the - // sequence copy we are assembling. - uint64_t leftTrim = 0; - uint64_t rightTrim = 0; - span<const Base> trimmedRleSequence() const; - span<const uint32_t> trimmedRepeatCounts() const; - void getTrimmedRawSequence(vector<Base>&) const; - - // The position of the trimmed sequence of this segment - // in the assembled sequence of the path. - uint64_t rlePosition = 0; - uint64_t rawPosition = 0; - - // Constructor. - AssemblyPathSegment(uint64_t id, bool isPrimary); -}; - - - -// A link in an AssemblyPath. -class shasta::mode3::AssemblyPathLink { -public: - - // The id of this segment, in the AssemblyGraph. - uint64_t id; - - // A link is trivial if the last marker graph vertex - // of the source segment coincides with the first marker - // graph vertex of the target segment. 
- // In this case the link does not need to be assembled - // and all the next fields are left empty. - bool isTrivial; - - // The number of oriented reads used to assemble this link. - // This is only filled in for non-trivial links. - uint64_t coverage = 0; - - // The last primary segment in the path preceding this link. - uint64_t previousPrimarySegmentId = invalid<uint64_t>; - - // The next primary segment in the path following this link. - uint64_t nextPrimarySegmentId = invalid<uint64_t>; - - // The RLE sequence as computed by the MSA - // of oriented reads in the link. - // This overlaps with adjacent segments. - vector<Base> msaRleSequence; - vector<uint32_t> msaRepeatCounts; - - // The trimmed RLE sequence, to be used for assembly, is obtained from - // the MSA sequence by removing bases at the two ends - // that are identical with the adjacent segments. - uint64_t leftTrim = 0; - uint64_t rightTrim = 0; - span<const Base> trimmedRleSequence() const; - span<const uint32_t> trimmedRepeatCounts() const; - - // The position of the trimmed sequence of this link - // in the assembled sequence of the path. - uint64_t rlePosition = 0; - uint64_t rawPosition = 0; -}; - - - -// An assembly path in the mode3::AssemblyGraph -class shasta::mode3::AssemblyPath { -public: - - // The segments and links on the path. - vector<AssemblyPathSegment> segments; - vector<AssemblyPathLink> links; - - // Top level function to assemble sequence for this path. - void assemble(const AssemblyGraph&); - - // Assemble the sequence of each segment. - void assembleSegments(const AssemblyGraph&); - void writeSegmentSequences(); - - // Initialize the links. - // This only resizes the links vector and fills in the following fields of each link. - // - id - // - isTrivial - // - previousPrimarySegmentId - // - nextPrimarySegmentId - void initializeLinks(const AssemblyGraph&); - - // Assemble links in this assembly path. 
- void assembleLinks(const AssemblyGraph&); - void assembleLinkAtPosition( - const AssemblyGraph& assemblyGraph, - uint64_t position0, - ostream& html); - static void assembleTrivialLink( - AssemblyPathSegment& segment0, - AssemblyPathSegment& segment1, - AssemblyPathLink& link, - uint64_t k); - static void assembleNonTrivialLink( - const AssemblyGraph& assemblyGraph, - AssemblyPathSegment& segment0, - AssemblyPathSegment& segment1, - AssemblyPathLink& link, - ostream& html); - void writeLinkSequences(const AssemblyGraph&); - - // Final assembly of segments and links sequence into the path sequence. - void assemble(); - vector<Base> rleSequence; - vector<uint64_t> repeatCounts; - vector<Base> rawSequence; - - void clear(); - - // Use spoa to compute consensus sequence for a link. - static void computeLinkConsensusUsingSpoa( - const vector<OrientedReadId> orientedReadIds, - const vector< vector<Base> > rleSequences, - const vector< vector<uint32_t> > repeatCounts, - uint64_t readRepresentation, - const ConsensusCaller&, - bool debug, - ostream& html, - vector<Base>& consensusRleSequence, - vector<uint32_t>& consensusRepeatCounts - ); - - // Return a character to represent a repeat count - // when writing out RLE sequence. - static char repeatCountCharacter(uint32_t); - - // Html output. 
- void writeHtml(ostream&, const AssemblyGraph& assemblyGraph) const; - void writeHtmlSummary(ostream&) const; - void writeSequenceDialog(ostream&) const; - void writeHtmlDetail(ostream&, const AssemblyGraph& assemblyGraph) const; - -}; - - -#endif - diff --git a/src/mode3-Detangler.cpp b/src/mode3-Detangler.cpp deleted file mode 100644 index e659a14..0000000 --- a/src/mode3-Detangler.cpp +++ /dev/null @@ -1,415 +0,0 @@ -#include "mode3-Detangler.hpp" -#include "Base.hpp" -#include "deduplicate.hpp" -#include "mode3.hpp" -using namespace shasta; -using namespace mode3; - -#include "fstream.hpp" - - - -Detangler::Detangler(const AssemblyGraph& assemblyGraph) -{ - // ****** EXPOSE WHEN CODE STABILIZES - const uint64_t minLinkCoverage = 6; - - createJourneys(assemblyGraph); - createInitialClusters(); - cout << "The initial Detangler has " << clusters.size() << " clusters." << endl; - - uint64_t count = 0; - for(auto& p: clusters) { - for(Cluster& cluster: p.second) { - if(simpleDetangle(&cluster, minLinkCoverage)) { - ++count; - } - } - } - cout << "Detangled " << count << " clusters out of " << clusters.size() << endl; - - writeGfa("Detangler.gfa", minLinkCoverage, assemblyGraph.segmentSequences, assemblyGraph.k); -} - - - - -// To create the journeys, simply extract the segmentIds from the assemblyGraphJourneys. -void Detangler::createJourneys(const AssemblyGraph& assemblyGraph) -{ - const uint64_t journeyCount = assemblyGraph.assemblyGraphJourneys.size(); - - journeys.clear(); - journeys.resize(journeyCount); - for(uint64_t i=0; i<journeyCount; i++) { - const span<const AssemblyGraphJourneyEntry> assemblyGraphJourney = assemblyGraph.assemblyGraphJourneys[i]; - Journey& journey = journeys[i]; - - for(const AssemblyGraphJourneyEntry& assemblyGraphJourneyEntry: assemblyGraphJourney) { - journey.push_back(Step(assemblyGraphJourneyEntry.segmentId)); - } - } -} - - - -// Initially, we create a Cluster for each segmentId. 
-void Detangler::createInitialClusters() -{ - - // Loop over all oriented reads. - const ReadId readCount = ReadId(journeys.size() / 2); - for(ReadId readId=0; readId<readCount; readId++) { - for(Strand strand=0; strand<2; strand++) { - const OrientedReadId orientedReadId(readId, strand); - - // Get the Journey for this oriented read. - Journey& journey = journeys[orientedReadId.getValue()]; - - // Loop over Step(s) in this Journey. - StepInfo stepInfo; - stepInfo.orientedReadId = orientedReadId; - for(uint64_t position=0; position<journey.size(); position++) { - stepInfo.position = position; - Step& step = journey[position]; - const uint64_t segmentId = step.segmentId; - - // Locate the Cluster corresponding to this segment, - // creating it if necessary. - ClusterContainer::iterator it = clusters.find(segmentId); - if(it == clusters.end()) { - tie(it, ignore) = clusters.insert(make_pair(segmentId, std::list<Cluster>())); - it->second.push_back(Cluster(segmentId, 0)); - } - std::list<Cluster>& segmentClusters = it->second; - - // Sanity check: this segmentId must correspond to exactly one Cluster. - SHASTA_ASSERT(segmentClusters.size() == 1); - Cluster& cluster = segmentClusters.front(); - - // Add this Step to the Cluster. - cluster.steps.push_back(stepInfo); - step.cluster = &cluster; - } - } - } -} - - - -// Find the next/previous cluster for each of the steps in a given cluster. -// The output vector has size equal to the number of steps in this cluster, -// and the corresponding OrientedReadId(s) are the same -// as the ones in the steps vector for the given cluster. -// Some of the pointers returned can be zero. This can happen if this -// cluster is the first or last cluster in the journey of an oriented read. -void Detangler::findNextClusters( - const Cluster* cluster0, - vector<const Cluster*>& nextClusters - ) const -{ - nextClusters.clear(); - - // Loop over the steps of this cluster. 
- for(const StepInfo& stepInfo: cluster0->steps) { - const OrientedReadId orientedReadId = stepInfo.orientedReadId; - const uint64_t position = stepInfo.position; - - // Get journey for this oriented read. - const Journey& journey = journeys[orientedReadId.getValue()]; - - // Locate the cluster at the next position in the journey. - // There is none if we are at the end of the journey. - const Cluster* cluster1 = 0; - const uint64_t nextPosition = position + 1; - if(nextPosition < journey.size()) { - cluster1 = journey[nextPosition].cluster; - } - - // Store it in the output vector. - nextClusters.push_back(cluster1); - }; - -} -void Detangler::findPreviousClusters( - const Cluster* cluster0, - vector<const Cluster*>& previousClusters - ) const -{ - previousClusters.clear(); - - // Loop over the steps of this cluster. - for(const StepInfo& stepInfo: cluster0->steps) { - const OrientedReadId orientedReadId = stepInfo.orientedReadId; - const uint64_t position = stepInfo.position; - - // Get the journey for this oriented read. - const Journey& journey = journeys[orientedReadId.getValue()]; - - // Locate the cluster at the previous position in the journey. - // There is none if we are at the end of the journey. - const Cluster* cluster1 = 0; - if(position > 0) { - const uint64_t previousPosition = position - 1; - cluster1 = journey[previousPosition].cluster; - } - - // Store it in the output vector. - previousClusters.push_back(cluster1); - }; - -} - - - -// Simple, classical detangling of a single cluster. -bool Detangler::simpleDetangle(Cluster* cluster0, uint64_t minLinkCoverage) -{ - // ****** EXPOSE WHEN CODE STABILIZES - const uint64_t maxDiscordantCount = 2; - const uint64_t minConcordantCount = 8; - - const bool debug = true; - - // Find the previous clusters for each of the steps in this cluster. 
- vector<const Cluster*> previousClusters; - findPreviousClusters(cluster0, previousClusters); - SHASTA_ASSERT(previousClusters.size() == cluster0->steps.size()); - - - // Find the next clusters for each of the steps in this cluster. - vector<const Cluster*> nextClusters; - findNextClusters(cluster0, nextClusters); - SHASTA_ASSERT(nextClusters.size() == cluster0->steps.size()); - - // Count the distinct previous clusters. - // They are stored sorted. - vector<const Cluster*> distinctPreviousClusters = previousClusters; - vector<uint64_t > distinctPreviousClustersCoverage; - deduplicateAndCount(distinctPreviousClusters, distinctPreviousClustersCoverage); - SHASTA_ASSERT(distinctPreviousClusters.size() == distinctPreviousClustersCoverage.size()); - - // If less than two, do nothing. - if(distinctPreviousClusters.size() < 2) { - return false; - } - - // Count the distinct previous clusters. - // They are stored sorted. - vector<const Cluster*> distinctNextClusters = nextClusters; - vector<uint64_t > distinctNextClustersCoverage; - deduplicateAndCount(distinctNextClusters, distinctNextClustersCoverage); - SHASTA_ASSERT(distinctNextClusters.size() == distinctNextClustersCoverage.size()); - - // If less than two, do nothing. - if(distinctPreviousClusters.size() < 2) { - return false; - } - - // Only keep the previous clusters that have sufficient coverage and are not null. - vector< pair<const Cluster*, uint64_t> > previousWithCoverage; - for(uint64_t i=0; i<distinctPreviousClusters.size(); i++) { - const Cluster* cluster1 = distinctPreviousClusters[i]; - if(cluster1) { - const uint64_t coverage = distinctPreviousClustersCoverage[i]; - if(coverage >= minLinkCoverage) { - previousWithCoverage.push_back(make_pair(cluster1, coverage)); - } - } - } - - // Only keep the next clusters that have sufficient coverage and are not null. 
- vector< pair<const Cluster*, uint64_t> > nextWithCoverage; - for(uint64_t i=0; i<distinctNextClusters.size(); i++) { - const Cluster* cluster1 = distinctNextClusters[i]; - if(cluster1) { - const uint64_t coverage = distinctNextClustersCoverage[i]; - if(coverage >= minLinkCoverage) { - nextWithCoverage.push_back(make_pair(cluster1, coverage)); - } - } - } - - // Compute the tangle matrix. - // tangleMatrix[i][j] contains the number of oriented reads - // that come from the i-th previous cluster and go to the j-th previous cluster. - vector< vector<uint64_t> > tangleMatrix(previousWithCoverage.size(), vector<uint64_t>(nextWithCoverage.size(), 0)); - for(uint64_t i=0; i<previousWithCoverage.size(); i++) { - const Cluster* previousCluster = previousWithCoverage[i].first; - for(uint64_t j=0; j<nextWithCoverage.size(); j++) { - const Cluster* nextCluster = nextWithCoverage[j].first; - for(uint64_t k=0; k<previousClusters.size(); k++) { - if((previousClusters[k] == previousCluster) and (nextClusters[k] == nextCluster)) { - ++tangleMatrix[i][j]; - } - } - } - } - - // For now, only handle the 2 by 2 case. - if(not(previousWithCoverage.size() == 2 and nextWithCoverage.size() == 2)) { - return false; - } - - // Compute the sum of diagonal and off-diagonal terms. - const uint64_t diagonalSum = tangleMatrix[0][0] + tangleMatrix[1][1]; - const uint64_t offDiagonalSum = tangleMatrix[0][1] + tangleMatrix[1][0]; - - // Check if the criteria for detangle are satisfied. 
- const uint64_t concordantCount = max(diagonalSum, offDiagonalSum); - const uint64_t discordantCount = min(diagonalSum, offDiagonalSum); - if(concordantCount < minConcordantCount or discordantCount > maxDiscordantCount) { - return false; - } - - if(debug) { - cout << "Detangling " << cluster0->stringId() << "\n"; - cout << "Previous:\n"; - for(const auto& p: previousWithCoverage) { - cout << p.first->stringId() << " " << p.second << "\n"; - } - cout << "Next:\n"; - for(const auto& p: nextWithCoverage) { - cout << p.first->stringId() << " " << p.second << "\n"; - } - cout << "Tangle matrix:\n"; - for(uint64_t i=0; i<previousWithCoverage.size(); i++) { - const Cluster* previousCluster = previousWithCoverage[i].first; - for(uint64_t j=0; j<nextWithCoverage.size(); j++) { - const Cluster* nextCluster = nextWithCoverage[j].first; - cout << previousCluster->stringId() << " "; - cout << nextCluster->stringId() << " "; - cout << tangleMatrix[i][j] << "\n"; - } - } - cout << "Diagonal " << diagonalSum << "\n"; - cout << "Off-diagonal " << offDiagonalSum << "\n"; - - } - - - - // If getting here, we can detangle this cluster. - // This generates two new clusters for this segment. - const bool inPhase = diagonalSum > offDiagonalSum; - - // The new steps for cluster0. - vector<StepInfo> newSteps0; - - // Create the two new clusters. - const uint64_t segmentId = cluster0->segmentId; - std::list<Cluster>& segmentClusters = clusters[segmentId]; - segmentClusters.push_back(Cluster(segmentId, segmentClusters.size())); - Cluster& cluster1 = segmentClusters.back(); - segmentClusters.push_back(Cluster(segmentId, segmentClusters.size())); - Cluster& cluster2 = segmentClusters.back(); - - // Do the detangling. The steps that correspond to the dominant portion of the - // tangle matrix are moved to the new clusters. 
- for(uint64_t k=0; k<previousClusters.size(); k++) { - const StepInfo& step = cluster0->steps[k]; - const OrientedReadId orientedReadId = step.orientedReadId; - Journey& journey = journeys[orientedReadId.getValue()]; - const uint64_t position = step.position; - const Cluster* previousCluster = previousClusters[k]; - const Cluster* nextCluster = nextClusters[k]; - if(inPhase) { - if(previousCluster == previousWithCoverage[0].first and nextCluster == nextWithCoverage[0].first) { - // Add it to the steps of cluster1. - cluster1.steps.push_back(StepInfo(orientedReadId, position)); - journey[position].cluster = &cluster1; - } else if(previousCluster == previousWithCoverage[1].first and nextCluster == nextWithCoverage[1].first) { - // Add it to the steps of cluster2. - cluster2.steps.push_back(StepInfo(orientedReadId, position)); - journey[position].cluster = &cluster2; - } else { - // Leave it in cluster0. - newSteps0.push_back(StepInfo(orientedReadId, position)); - } - } else { - if(previousCluster == previousWithCoverage[0].first and nextCluster == nextWithCoverage[1].first) { - // Add it to the steps of cluster1. - cluster1.steps.push_back(StepInfo(orientedReadId, position)); - journey[position].cluster = &cluster1; - } else if(previousCluster == previousWithCoverage[1].first and nextCluster == nextWithCoverage[0].first) { - // Add it to the steps of cluster2. - cluster2.steps.push_back(StepInfo(orientedReadId, position)); - journey[position].cluster = &cluster2; - } else { - // Leave it in cluster0. - newSteps0.push_back(StepInfo(orientedReadId, position)); - } - } - } - - - - - // Update the steps of the cluster we just detangled. 
- cluster0->steps.swap(newSteps0); - - return true; -} - - - -void Detangler::writeGfa( - const string& fileName, - uint64_t minLinkCoverage, - const MemoryMapped::VectorOfVectors<Base, uint64_t>& segmentSequences, - uint64_t k) const -{ - ofstream gfa(fileName); - writeGfa(gfa, minLinkCoverage, segmentSequences, k); -} -void Detangler::writeGfa( - ostream& gfa, - uint64_t minLinkCoverage, - const MemoryMapped::VectorOfVectors<Base, uint64_t>& segmentSequences, - uint64_t k) const -{ - // Write the header line. - gfa << "H\tVN:Z:1.0\n"; - - // Write one segment for each cluster. - for(const auto& p: clusters) { - const uint64_t segmentId = p.first; - const auto sequence = segmentSequences[segmentId]; - for(const Cluster& cluster: p.second) { - gfa << "S\t" << cluster.stringId() << "\t"; - copy(sequence.begin()+k/2, sequence.end()-k/2, ostream_iterator<Base>(gfa)); - gfa << "\n"; - } - } - - // Write the links. - for(const auto& p: clusters) { - for(const Cluster& cluster0: p.second) { - - // Find the next clusters for each of the steps in this cluster. - vector<const Cluster*> nextClusters; - findNextClusters(&cluster0, nextClusters); - SHASTA_ASSERT(nextClusters.size() == cluster0.steps.size()); - - // Count the distinct previous clusters. - // They are stored sorted. 
- vector<const Cluster*> distinctNextClusters = nextClusters; - vector<uint64_t > distinctNextClustersCoverage; - deduplicateAndCount(distinctNextClusters, distinctNextClustersCoverage); - SHASTA_ASSERT(distinctNextClusters.size() == distinctNextClustersCoverage.size()); - - for(uint64_t i=0; i<distinctNextClusters.size(); i++) { - const Cluster* cluster1 = distinctNextClusters[i]; - if(cluster1) { - const uint64_t coverage = distinctNextClustersCoverage[i]; - if(coverage >= minLinkCoverage) { - gfa << "L\t" << cluster0.stringId() << "\t+\t" << cluster1->stringId() << "\t+\t*\n"; - } - } - } - } - } - -} - - diff --git a/src/mode3-Detangler.hpp b/src/mode3-Detangler.hpp deleted file mode 100644 index efe9477..0000000 --- a/src/mode3-Detangler.hpp +++ /dev/null @@ -1,153 +0,0 @@ -#ifndef SHASTA_MODE3_DETANGLER_HPP -#define SHASTA_MODE3_DETANGLER_HPP - -// Shasta. -#include "ReadId.hpp" - -// Standard library. -#include <list> -#include <map> -#include "utility.hpp" -#include "vector.hpp" - -/******************************************************************************* - -Class mode3::Detangler contains data structures and code used to detangle the -mode3::AssemblyGraph. - -In the Detangler, each oriented read is represented by the sequence -of AssemblyGraph segments it visits. This sequence is not necessarily a path -in the AssemblyGraph, unless the assembly graph was created with -minCoverage for links <=1. - -This sequence is called a Journey. In the AssemblyGraph, -it is represented as a sequence of AssemblyGraphJourneyEntry objects -and is stored in AssemblyGraph::assemblyGraphJourneys. - -In Detangler code, the journey is represented as a sequence of Step objects. -Step(s) are grouped into Cluster(s). All Step(s) in a Cluster refer to -the same segmentId, but there can be more than one Cluster for each segmentId. -At the beginning, there is exactly one Cluster for each segmentId, -but during the detangling process Cluster(s) can be split. 
- -Each Step stores the segmentId it refers to, and an iterator pointing to -the Cluster the Step currently belongs to. The segmentId for a Step never -changes, but the Cluster it points to can change during the detangling process. - -*******************************************************************************/ - -namespace shasta { - class Base; - namespace mode3 { - class Detangler; - class AssemblyGraph; - } - namespace MemoryMapped { - template<class T, class Int> class VectorOfVectors; - } -} - - - -class shasta::mode3::Detangler { -public: - - // See the comments at the top of this file for the meanings - // of Step, Journey, Cluster. - - class Cluster; - - class Step { - public: - const uint64_t segmentId; - const Cluster* cluster = 0; - - Step(uint64_t segmentId) : - segmentId(segmentId) {} - }; - - using Journey = vector<Step>; - - // The journey of each oriented read. - // Obtained from the AssemblyGraph::assemblyGraphJourneys. - // Indexed by OrientedReadId::getValue(). - vector<Journey> journeys; - - // Type used to identify a step in a journey. - class StepInfo { - public: - OrientedReadId orientedReadId; - - // The position of this entry in the journey of this oriented read. - uint64_t position; - - StepInfo() {} - StepInfo(OrientedReadId orientedReadId, uint64_t position) : - orientedReadId(orientedReadId), - position(position) {} - }; - - // A cluster is a set of Step(s) all corresponding to the same - // segment id. - class Cluster { - public: - uint64_t segmentId; - uint64_t id = 0; // Within that segmentId. - vector<StepInfo> steps; // Sorted by orientedReadId. - Cluster(uint64_t segmentId, uint64_t id) : - segmentId(segmentId), id(id) {} - string stringId() const - { - return to_string(segmentId) + "." + to_string(id); - } - }; - - // Store the clusters keyed by segmentId. - // Clusters are never removed. - // However, during detangling, the steps of a cluster - // can be moved to other clusters for the same segmentId. 
- // We use a list so pointers to Cluster(s) are not invalidated - // when elements are added. - using ClusterContainer = std::map<uint64_t, std::list<Cluster> >; - ClusterContainer clusters; - - - Detangler(const AssemblyGraph&); -private: - void createJourneys(const AssemblyGraph&); - void createInitialClusters(); - - // Find the next/previous cluster for each of the steps in a given cluster. - // The output vector has size equal to the number of steps in this cluster, - // and the corresponding OrientedReadId(s) are the same - // as the ones in the steps vector for the given cluster. - // Some of the pointers returned can be zero. This can happen if this - // cluster is the first or last cluster in the journey of an oriented read. - void findNextClusters( - const Cluster*, - vector<const Cluster*>& - ) const; - void findPreviousClusters( - const Cluster*, - vector<const Cluster*>& - ) const; - - // Simple, classical detangling of a single cluster. - bool simpleDetangle(Cluster*, uint64_t minLinkCoverage); - - void writeGfa( - const string& fileName, - uint64_t minLinkCoverage, - const MemoryMapped::VectorOfVectors<Base, uint64_t>& segmentSequences, - uint64_t k) const; - void writeGfa( - ostream&, - uint64_t minLinkCoverage, - const MemoryMapped::VectorOfVectors<Base, uint64_t>& segmentSequences, - uint64_t k) const; -}; - - - -#endif - diff --git a/src/mode3-JaccardGraph.cpp b/src/mode3-JaccardGraph.cpp deleted file mode 100644 index f4e3db4..0000000 --- a/src/mode3-JaccardGraph.cpp +++ /dev/null @@ -1,957 +0,0 @@ -#include "mode3-JaccardGraph.hpp" -#include "deduplicate.hpp" -#include "mode3.hpp" -#include "orderPairs.hpp" -#include "orderVectors.hpp" -#include "timestamp.hpp" -using namespace shasta; -using namespace mode3; - -// Boost libraries. -#include <boost/pending/disjoint_sets.hpp> -#include <boost/graph/topological_sort.hpp> - -// Standard library. 
-#include "fstream.hpp" - - - -// Create a JaccardGraph with the given number of vertices -// (one for each segment) and no edges. -JaccardGraph::JaccardGraph(uint64_t segmentCount) -{ - for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) { - vertexTable.push_back(add_vertex(JaccardGraphVertex(segmentId), *this)); - } -} - - -void AssemblyGraph::createJaccardGraph( - size_t threadCount - ) -{ - // EXPOSE WHEN CODE STABILIZES. - const uint64_t minComponentSize = 10; // Likely needs to be decreased. Keep high for debugging. - - cout << timestamp << "createJaccardGraph begins." << endl; - - // Create the JaccardGraph and its vertices. - const uint64_t segmentCount = markerGraphPaths.size(); - cout << "The total number of segments in the assembly graph is " << segmentCount << endl; - jaccardGraphPointer = make_shared<JaccardGraph>(segmentCount); - JaccardGraph& jaccardGraph = *jaccardGraphPointer; - - // Compute edges, in parallel. - jaccardGraph.threadEdges.resize(threadCount); - const uint64_t batchSize = 100; - setupLoadBalancing(segmentCount, batchSize); - runThreads(&AssemblyGraph::createJaccardGraphThreadFunction, threadCount); - jaccardGraph.storeEdges(); - jaccardGraph.writeGraphviz("JaccardGraph0.dot", false, false); - jaccardGraph.writeGraphviz("JaccardGraph0-Labeled.dot", false, true); - jaccardGraph.writeEdgesCsv("JaccardGraph0Edges.csv"); - cout << "The initial Jaccard graph has " << num_vertices(jaccardGraph) << - " vertices (segments) and " << num_edges(jaccardGraph) << " edges." << endl; - - // Clear all weak vertices. - jaccardGraph.clearWeakVertices(); - cout << "After clearing weak vertices, the Jaccard graph has " << num_vertices(jaccardGraph) << - " vertices (segments) and " << num_edges(jaccardGraph) << " edges." 
<< endl; - jaccardGraph.writeGraphviz("JaccardGraph1.dot", false, false); - jaccardGraph.writeGraphviz("JaccardGraph1-Labeled.dot", false, true); - jaccardGraph.writeEdgesCsv("JaccardGraph1Edges.csv"); - - // Compute all connected components of size at least minComponentSize. - jaccardGraph.computeConnectedComponents(minComponentSize); - - // Store the cluster id of each segment. - // Each connected component of the Jaccard graph with sufficient size - // generates a cluster. - createNew(clusterIds, "Mode3-ClusterIds"); - jaccardGraph.findClusters(clusterIds); - - // Compute assembly paths. - jaccardGraph.computeAssemblyPaths(); - - // Create the ExpandedJaccardGraph. - ExpandedJaccardGraph expandedJaccardGraph(jaccardGraph); - expandedJaccardGraph.writeGraphviz("ExpandedJaccardGraph0.dot"); - expandedJaccardGraph.merge(); - expandedJaccardGraph.writeGraphviz("ExpandedJaccardGraph1.dot"); - - cout << timestamp << "createJaccardGraph ends." << endl; -} - - - -void AssemblyGraph::createJaccardGraphThreadFunction(size_t threadId) -{ - // Loop over all batches assigned to this thread. - uint64_t begin, end; - while(getNextBatch(begin, end)) { - - // Loop over all segments assigned to this batch. - for(uint64_t segmentId=begin; segmentId!=end; ++segmentId) { - createJaccardGraphEdges(segmentId, jaccardGraphPointer->threadEdges[threadId]); - } - } -} - - - -void AssemblyGraph::createJaccardGraphEdges( - uint64_t segmentId, - vector<JaccardGraphEdgeInfo>& edges) -{ - for(uint64_t direction=0; direction<2; direction++) { - createJaccardGraphEdges(segmentId, direction, edges); - } -} - - - -// This follows an algorithm similar to the one used by createAssemblyPath3. -void AssemblyGraph::createJaccardGraphEdges( - uint64_t primarySegmentId, - uint64_t direction, - vector<JaccardGraphEdgeInfo>& edges) -{ - // EXPOSE WHEN CODE STABILIZES. - // FOR NOW THESE SHOULD BE THE SAME AS IN AssemblyGraph::createAssemblyPath3. 
- const uint64_t minCommonForLink = 3; - const uint64_t minCommonForPrimary = 3; - const double minJaccard = 0.75; - const int32_t minLinkSeparation = -20; - - // We start from primarySegmentId - // and move in the specified direction until we find segmentId1 with - // sufficiently high Jaccard similarity and number of - // common oriented reads with primarySegmentId. - // At each step, we choose the link that has the most common oriented - // reads with the primarySegmentId. - SegmentOrientedReadInformation infoPrimary; - getOrientedReadsOnSegment(primarySegmentId, infoPrimary); - JaccardGraphEdgeInfo edge; - edge.direction = direction; - uint64_t segmentId0 = primarySegmentId; - std::set<uint64_t> previousSegments; - while(true) { - - // Loop over outgoing or incoming links of segmentId0. - // Find the link with the most common reads with the primarySegmentId. - const auto linkIds = (direction == 0) ? linksBySource[segmentId0] : linksByTarget[segmentId0]; - if(linkIds.empty()) { - return; - } - uint64_t linkIdBest = invalid<uint64_t>; - uint64_t commonOrientedReadCountBest = 0; - for(const uint64_t linkId: linkIds) { - - // If link separation is too negative, skip it. - // The goal here is to avoid cycles in paths. - const Link& link = links[linkId]; - if(link.separation < minLinkSeparation) { - continue; - } - - // Count the number of common oriented reads between the reference segment and this link. - uint64_t commonOrientedReadCount; - analyzeSegmentLinkPair(primarySegmentId, linkId, commonOrientedReadCount); - - // If better than the one we have it, record it. - if(commonOrientedReadCount > commonOrientedReadCountBest) { - linkIdBest = linkId; - commonOrientedReadCountBest = commonOrientedReadCount; - } - } - if(commonOrientedReadCountBest < minCommonForLink) { - return; - } - const uint64_t linkId = linkIdBest; - - // Get the segment at the other side of this link. - const Link& link = links[linkId]; - const uint64_t segmentId1 = (direction==0) ? 
link.segmentId1 : link.segmentId0; - - // Check that we haven't been here before. - if(previousSegments.contains(segmentId1)) { - break; - } - previousSegments.insert(segmentId1); - - // Check segmentId1 against the primary segment. - SegmentOrientedReadInformation info1; - getOrientedReadsOnSegment(segmentId1, info1); - if(direction == 0) { - analyzeSegmentPair( - primarySegmentId, segmentId1, - infoPrimary, info1, - markers, edge.segmentPairInformation); - } else { - analyzeSegmentPair( - segmentId1, primarySegmentId, - info1, infoPrimary, - markers, edge.segmentPairInformation); - } - - // If the Jaccard similarity is high, we found the Jaccard graph edge - // we were looking for. - if( edge.segmentPairInformation.commonCount >= minCommonForPrimary and - edge.segmentPairInformation.jaccard() >= minJaccard) { - if(direction == 0) { - edge.segmentId0 = primarySegmentId; - edge.segmentId1 = segmentId1; - } else { - edge.segmentId0 = segmentId1; - edge.segmentId1 = primarySegmentId; - reverse(edge.segmentIds.begin(), edge.segmentIds.end()); - } - edges.push_back(edge); - return; - } - - edge.segmentIds.push_back(segmentId1); - segmentId0 = segmentId1; - } -} - - - -// This storesin the Jaccard graph the edges found by all threads. 
-void JaccardGraph::storeEdges() -{ - JaccardGraph& jaccardGraph = *this; - - for(const auto& threadEdges: threadEdges) { - for(const JaccardGraphEdgeInfo& info: threadEdges) { - - const uint64_t segmentId0 = info.segmentId0; - const uint64_t segmentId1 = info.segmentId1; - const JaccardGraph::vertex_descriptor v0 = vertexTable[segmentId0]; - const JaccardGraph::vertex_descriptor v1 = vertexTable[segmentId1]; - - edge_descriptor e; - bool edgeExists = false; - tie(e, edgeExists) = boost::edge(v0, v1, jaccardGraph); - if(not edgeExists) { - boost::add_edge(v0, v1, - JaccardGraphEdge(info.segmentPairInformation, info.direction, info.segmentIds), - jaccardGraph); - } else { - jaccardGraph[e].wasFoundInDirection[info.direction] = true; - } - } - } - threadEdges.clear(); -} - - - -// A strong vertex is one that is incident to at least one strong edge. -bool JaccardGraph::isStrongVertex(vertex_descriptor v) const -{ - const JaccardGraph& jaccardGraph = *this; - - // Check the out-edges. - BGL_FORALL_OUTEDGES(v, e, jaccardGraph, JaccardGraph) { - if(jaccardGraph[e].isStrong()) { - return true; - } - } - - // Check the in-edges. - BGL_FORALL_INEDGES(v, e, jaccardGraph, JaccardGraph) { - if(jaccardGraph[e].isStrong()) { - return true; - } - } - - // We did not find any strong edges. - return false; -} - - - - -// Remove all weak vertices. -void JaccardGraph::removeWeakVertices() -{ - JaccardGraph& jaccardGraph = *this; - - // Find the vertices we are going to remove. - vector<vertex_descriptor> verticesToBeRemoved; - BGL_FORALL_VERTICES(v, jaccardGraph, JaccardGraph) { - if(not isStrongVertex(v)) { - verticesToBeRemoved.push_back(v); - } - } - - // Remove the vertices we flagged. - for(const vertex_descriptor v: verticesToBeRemoved) { - removeVertex(v); - } - -} - - - -// Remove all edges to/from weak vertices. 
-void JaccardGraph::clearWeakVertices() -{ - JaccardGraph& jaccardGraph = *this; - - vector<vertex_descriptor> verticesToBeCleared; - BGL_FORALL_VERTICES(v, jaccardGraph, JaccardGraph) { - if(not isStrongVertex(v)) { - verticesToBeCleared.push_back(v); - } - } - - for(const vertex_descriptor v: verticesToBeCleared) { - clear_vertex(v, jaccardGraph); - } - -} - - - -// Remove a vertex, making sure to update the vertexTable. -void JaccardGraph::removeVertex(vertex_descriptor v) -{ - JaccardGraph& jaccardGraph = *this; - const uint64_t segmentId = jaccardGraph[v].segmentId; - vertexTable[segmentId] = null_vertex(); - clear_vertex(v, jaccardGraph); - remove_vertex(v, jaccardGraph); -} - - - -void JaccardGraph::writeGraphviz( - const string& fileName, - bool includeIsolatedVertices, - bool writeLabels) const -{ - ofstream file(fileName); - writeGraphviz(file, includeIsolatedVertices, writeLabels); -} - - - -void JaccardGraph::writeGraphviz( - ostream& graphOut, - bool includeIsolatedVertices, - bool writeLabels) const -{ - const JaccardGraph& jaccardGraph = *this; - - graphOut << "digraph JaccardGraph {" << endl; - - BGL_FORALL_VERTICES(v, jaccardGraph, JaccardGraph) { - if( includeIsolatedVertices or - in_degree(v, jaccardGraph) or - out_degree(v, jaccardGraph)) { - graphOut << jaccardGraph[v].segmentId; - if(writeLabels) { - graphOut << " [label=" << jaccardGraph[v].segmentId << "]"; - } - graphOut << ";\n"; - } - } - - BGL_FORALL_EDGES(e, jaccardGraph, JaccardGraph) { - const JaccardGraphEdge& edge = jaccardGraph[e]; - const JaccardGraph::vertex_descriptor v0 = source(e, jaccardGraph); - const JaccardGraph::vertex_descriptor v1 = target(e, jaccardGraph); - const uint64_t segmentId0 = jaccardGraph[v0].segmentId; - const uint64_t segmentId1 = jaccardGraph[v1].segmentId; - - graphOut << segmentId0 << "->" << segmentId1 << "["; - - // Color the edge based on the direction flags. 
- if(edge.wasFoundInDirection[0]) { - if(edge.wasFoundInDirection[1]) { - // Found in both directions. - graphOut << " color=black"; - } else { - // Only found in the forward direction. - graphOut << " color=red"; - } - } else { - if(edge.wasFoundInDirection[1]) { - // Only found in the backward direction. - graphOut << " color=green"; - } else { - SHASTA_ASSERT(0); - } - } - - if(writeLabels) { - graphOut << " label=\""; - for(const uint64_t segmentId: edge.segmentIds) { - graphOut << segmentId << "\\n"; - } - graphOut << "\""; - } - graphOut << "];\n"; - } - - graphOut << "}" << endl; - -} - - - -// Write edges in csv format. -void JaccardGraph::writeEdgesCsv(const string& fileName) const -{ - ofstream file(fileName); - writeEdgesCsv(file); -} -void JaccardGraph::writeEdgesCsv(ostream& csv) const -{ - const JaccardGraph& jaccardGraph = *this; - - csv << "SegmentId0,SegmentId1,FoundForward,FoundBackward,SegmentId\n"; - BGL_FORALL_EDGES(e, jaccardGraph, JaccardGraph) { - const JaccardGraphEdge& edge = jaccardGraph[e]; - const JaccardGraph::vertex_descriptor v0 = source(e, jaccardGraph); - const JaccardGraph::vertex_descriptor v1 = target(e, jaccardGraph); - const uint64_t segmentId0 = jaccardGraph[v0].segmentId; - const uint64_t segmentId1 = jaccardGraph[v1].segmentId; - - for(const uint64_t segmentId: edge.segmentIds) { - csv << segmentId0 << ","; - csv << segmentId1 << ","; - csv << int(edge.wasFoundInDirection[0]) << ","; - csv << int(edge.wasFoundInDirection[1]) << ","; - csv << segmentId << "\n"; - } - } -} - - - -// Compute all connected components of size at least minComponentSize. -// They are stored in order of decreasing size. -void JaccardGraph::computeConnectedComponents(uint64_t minComponentSize) -{ - const JaccardGraph& jaccardGraph = *this; - - // This must be called without removing any vertices. - const uint64_t segmentCount = num_vertices(jaccardGraph); - - // Compute connected components. 
- vector<uint64_t> rank(segmentCount); - vector<uint64_t> parent(segmentCount); - boost::disjoint_sets<uint64_t*, uint64_t*> disjointSets(&rank[0], &parent[0]); - for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) { - disjointSets.make_set(segmentId); - } - BGL_FORALL_EDGES(e, jaccardGraph, JaccardGraph) { - const JaccardGraph::vertex_descriptor v0 = source(e, jaccardGraph); - const JaccardGraph::vertex_descriptor v1 = target(e, jaccardGraph); - const uint64_t segmentId0 = jaccardGraph[v0].segmentId; - const uint64_t segmentId1 = jaccardGraph[v1].segmentId; - disjointSets.union_set(segmentId0, segmentId1); - } - - // Gather the segments in each connected component. - vector< vector<uint64_t> > allComponents(segmentCount); - for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) { - const uint64_t componentId = disjointSets.find_set(segmentId); - allComponents[componentId].push_back(segmentId); - } - - // Create a table of the components of size at least minComponentSize, - // sorted by decreasing size. - vector< pair<uint64_t, uint64_t> > componentTable; // pair(componentId, componentSize) - for(uint64_t componentId=0; componentId<segmentCount; componentId++) { - const uint64_t componentSize = allComponents[componentId].size(); - if(componentSize >= minComponentSize) { - componentTable.push_back(make_pair(componentId, componentSize)); - } - } - sort(componentTable.begin(), componentTable.end(), - OrderPairsBySecondOnlyGreater<uint64_t, uint64_t>()); - - // Store the connected components of size at least minComponentSize. 
- components.clear(); - for(uint64_t newComponentId=0; newComponentId<componentTable.size(); newComponentId++) { - const auto& p = componentTable[newComponentId]; - const uint64_t oldComponentId = p.first; - const uint64_t componentSize = p.second; - const vector<uint64_t>& component = allComponents[oldComponentId]; - SHASTA_ASSERT(component.size() == componentSize); - components.push_back(component); - } - - - // Write a histogram of component sizes. - vector<uint64_t> histogram; - for(const auto& p: componentTable) { - const uint64_t componentSize = p.second; - if(componentSize >= histogram.size()) { - histogram.resize(componentSize + 1, 0); - } - ++histogram[componentSize]; - } - ofstream csv("JaccardGraphComponentSizeHistogram.csv"); - csv << "Size,Frequency,Vertices,\n"; - for(uint64_t componentSize=1; componentSize<histogram.size(); componentSize++) { - const uint64_t frequency = histogram[componentSize]; - if(frequency > 0) { - csv << componentSize << ","; - csv << frequency << ","; - csv << frequency * componentSize << ","; - csv << "\n"; - } - } - -} - - - -// Compute connected component and store the component -// (define as a cluster) that each segment belongs to. -void JaccardGraph::findClusters( - MemoryMapped::Vector<uint64_t>& clusterIds) -{ - const JaccardGraph& jaccardGraph = *this; - - // This must be called without removing any vertices. - const uint64_t segmentCount = num_vertices(jaccardGraph); - - clusterIds.resize(segmentCount); - fill(clusterIds.begin(), clusterIds.end(), invalid<uint64_t>); - for(uint64_t componentId=0; componentId<components.size(); componentId++) { - const vector<uint64_t>& component = components[componentId]; - for(const uint64_t segmentId: component) { - clusterIds[segmentId] = componentId; - } - } - -} - - - -// Construction of the ExpandedJaccardGraph. -// Each vertex of the JaccardGraph generates a vertex in the ExpandedJaccardGraph. 
-// Each edge of the JaccardGraph generates a linear chain of vertices -// in the ExpandedJaccardGraph. -ExpandedJaccardGraph::ExpandedJaccardGraph(const JaccardGraph& jaccardGraph) -{ - using Graph = ExpandedJaccardGraph; - Graph& graph = *this; - - // Generate the vertices. - std::map<JaccardGraph::vertex_descriptor, Graph::vertex_descriptor> vertexMap; - BGL_FORALL_VERTICES(v, jaccardGraph, JaccardGraph) { - const Graph::vertex_descriptor u = add_vertex( - ExpandedJaccardGraphVertex(jaccardGraph[v].segmentId, true), graph); - vertexMap.insert(make_pair(v, u)); - } - - - - // Each edge of the JaccardGraph generates a linear chain of vertices - // in the ExpandedJaccardGraph. - BGL_FORALL_EDGES(e, jaccardGraph, JaccardGraph) { - const JaccardGraph::vertex_descriptor v0 = source(e, jaccardGraph); - const JaccardGraph::vertex_descriptor v1 = target(e, jaccardGraph); - const Graph::vertex_descriptor u0 = vertexMap[v0]; - const Graph::vertex_descriptor u1 = vertexMap[v1]; - const vector<uint64_t>& segmentIds = jaccardGraph[e].segmentIds; - - Graph::vertex_descriptor u = u0; - for(const uint64_t segmentId: segmentIds) { - const Graph::vertex_descriptor w = add_vertex( - ExpandedJaccardGraphVertex(segmentId, false), graph); - add_edge(u, w, graph); - u = w; - } - add_edge(u, u1, graph); - } -} - - - -void ExpandedJaccardGraph::writeGraphviz(const string& fileName) const -{ - ofstream s(fileName); - writeGraphviz(s); -} -void ExpandedJaccardGraph::writeGraphviz(ostream& s) const -{ - using Graph = ExpandedJaccardGraph; - const Graph& graph = *this; - - const bool debug = false; - - s << "digraph ExpandedJaccardGraph {" << endl; - - // We can't use the segment ids to identify vertices - // because each segment id can appear multiple times. 
- BGL_FORALL_VERTICES(v, graph, Graph) { - const ExpandedJaccardGraphVertex& vertex = graph[v]; - const double primaryFraction = vertex.primaryFraction(); - s << "\"" << v << "\" [label=\"" << vertex.segmentId; - if(debug) { - s << "\\n" << v; - } - s << "\\n" << vertex.primaryCount << "/" << vertex.totalCount << "\""; - const double H = primaryFraction / 3.; - const double S = 0.5; - const double V = 1.; - s << " style=filled fillcolor=\"" << H << " " << " " << S << " "<< V << "\""; - s << "];\n"; - } - - BGL_FORALL_EDGES(e, graph, Graph) { - const Graph::vertex_descriptor v0 = source(e, graph); - const Graph::vertex_descriptor v1 = target(e, graph); - - s << "\"" << v0 << "\"->\"" << v1 << "\";\n"; - } - - s << "}" << endl; - -} - - - -// Recursively merge pairs of vertices that have a common parent or child -// and that refer to the same segmentId. -void ExpandedJaccardGraph::merge() -{ - using Graph = ExpandedJaccardGraph; - Graph& graph = *this; - - const bool debug = false; - if(debug) { - cout << "ExpandedJaccardGraph::merge begins." << endl; - } - - std::set<Branch> branches; - BGL_FORALL_VERTICES(v, graph, Graph) { - if(out_degree(v, graph) > 1) { - branches.insert(make_pair(v, 0)); - } - if(in_degree(v, graph) > 1) { - branches.insert(make_pair(v, 1)); - } - } - - - - // Recursive merge. - vector<vertex_descriptor> neighbors; - while(not branches.empty()) { - const auto it = branches.begin(); - const vertex_descriptor v0 = it->first; - const uint64_t direction = it->second; - branches.erase(it); - - if(debug) { - cout << "Working on branch " << v0 << " " << direction << endl; - } - - // Gather the children or parents. 
- neighbors.clear(); - if(direction == 0) { - BGL_FORALL_OUTEDGES(v0, e, graph, Graph) { - neighbors.push_back(target(e, graph)); - } - } else if(direction == 1) { - BGL_FORALL_INEDGES(v0, e, graph, Graph) { - neighbors.push_back(source(e, graph)); - } - - } else { - SHASTA_ASSERT(0); - } - if(debug) { - cout << neighbors.size() << " neighbors:"; - for(const vertex_descriptor v: neighbors) { - cout << " " << v; - } - cout << endl; - } - SHASTA_ASSERT(neighbors.size() > 1); - - // Find a pair of neighbors with the same segmentId. - vertex_descriptor v1, v2; - bool found = false; - for(uint64_t i1=0; i1<neighbors.size()-1; i1++) { - v1 = neighbors[i1]; - for(uint64_t i2=i1+1; i2<neighbors.size(); i2++) { - v2 = neighbors[i2]; - if(graph[v1].segmentId == graph[v2].segmentId) { - found = true; - break; - } - } - if(found) { - break; - } - } - - // If we did not find a pair of neighbors with the same segmentId, - // there is nothing to do. We already removed this branch, so we - // are done. - if(not found) { - if(debug) { - cout << "No pair can be merged for this branch." << endl; - } - continue; - } - if(debug) { - cout << "Merging " << v1 << " " << v2 << endl; - } - - // Merge v1 and v2, and update the branches. - merge(v1, v2, branches, debug); - - } - - if(debug) { - cout << "ExpandedJaccardGraph::merge ends." << endl; - } -} - - - -// Merge v1 and v2 while updating the set of branches. -void ExpandedJaccardGraph::merge( - vertex_descriptor v1, - vertex_descriptor v2, - std::set<Branch>& branches, - bool debug) -{ - using Graph = ExpandedJaccardGraph; - Graph& graph = *this; - - const ExpandedJaccardGraphVertex& vertex1 = graph[v1]; - const ExpandedJaccardGraphVertex& vertex2 = graph[v2]; - - // Check the segmentId. - const uint64_t segmentId = vertex1.segmentId; - SHASTA_ASSERT(segmentId == vertex2.segmentId); - - // Find the children of v1 and v2. - // These will be the children of the merged vertex v3. 
- vector<vertex_descriptor> children; - BGL_FORALL_OUTEDGES(v1, e, graph, Graph) { - children.push_back(target(e, graph)); - } - BGL_FORALL_OUTEDGES(v2, e, graph, Graph) { - children.push_back(target(e, graph)); - } - deduplicate(children); - - // Find the parents of v1 and v2. - // These will be the parents of the merged vertex v3. - vector<vertex_descriptor> parents; - BGL_FORALL_INEDGES(v1, e, graph, Graph) { - parents.push_back(source(e, graph)); - } - BGL_FORALL_INEDGES(v2, e, graph, Graph) { - parents.push_back(source(e, graph)); - } - deduplicate(parents); - - if(debug) { - cout << "Merging " << v1 << " " << v2 << endl; - cout << "Children:"; - for(const vertex_descriptor v: children) { - cout << " " << v; - } - cout << endl; - cout << "Parents:"; - for(const vertex_descriptor v: parents) { - cout << " " << v; - } - cout << endl; - } - - // Remove the branches that will be affected by the merge. - // We will add branches back as necessary. - for(const vertex_descriptor v: children) { - branches.erase(make_pair(v, 1)); - } - for(const vertex_descriptor v: parents) { - branches.erase(make_pair(v, 0)); - } - branches.erase(make_pair(v1, 0)); - branches.erase(make_pair(v1, 1)); - branches.erase(make_pair(v2, 0)); - branches.erase(make_pair(v2, 1)); - - // Create the merged vertex. - ExpandedJaccardGraphVertex vertex3; - vertex3.segmentId = segmentId; - vertex3.totalCount = vertex1.totalCount + vertex2.totalCount; - vertex3.primaryCount = vertex1.primaryCount + vertex2.primaryCount; - const vertex_descriptor v3 = add_vertex(vertex3, graph); - if(debug) { - cout << "Created merged vertex " << v3 << endl; - } - - // Remove the vertices that were merged, v1 and v2. - clear_vertex(v1, graph); - clear_vertex(v2, graph); - remove_vertex(v1, graph); - remove_vertex(v2, graph); - if(debug) { - cout << "Removed the merged vertices " << v1 << " " << v2 << endl; - } - - // Add the edges to/from the merged vertex. 
- for(const vertex_descriptor v: children) { - add_edge(v3, v, graph); - if(debug) { - cout << "Added edge " << v3 << " " << v << endl; - } - } - for(const vertex_descriptor v: parents) { - add_edge(v, v3, graph); - if(debug) { - cout << "Added edge " << v << " " << v3 << endl; - } - } - - // Add back any necessary branches. - if(out_degree(v3, graph) > 1) { - branches.insert(make_pair(v3, 0)); - if(debug) { - cout << "Added branch " << v3 << " " << 0 << endl; - } - } - if(in_degree(v3, graph) > 1) { - branches.insert(make_pair(v3, 1)); - if(debug) { - cout << "Added branch " << v3 << " " << 1 << endl; - } - } - for(const vertex_descriptor v: children) { - if(in_degree(v, graph) > 1) { - branches.insert(make_pair(v, 1)); - if(debug) { - cout << "Added branch " << v << " " << 1 << endl; - } - } - } - for(const vertex_descriptor v: parents) { - if(out_degree(v, graph) > 1) { - branches.insert(make_pair(v, 0)); - if(debug) { - cout << "Added branch " << v << " " << 0 << endl; - } - } - } -} - - - -// Compute assembly paths. -void JaccardGraph::computeAssemblyPaths() -{ - assemblyPaths.clear(); - for(uint64_t componentId=0; componentId<components.size(); componentId++) { - computeAssemblyPaths(componentId); - } -} -void JaccardGraph::computeAssemblyPaths(uint64_t componentId) -{ - const JaccardGraph& jaccardGraph = *this; - - const bool debug = true; - const vector<uint64_t>& component = components[componentId]; - if(debug) { - cout << "Computing assembly paths for component " << componentId << - " of size " << component.size() << endl; - } - - // Create a Graph to represent just this component. - // Each vertex of the Graph stores the corresponding - // vertex descriptor in the JaccardGraph. 
- using Graph = boost::adjacency_list< - boost::listS, boost::vecS, boost::bidirectionalS, - JaccardGraph::vertex_descriptor>; - Graph graph; - std::map<JaccardGraph::vertex_descriptor, Graph::vertex_descriptor> vertexMap; - for(uint64_t segmentId: component) { - const JaccardGraph::vertex_descriptor jv = vertexTable[segmentId]; - const Graph::vertex_descriptor gv = add_vertex(jv, graph); - vertexMap.insert(make_pair(jv, gv)); - } - BGL_FORALL_VERTICES(gv0, graph, Graph) { - const JaccardGraph::vertex_descriptor jv0 = graph[gv0]; - BGL_FORALL_OUTEDGES(jv0, e, jaccardGraph, JaccardGraph) { - const JaccardGraph::vertex_descriptor jv1 = target(e, jaccardGraph); - add_edge(vertexMap[jv0], vertexMap[jv1], graph); - } - } - if(debug) { - cout << "This component has " << num_vertices(graph) << - " vertices and " << num_edges(graph) << " edges." << endl; - } - - // Topological sort of this connected component. - vector<Graph::vertex_descriptor> reverseTopologicalSort; - try { - boost::topological_sort(graph, back_inserter(reverseTopologicalSort)); - } catch (boost::not_a_dag&) { - if(debug) { - cout << "Topological sort for this connected component failed." << endl; - cout << "Computation of assembly path will skip this connected component." << endl; - } - return; - } - - - - // Find the longest path in this component. - // See https://en.wikipedia.org/wiki/Longest_path_problem#Acyclic_graphs - vector<uint64_t> pathLength(component.size(), 0); - vector<Graph::vertex_descriptor> successor(component.size(), Graph::null_vertex()); - - // Process vertices in reverse topological order. - for(const Graph::vertex_descriptor gv0: reverseTopologicalSort) { - BGL_FORALL_OUTEDGES(gv0, e, graph, Graph) { - const Graph::vertex_descriptor gv1 = target(e, graph); - if(pathLength[gv1] + 1 > pathLength[gv0]) { - pathLength[gv0] = pathLength[gv1] + 1; - successor[gv0] = gv1; - } - } - } - - // Find the vertex with the longest pathLength. 
- // This will be the first vertex of the longest path. - Graph::vertex_descriptor gv0 = - std::max_element(pathLength.begin(), pathLength.end()) - pathLength.begin(); - - // Find the longest path by following the successors. - vector<uint64_t> longestPath; - longestPath.push_back(jaccardGraph[graph[gv0]].segmentId); - while(true) { - const Graph::vertex_descriptor gv1 = successor[gv0]; - if(gv1 == Graph::null_vertex()) { - break; - } - longestPath.push_back(jaccardGraph[graph[gv1]].segmentId); - gv0 = gv1; - } - - // Store the longest path. - assemblyPaths.push_back(longestPath); - - if(debug) { - cout << "Longest path has " << longestPath.size() << " segments:" << endl; - for(const uint64_t segmentId: longestPath) { - cout << segmentId << " "; - } - cout << endl; - } -} diff --git a/src/mode3-JaccardGraph.hpp b/src/mode3-JaccardGraph.hpp deleted file mode 100644 index f01f881..0000000 --- a/src/mode3-JaccardGraph.hpp +++ /dev/null @@ -1,256 +0,0 @@ -#ifndef SHASTA_MODE3_JACCARD_GRAPH_HPP -#define SHASTA_MODE3_JACCARD_GRAPH_HPP - -/******************************************************************************* - -The mode3::JaccardGraph is a directed graph in which each vertex represents -a segment in the mode3::AssemblyGraph. - -A directed edge S0->S1 is created if S0 and S1 have: -- A sufficient number of common reads. -- High Jaccard similarity. -- Low unexplained fractions. -(The above quantities defined as computed by -mode3::AssemblyGraph::analyzeSegmentPair). -For the edge to be created, we also require one of the following: -1. S1 is the first primary segment encountered starting from S0, - and performing a forward path search using the algorithm defined by - mode3::AssemblyGraph::createAssemblyPath3. -2. S0 is the first primary segment encountered starting from S1, - and performing a backward path search using the algorithm defined by - mode3::AssemblyGraph::createAssemblyPath3. 
- -*******************************************************************************/ - -// Shasta. -#include "mode3-SegmentPairInformation.hpp" - -// Boost libraries. -#include <boost/graph/adjacency_list.hpp> -#include <boost/graph/iteration_macros.hpp> - -// Standard library. -#include "cstdint.hpp" -#include "iosfwd.hpp" -#include <map> -#include "string.hpp" -#include "tuple.hpp" -#include "utility.hpp" -#include "vector.hpp" - - - -namespace shasta { - namespace mode3 { - class JaccardGraph; - class JaccardGraphEdge; - class JaccardGraphEdgeInfo; - class JaccardGraphVertex; - - using JaccardGraphBaseClass = boost::adjacency_list< - boost::listS, boost::listS, boost::bidirectionalS, - JaccardGraphVertex, JaccardGraphEdge>; - - class ExpandedJaccardGraph; - class ExpandedJaccardGraphVertex; - using ExpandedJaccardGraphBaseClass = boost::adjacency_list< - boost::setS, boost::listS, boost::bidirectionalS, - ExpandedJaccardGraphVertex>; - - } - - namespace MemoryMapped { - template<class T> class Vector; - } -} - - - -class shasta::mode3::JaccardGraphVertex { -public: - - // The assembly graph segment corresponding to this vertex. - uint64_t segmentId; -}; - - - -class shasta::mode3::JaccardGraphEdge { -public: - - // The SegmentPairInformation computed by - // mode3::AssemblyGraph::analyzeSegmentPair - // when called for (segmentId0, segmentId1), in this order. - SegmentPairInformation segmentPairInformation; - - // The segments encountered on the way. - vector<uint64_t> segmentIds; - - // Flags for the directions in which this edge was found - // (0=forward, 1=backward). - array<bool, 2> wasFoundInDirection = {false, false}; - - // A strong edge is one that was found in both directions. 
- bool isStrong() const - { - return wasFoundInDirection[0] and wasFoundInDirection[1]; - } - - JaccardGraphEdge( - const SegmentPairInformation& segmentPairInformation, - uint64_t direction, - const vector<uint64_t>& segmentIds) : - segmentPairInformation(segmentPairInformation), - segmentIds(segmentIds) - { - wasFoundInDirection[direction] = true; - } -}; - - - -// This is only used during parallel creation of the edges. -class shasta::mode3::JaccardGraphEdgeInfo { -public: - uint64_t segmentId0; - uint64_t segmentId1; - - // The direction in which we found this (0=forward, 1=backward). - uint64_t direction; - - // SegmentPairInformation between segmentId0 and segmentId1. - SegmentPairInformation segmentPairInformation; - - // The segments encountered on the way. - vector<uint64_t> segmentIds; -}; - - - -class shasta::mode3::JaccardGraph : public JaccardGraphBaseClass { -public: - - // Create a JaccardGraph with the given number of vertices - // (one for each segment) and no edges. - JaccardGraph(uint64_t segmentCount); - - // Map segment ids to vertices. - // If vertex is removed, the corresponding entry will be null_vertex(). - vector<vertex_descriptor> vertexTable; - - // Remove a vertex, making sure to update the vertexTable. - void removeVertex(vertex_descriptor v); - - // The edges found by each thread. - // Only used during edge creation. - vector< vector<JaccardGraphEdgeInfo> > threadEdges; - - // Use the threadEdges to add edges to the graph. - void storeEdges(); - - // A strong vertex is one that is incident to at least one strong edge. - bool isStrongVertex(vertex_descriptor) const; - - // Remove all weak vertices. - void removeWeakVertices(); - - // Remove all edges to/from weak vertices. - void clearWeakVertices(); - - // Write the JaccardGraph in graphviz format. 
- void writeGraphviz( - const string& fileName, - bool includeIsolatedVertices, - bool writeLabels) const; - void writeGraphviz( - ostream&, - bool includeIsolatedVertices, - bool writeLabels) const; - - // Write edges in csv format. - void writeEdgesCsv(const string& fileName) const; - void writeEdgesCsv(ostream&) const; - - // Compute all connected components of size at least minComponentSize. - // They are stored in order of decreasing size. - // The vectors contain segmentIds. Use the vertexMap - // to convert to file decriptors. - void computeConnectedComponents(uint64_t minComponentSize); - vector< vector<uint64_t> > components; - - // Each stored connected component generates a cluster. - void findClusters( - MemoryMapped::Vector<uint64_t>& clusterIds); - - // Compute assembly paths. - void computeAssemblyPaths(); - void computeAssemblyPaths(uint64_t componentId); - vector< vector<uint64_t> > assemblyPaths; - -}; - - - -class shasta::mode3::ExpandedJaccardGraphVertex { -public: - - // The assembly graph segment corresponding to this vertex. - uint64_t segmentId; - - // The total number of JaccardGraph vertices that were merged - // into this vertex. - uint64_t totalCount; - - // The number of primary JaccardGraph vertices that were merged - // into this vertex. - uint64_t primaryCount; - - // Construction - ExpandedJaccardGraphVertex() {} - ExpandedJaccardGraphVertex( - uint64_t segmentId, - bool isPrimary) : - segmentId(segmentId), - totalCount(1), - primaryCount(isPrimary ? 1 : 0) - {} - - double primaryFraction() const - { - return double(primaryCount) / double(totalCount); - } - -}; - - - -// The ExpandedJaccardGraph is constructed starting with vertices -// of the JaccardGraph, and expanding each of the edges into a linear -// chain of vertices. The graph is then cleaned up by merging equivalent branches. 
-class shasta::mode3::ExpandedJaccardGraph : public ExpandedJaccardGraphBaseClass { -public: - ExpandedJaccardGraph(const JaccardGraph&); - - // Write in graphviz format. - void writeGraphviz(const string& fileName) const; - void writeGraphviz(ostream&) const; - - // Recursively merge pairs of vertices that have a common parent or child - // and that refer to the same segmentId. - void merge(); -private: - - // Each Branch represents a pair (vertex_descriptor, direction) - // where direction can be: - // - 0 (forward). In this case the vertex has out_degree>1. - // or - // - 1 (backward). In this case the vertex has in_degree>1. - using Branch = pair<vertex_descriptor, uint64_t>; - - // Merge v1 and v2 while updating the set of branches. - void merge(vertex_descriptor v1, vertex_descriptor v2, std::set<Branch>&, bool debug); -}; - - - -#endif diff --git a/src/mode3-LocalAssembly.cpp b/src/mode3-LocalAssembly.cpp new file mode 100644 index 0000000..8b62f00 --- /dev/null +++ b/src/mode3-LocalAssembly.cpp @@ -0,0 +1,1997 @@ +// Shasta. +#include "mode3-LocalAssembly.hpp" +#include "Assembler.hpp" +#include "globalMsa.hpp" +#include "markerAccessFunctions.hpp" +#include "MarkerGraph.hpp" +#include "orderPairs.hpp" +#include "performanceLog.hpp" +#include "platformDependent.hpp" +#include "runCommandWithTimeout.hpp" +#include "Reads.hpp" +#include "timestamp.hpp" +using namespace shasta; +using namespace mode3; + +// Seqan. +#include <seqan/align.h> + +// Boost libraries. +#include <boost/pending/disjoint_sets.hpp> +#include <boost/graph/iteration_macros.hpp> +#include <boost/graph/strong_components.hpp> +#include <boost/uuid/uuid.hpp> +#include <boost/uuid/uuid_generators.hpp> +#include <boost/uuid/uuid_io.hpp> + +// Standard library. +#include "fstream.hpp" + + + +// The oriented reads common between edgeIdA and edgeIdB are always +// used for assembly. 
The oriented reads that appear only +// on edgeIdA or edgeIdB are used for assembly under control +// of useA and useB. +// So, if useA and useB are both true (the default), the assembly uses the +// union of the oriented reads on edgeIdA and edgeIdB. +// If they are both false, the assembly uses the +// intersection of the oriented reads on edgeIdA and edgeIdB. +// If useA is true and useB is false, the assembly uses the +// oriented reads on edgeIdA, regardless of whether they appear on edgeIdB. +// If useA is false and useB is true, the assembly uses the +// oriented reads on edgeIdB, regardless of whether they appear on edgeIdA. +LocalAssembly::LocalAssembly( + const Assembler& assembler, + MarkerGraphEdgeId edgeIdA, + MarkerGraphEdgeId edgeIdB, + uint64_t minVertexCoverage, // 0 = automatic + const LocalAssemblyDisplayOptions& displayOptions, + const Mode3AssemblyOptions::LocalAssemblyOptions& options, + bool useA, + bool useB) : + assembler(assembler), + edgeIdA(edgeIdA), + edgeIdB(edgeIdB), + options(displayOptions), + html(displayOptions.html) +{ + + + // Store the target vertex of edgeIdA and the source vertex of edgeIdB. + const MarkerGraph::Edge& edgeA = assembler.markerGraph.edges[edgeIdA]; + const MarkerGraph::Edge& edgeB = assembler.markerGraph.edges[edgeIdB]; + vertexIdA = edgeA.target; + vertexIdB = edgeB.source; + + // If the edges are adjacent, stop here, leaving the AssemblyPath empty. + // This results in empty secondary sequence. + if(vertexIdA == vertexIdB) { + if(html) { + html << "<br>The two edges are adjacent. Intervening sequence is empty."; + } + return; + } + + // Check assumptions here as this uses vertexIdA and vertexIdB. + checkAssumptions(); + + // Oriented reads. + gatherOrientedReads(useA, useB); + + // Use the oriented reads that appear both on vertexIdA and vertexIdB + // to estimate the base offset between vertexIdA and vertexIdB. + estimateOffset(); + + // If the offset is negative or cannot be estimated, stop here.
+ // This is pathological and results in empty assembled sequence. + if((estimatedABOffset == invalid<int64_t>) or (estimatedABOffset <= 0)) { + if(html) { + html << "<br>The estimated offset is not positive." << endl; + } + return; + } + + // Markers. + gatherMarkers(options.estimatedOffsetRatio); + writeOrientedReads(); + writeOrientedReadsSequences(); + + // Assembly graph. + alignAndDisjointSets( + options.matchScore, options.mismatchScore, options.gapScore, + options.maxSkipBases, + options.maxDrift, options.minHalfBand, options.minScoreRatio); + writeMarkers(); + + // Iteration to reduce minVertexCoverage if a long MSA is encountered. + while(true) { + + minVertexCoverage = createVertices(minVertexCoverage, options.vertexSamplingRate); + createEdges(); + writeGraph("Initial assembly graph"); + + // Remove strongly connected components, then regenerate + // edges from scratch with the remaining vertices. + if(removeStrongComponents() > 0) { + removeAllEdges(); + createEdges(); + writeGraph("Assembly graph after removal of strong connected components"); + } + + // This must be done after removing strongly connected components. + // Otherwise we can have inaccessible vertices that cause the + // assembly path to encounter dead ends. + if(removeInaccessibleVertices()) { + writeGraph("Assembly graph after removal of inaccessible vertices."); + } + + // Assemble. + findAssemblyPath(); + if(minVertexCoverage > 2) { + try { + assembleAssemblyPathEdges(options.maxMsaLength, LongMsaPolicy::throwException); + } catch(...) { + --minVertexCoverage; + clear(); + if(html and displayOptions.showDebugInformation) { + html << "<br>minVertexCoverage reduced to " << minVertexCoverage; + } + continue; + } + + } else { + assembleAssemblyPathEdges(options.maxMsaLength, LongMsaPolicy::assembleAtLowCoverage); + } + writeGraph("Assembly graph after assembly"); + + // Write assembled sequence. 
+ if(html) { + vector<Base> sequence; + getSecondarySequence(sequence); + + html << + "<h2>Assembled sequence</h2>" + "Assembled sequence not including the first and last edge is " << + sequence.size() << " bases long." + "<pre style='font-family:monospace'>\n"; + copy(sequence.begin(), sequence.end(), ostream_iterator<Base>(html)); + html << "</pre>"; + + ofstream fasta("LocalAssembly.fasta"); + fasta << ">LocalAssembly " << sequence.size() << endl; + copy(sequence.begin(), sequence.end(), ostream_iterator<Base>(fasta)); + + getCompleteSequence(sequence); + + html << + "Assembled sequence including the first and last edge is " << + sequence.size() << " bases long." + "<pre style='font-family:monospace'>\n"; + copy(sequence.begin(), sequence.end(), ostream_iterator<Base>(html)); + html << "</pre>"; + } + + break; + } + +} + + + +void LocalAssembly::checkAssumptions() const +{ + SHASTA_ASSERT(edgeIdA != edgeIdB); + SHASTA_ASSERT(assembler.assemblerInfo->assemblyMode == 3); + SHASTA_ASSERT(assembler.getReads().representation == 0); + SHASTA_ASSERT(not assembler.markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdA)); + SHASTA_ASSERT(not assembler.markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdB)); + + const MarkerGraph& markerGraph = assembler.markerGraph; + const auto& markers = assembler.markers; + + // edgeIdA and edgeIdB cannot have duplicate oriented reads. + if(markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdA)) { + throw runtime_error("Duplicated oriented read on edgeIdA."); + } + if(markerGraph.edgeHasDuplicateOrientedReadIds(edgeIdB)) { + throw runtime_error("Duplicated oriented read on edgeIdB."); + } + + // Neither can their source and target vertices. 
+ if(markerGraph.vertexHasDuplicateOrientedReadIds(vertexIdA, markers)) { + throw runtime_error("Duplicated oriented read on target vertex of edgeIdA."); + } + if(markerGraph.vertexHasDuplicateOrientedReadIds(vertexIdB, markers)) { + throw runtime_error("Duplicated oriented read on source vertex of edgeIdB."); + } +} + + + +void LocalAssembly::gatherOrientedReads(bool useA, bool useB) +{ + // Joint loop over marker intervals that appear in edgeIdA and/or edgeIdB. + const auto markerIntervalsA = assembler.markerGraph.edgeMarkerIntervals[edgeIdA]; + const auto markerIntervalsB = assembler.markerGraph.edgeMarkerIntervals[edgeIdB]; + const auto beginA = markerIntervalsA.begin(); + const auto beginB = markerIntervalsB.begin(); + const auto endA = markerIntervalsA.end(); + const auto endB = markerIntervalsB.end(); + auto itA = beginA; + auto itB = beginB; + while(true) { + if((itA == endA) and (itB == endB)) { + break; + } + + // Oriented reads that appear only in edgeIdA. + if((itB == endB) or (itA != endA and itA->orientedReadId < itB->orientedReadId)) { + + if(useA) { + const MarkerInterval& markerIntervalA = *itA; + const OrientedReadId orientedReadIdA = markerIntervalA.orientedReadId; + const uint32_t ordinalA = markerIntervalA.ordinals[1]; // Because vertexIdA is the target of edgeIdA + + OrientedReadInfo info(orientedReadIdA); + info.ordinalA = ordinalA; + orientedReadInfos.push_back(info); + } + + ++itA; + } + + + + // Oriented reads that appear only in edgeIdB. 
+ else if((itA == endA) or (itB != endB and itB->orientedReadId < itA->orientedReadId)) { + + if(useB) { + const MarkerInterval& markerIntervalB = *itB; + const OrientedReadId orientedReadIdB = markerIntervalB.orientedReadId; + const uint32_t ordinalB = markerIntervalB.ordinals[0]; // Because vertexIdB is the source of edgeIdB + + OrientedReadInfo info(orientedReadIdB); + info.ordinalB = ordinalB; + orientedReadInfos.push_back(info); + } + + ++itB; + } + + + + // Oriented reads that appear in both edgeIdA and edgeIdB. + // They are always used for assembly regardless of the settings of useA and useB. + else { + SHASTA_ASSERT(itA != endA); + SHASTA_ASSERT(itB != endB); + + const MarkerInterval& markerIntervalA = *itA; + const OrientedReadId orientedReadIdA = markerIntervalA.orientedReadId; + + const MarkerInterval& markerIntervalB = *itB; + const OrientedReadId orientedReadIdB = markerIntervalB.orientedReadId; + + SHASTA_ASSERT(orientedReadIdA == orientedReadIdB); + const OrientedReadId orientedReadId = orientedReadIdA; + + const uint32_t ordinalA = markerIntervalA.ordinals[1]; // Because vertexIdA is the target of edgeIdA + const uint32_t ordinalB = markerIntervalB.ordinals[0]; // Because vertexIdB is the source of edgeIdB + + // Only use it if the ordinal offset is not negative. 
+ if(ordinalB >= ordinalA) { + + OrientedReadInfo info(orientedReadId); + info.ordinalA = ordinalA; + info.ordinalB = ordinalB; + orientedReadInfos.push_back(info); + } + + ++itA; + ++itB; + } + + } +} + + + +void LocalAssembly::writeOrientedReads() const +{ + if(not html) { + return; + } + if(not options.showOrientedReads) { + return; + } + + html << + "<h2>Oriented reads</h2>" + "<table>" + "<tr>" + "<th>Index" + "<th>Oriented<br>read" + "<th>OrdinalA" + "<th>OrdinalB" + "<th>Ordinal<br>offset" + "<th>PositionA" + "<th>PositionB" + "<th>Position<br>offset" + "<th>First<br>ordinal" + "<th>Last<br>ordinal" + "<th>First<br>position" + "<th>Last<br>position" + ; + + for(uint64_t i=0; i<orientedReadInfos.size(); i++) { + const OrientedReadInfo& info = orientedReadInfos[i]; + + html << + "<tr>" + "<td class=centered>" << i << + "<td class=centered>" << info.orientedReadId; + + html << "<td class=centered>"; + if(info.isOnA()) { + html << info.ordinalA; + } + + html << "<td class=centered>"; + if(info.isOnB()) { + html << info.ordinalB; + } + + html << "<td class=centered>"; + if(info.isOnA() and info.isOnB()) { + html << info.ordinalOffset(); + } + + html << "<td class=centered>"; + if(info.isOnA()) { + html << basePosition(info.orientedReadId, info.ordinalA); + } + + html << "<td class=centered>"; + if(info.isOnB()) { + html << basePosition(info.orientedReadId, info.ordinalB); + } + + html << "<td class=centered>"; + if(info.isOnA() and info.isOnB()) { + const int64_t baseOffset = + basePosition(info.orientedReadId, info.ordinalB) - + basePosition(info.orientedReadId, info.ordinalA); + SHASTA_ASSERT(baseOffset >= 0); + html << baseOffset; + } + + SHASTA_ASSERT(not info.markerInfos.empty()); + const MarkerInfo& firstMarkerInfo = info.markerInfos.front(); + const MarkerInfo& lastMarkerInfo = info.markerInfos.back(); + html << + "<td class=centered>" << firstMarkerInfo.ordinal << + "<td class=centered>" << lastMarkerInfo.ordinal << + "<td class=centered>" << 
firstMarkerInfo.position << + "<td class=centered>" << lastMarkerInfo.position; + } + + html << "</table>"; + + + // Count reads. + uint64_t commonCount = 0; + uint64_t onlyACount = 0; + uint64_t onlyBCount = 0; + for(const OrientedReadInfo& info: orientedReadInfos) { + const bool isOnA = info.isOnA(); + const bool isOnB = info.isOnB(); + if(isOnA) { + if(isOnB) { + ++commonCount; + } else { + ++onlyACount; + } + } else { + if(isOnB) { + ++onlyBCount; + } else { + SHASTA_ASSERT(0); + } + + } + } + html << + "<p><table>" + "<tr><th class=left>Common<td class=centered>" << commonCount << + "<tr><th class=left>On A only<td class=centered>" << onlyACount << + "<tr><th class=left>On B only<td class=centered>" << onlyBCount << + "<tr><th class=left>Total<td class=centered>" << orientedReadInfos.size() << + "</table>"; + +} + + + +// Get the base position of a marker in an oriented read +// given the ordinal. +int64_t LocalAssembly::basePosition(OrientedReadId orientedReadId, int64_t ordinal) const +{ + const MarkerId markerId = assembler.getMarkerId(orientedReadId, uint32_t(ordinal)); + const int64_t position = int64_t(assembler.markers.begin()[markerId].position); + return position; + +} + + + +void LocalAssembly::estimateOffset() +{ + int64_t n = 0; + int64_t sum = 0; + for(const OrientedReadInfo& info: orientedReadInfos) { + if(info.isOnA() and info.isOnB()) { + const OrientedReadId orientedReadId = info.orientedReadId; + const int64_t positionA = basePosition(orientedReadId, info.ordinalA); + const int64_t positionB = basePosition(orientedReadId, info.ordinalB); + const int64_t baseOffset = positionB - positionA; + SHASTA_ASSERT(baseOffset >= 0); + + sum += baseOffset; + ++n; + } + } + if(n == 0) { + estimatedABOffset = invalid<int64_t>; + + if(html) { + html << "<br>The offset cannot be estimated because there are no common oriented reads between " << + edgeIdA << " and " << edgeIdB; + } + } else { + estimatedABOffset = int64_t(std::round(double(sum) / double(n))); 
+ + if(html) { + html << "<br>Estimated position offset is " << estimatedABOffset << " bases."; + } + } +} + + + +// Fill in the markerInfos vector of each read. +void LocalAssembly::gatherMarkers(double estimatedOffsetRatio) +{ + const int64_t offsetThreshold = int64_t(estimatedOffsetRatio * double(estimatedABOffset)); + + + // Loop over our oriented reads. + for(uint64_t i=0; i<orientedReadInfos.size(); i++) { + OrientedReadInfo& info = orientedReadInfos[i]; + const OrientedReadId orientedReadId = info.orientedReadId; + info.markerInfos.clear(); + + // Oriented reads that appear on both edgeIdA and edgeIdB. + if(info.isOnA() and info.isOnB()) { + for(int64_t ordinal=info.ordinalA; + ordinal<=info.ordinalB; ordinal++) { + addMarkerInfo(i, ordinal); + } + } + + // Oriented reads that appear on edgeIdA but not on edgeIdB. + else if(info.isOnA() and not info.isOnB()) { + const int64_t maxPosition = basePosition(orientedReadId, info.ordinalA) + offsetThreshold; + const int64_t markerCount = int64_t(assembler.markers.size(orientedReadId.getValue())); + + for(int64_t ordinal=info.ordinalA; + ordinal<markerCount; ordinal++) { + const int64_t position = basePosition(orientedReadId, ordinal); + if(position > maxPosition) { + break; + } + addMarkerInfo(i, ordinal); + } + } + + // Oriented reads that appear on edgeIdB but not on edgeIdA. + else if(info.isOnB() and not info.isOnA()) { + const int64_t minPosition = basePosition(orientedReadId, info.ordinalB) - offsetThreshold; + + for(int64_t ordinal=info.ordinalB; ordinal>=0; ordinal--) { + const int64_t position = basePosition(orientedReadId, ordinal); + if(position < minPosition) { + break; + } + addMarkerInfo(i, ordinal); + } + + // We added the MarkerInfos in reverse order, so we have to reverse them. + reverse(info.markerInfos.begin(), info.markerInfos.end()); + } + + else { + SHASTA_ASSERT(0); + } + } + +} + + + +// Add the marker at given ordinal to the i-th oriented read. 
+void LocalAssembly::addMarkerInfo(uint64_t i, int64_t ordinal) +{ + OrientedReadInfo& info = orientedReadInfos[i]; + + MarkerInfo markerInfo; + markerInfo.ordinal = ordinal; + markerInfo.position = basePosition(info.orientedReadId, ordinal); + markerInfo.kmerId = getOrientedReadMarkerKmerId( + info.orientedReadId, + uint32_t(ordinal), + assembler.assemblerInfo->k, + assembler.getReads(), + assembler.markers); + + info.markerInfos.push_back(markerInfo); +} + + + +void LocalAssembly::writeMarkers() +{ + if(not (html and options.showMarkers)) { + return; + } + + const uint64_t k = assembler.assemblerInfo->k; + + html << + "<h2>Markers used in this assembly step</h2>" + "<table>" + "<tr>" + "<th>Oriented<br>read<br>index" + "<th>Oriented<br>read" + "<th>Ordinal" + "<th>Ordinal<br>offset<br>from A" + "<th>Ordinal<br>offset<br>to B" + "<th>Position" + "<th>KmerId" + "<th>Kmer" + "<th>Vertex" + "<th>Coverage"; + + for(uint64_t i=0; i<orientedReadInfos.size(); i++) { + const OrientedReadInfo& info = orientedReadInfos[i]; + for(const MarkerInfo& markerInfo: info.markerInfos) { + const Kmer kmer(markerInfo.kmerId, k); + + html << + "<tr>" + "<td class=centered>" << i << + "<td class=centered>" << info.orientedReadId << + "<td class=centered>" << markerInfo.ordinal; + + // Ordinal offset from A. + html << "<td class=centered>"; + if(info.isOnA()) { + html << markerInfo.ordinal - info.markerInfos.front().ordinal; + } + + // Ordinal offset to B. 
+ html << "<td class=centered>"; + if(info.isOnB()) { + html << info.markerInfos.back().ordinal - markerInfo.ordinal; + } + + html << + "<td class=centered>" << markerInfo.position << + "<td class=centered>" << markerInfo.kmerId << + "<td class=centered style='font-family:monospace'>"; + kmer.write(html, k); + html << + "<td class=centered>" << markerInfo.disjointSetId << + "<td class=centered>" << disjointSetsMap[markerInfo.disjointSetId].size(); + } + } + + html << "</table>"; +} + + + +// Compute alignments and use them to create the disjoint set data structure, +// from which the marker graph will be created. +// maxDrift is the maximum tolerated length drift of each read. +// Used to compute the band for banded alignments. +void LocalAssembly::alignAndDisjointSets( + uint64_t matchScore, + uint64_t mismatchScore, + uint64_t gapScore, + uint64_t maxSkipBases, + double maxDrift, + uint64_t minHalfBand, + double minScoreRatio + ) +{ + + // SeqAn types we need. + using TSequence = seqan::String<KmerId>; + using TStringSet = seqan::StringSet<TSequence>; + using TDepStringSet = seqan::StringSet< TSequence, seqan::Dependent<> >; + using TAlignGraph = seqan::Graph< seqan::Alignment<TDepStringSet> >; + + const bool detailedDebugOutput = false; + ofstream dot; + ofstream csv; + if(detailedDebugOutput) { + dot.open("LocalAssembly-AlignmentDetails.dot"); + dot << "graph PathFiler3lignments {\n"; + csv.open("LocalAssembly-AlignmentDetails.csv"); + csv << "OrientedReadId0,Ordinal0,OrientedReadId1,Ordinal1\n"; + } + + // Assign ids to markers. + uint64_t markerId = 0; + for(OrientedReadInfo& info: orientedReadInfos) { + for(MarkerInfo& markerInfo: info.markerInfos) { + markerInfo.id = markerId++; + } + } + + // Initialize the disjoint sets data structure. 
+ const uint64_t markerCount = markerId; + vector<uint64_t> rank(markerCount); + vector<uint64_t> parent(markerCount); + boost::disjoint_sets<uint64_t*, uint64_t*> disjointSets(&rank[0], &parent[0]); + for(uint64_t markerId=0; markerId<markerCount; markerId++) { + disjointSets.make_set(markerId); + } + + // Construct a Seqan sequence containing the KmerIds for each oriented read. + // Add 100 to each KmerId because Seqan uses 45 to represent a gap. + vector<TSequence> seqanSequences(orientedReadInfos.size()); + for(uint64_t i=0; i<orientedReadInfos.size(); i++) { + const OrientedReadInfo& info = orientedReadInfos[i]; + TSequence& seqanSequence = seqanSequences[i]; + for(const MarkerInfo& markerInfo: info.markerInfos) { + seqan::appendValue(seqanSequence, markerInfo.kmerId + 100); + } + } + + + + // Loop over pairs of reads. + for(uint64_t i0=0; i0<orientedReadInfos.size()-1; i0++) { + const OrientedReadInfo& info0 = orientedReadInfos[i0]; + const uint64_t length0 = info0.markerInfos.size(); + const TSequence& seqanSequence0 = seqanSequences[i0]; + for(uint64_t i1=i0+1; i1<orientedReadInfos.size(); i1++) { + const OrientedReadInfo& info1 = orientedReadInfos[i1]; + // cout << "*** " << info0.orientedReadId << " " << info1.orientedReadId << endl; + const uint64_t length1 = info1.markerInfos.size(); + const TSequence& seqanSequence1 = seqanSequences[i1]; + + // Figure the constraints for this alignment. + const bool constrainedA = info0.isOnA() and info1.isOnA(); + const bool constrainedB = info0.isOnB() and info1.isOnB(); + + // If constrained on A, merge the first markers of the two reads, + // as the alignment does not guarantee that. + // If constrained on B, merge the last markers of the two reads, + // as the alignment does not guarantee that. 
+ if(constrainedA) { + disjointSets.union_set(info0.markerInfos.front().id, info1.markerInfos.front().id); + } + if(constrainedB) { + disjointSets.union_set(info0.markerInfos.back().id, info1.markerInfos.back().id); + } + + // Only do alignments that are constrained on at least one side. + if(not (constrainedA or constrainedB)) { + continue; + } + + // Align the KmerIds of these oriented reads. + // For now we do a full blown alignment, but later + // we should use banded alignments instead. + // Store them in a SeqAn string set. + TStringSet sequences; + appendValue(sequences, seqanSequence0); + appendValue(sequences, seqanSequence1); + + +#if 0 + // Old code that used general alignment, not banded alignments. + // Compute the alignment. + using namespace seqan; + TAlignGraph graph(sequences); + if(constrainedA and constrainedB) { + globalAlignment( + graph, + Score<int, Simple>(int(matchScore), int(mismatchScore), int(gapScore)), + AlignConfig<false, false, false, false>(), + LinearGaps()); + } else if(constrainedA and not constrainedB) { + globalAlignment( + graph, + Score<int, Simple>(int(matchScore), int(mismatchScore), int(gapScore)), + AlignConfig<false, false, true, true>(), + LinearGaps()); + } else if(constrainedB and not constrainedA) { + globalAlignment( + graph, + Score<int, Simple>(int(matchScore), int(mismatchScore), int(gapScore)), + AlignConfig<true, true, false, false>(), + LinearGaps()); + } else { + SHASTA_ASSERT(0); + } +#endif + + + // Banded alignment, allowing for the specified maxDrift. + // This is necessary to prevent large cycles in the graph. + // It is also good for performance.
+ using namespace seqan; + TAlignGraph graph(sequences); + int score = 0; + if(constrainedA and constrainedB) { + const int64_t diagonalA = 0; + const int64_t diagonalB = int64_t(length0) - int64_t(length1); + const int64_t totalDrift = int64_t(maxDrift * 0.5 * double(min(length0, length1))); + const int64_t halfBand = totalDrift + int64_t(minHalfBand); + const int64_t minBand = min(diagonalA, diagonalB) - halfBand; + const int64_t maxBand = max(diagonalA, diagonalB) + halfBand; + score = globalAlignment( + graph, + Score<int, Simple>(int(matchScore), int(mismatchScore), int(gapScore)), + AlignConfig<false, false, false, false>(), + int(minBand), int(maxBand), + LinearGaps()); + } else if(constrainedA and not constrainedB) { + const int64_t diagonalA = 0; + const int64_t totalDrift = int64_t(maxDrift * double(min(length0, length1))); + const int64_t halfBand = totalDrift + int64_t(minHalfBand); + const int64_t minBand = diagonalA - halfBand; + const int64_t maxBand = diagonalA + halfBand; + score = globalAlignment( + graph, + Score<int, Simple>(int(matchScore), int(mismatchScore), int(gapScore)), + AlignConfig<false, false, true, true>(), + int(minBand), int(maxBand), + LinearGaps()); + } else if(constrainedB and not constrainedA) { + const int64_t diagonalB = int64_t(length0) - int64_t(length1); + const int64_t totalDrift = int64_t(maxDrift * double(min(length0, length1))); + const int64_t halfBand = totalDrift + int64_t(minHalfBand); + const int64_t minBand = diagonalB - halfBand; + const int64_t maxBand = diagonalB + halfBand; + score = globalAlignment( + graph, + Score<int, Simple>(int(matchScore), int(mismatchScore), int(gapScore)), + AlignConfig<true, true, false, false>(), + int(minBand), int(maxBand), + LinearGaps()); + } else { + SHASTA_ASSERT(0); + } + + // If SeqAn was not able to compute the banded alignment, ignore it.
+ if(score == MinValue<int>::VALUE) { + if(html and options.showDebugInformation) { + html << "<br>Alignment between " << info0.orientedReadId << + " and " << info1.orientedReadId << + " ignored."; + } + continue; + } + + // Check that the score is sufficiently good. + const uint64_t bestPossibleScore = matchScore * min(length0, length1); + const double scoreRatio = double(score) / double(bestPossibleScore); + if(scoreRatio < minScoreRatio) { + if(html and options.showDebugInformation) { + html << "<br>Alignment between " << info0.orientedReadId << + " and " << info1.orientedReadId << ": lengths " << length0 << " " << length1 << + ", score " << score << "/" << bestPossibleScore << " " << + double(score) / double(bestPossibleScore) << + " discarded due to low score."; + } + continue; + } + + + + // Extract the alignment from the graph. + // This creates a single sequence consisting of the two rows + // of the alignment, concatenated. + TSequence align; + convertAlignment(graph, align); + const uint64_t totalAlignmentLength = seqan::length(align); + SHASTA_ASSERT((totalAlignmentLength % 2) == 0); // Because we are aligning two sequences. + const uint64_t alignmentLength = totalAlignmentLength / 2; + const uint64_t seqanGapValue = 45; + +#if 0 + // This is not needed when doing banded alignments. + // If the alignment has large base skips, don't use it. + bool hasLargeSkip = false; + uint64_t j0 = 0; + uint64_t j1 = 0; + uint64_t previousPosition0 = invalid<uint64_t>; + uint64_t previousPosition1 = invalid<uint64_t>; + for(uint64_t positionInAlignment=0; positionInAlignment<alignmentLength; positionInAlignment++) { + const KmerId kmerId0 = align[positionInAlignment]; + const KmerId kmerId1 = align[positionInAlignment + alignmentLength]; + + if(kmerId0 == seqanGapValue) { + if(kmerId1 == seqanGapValue) { + // Both 0 and 1 are gaps. + SHASTA_ASSERT(0); + } else { + // 0 is gap, 1 is not gap. 
+ ++j1; + } + } else { + if(kmerId1 == seqanGapValue) { + // 0 is not gap, 1 is gap. + ++j0; + } else { + // Neither 0 nor 1 is a gap. + if(kmerId0 == kmerId1) { + // Check for large base skips. + const uint64_t position0 = info0.markerInfos[j0].position; + const uint64_t position1 = info1.markerInfos[j1].position; + // cout << "***A " << position0 << " " << position1 << endl; + if(previousPosition0 != invalid<uint64_t>) { + const int64_t offset = int64_t(position0) - int64_t(position1); + const int64_t previousOffset = int64_t(previousPosition0) - int64_t(previousPosition1); + if(abs(offset - previousOffset) > int64_t(maxSkipBases)) { + hasLargeSkip = true; + // cout << "Skip" << endl; + break; + } + } + previousPosition0 = position0; + previousPosition1 = position1; + } + ++j0; + ++j1; + } + + } + } + if(hasLargeSkip) { + if(html and options.showDebugInformation) { + html << "<br>Alignment between " << info0.orientedReadId << + " and " << info1.orientedReadId << + " suppressed."; + } + continue; + } +#endif + + + // Use the alignment to update the disjoint sets data structure. + uint64_t j0 = 0; + uint64_t j1 = 0; + for(uint64_t positionInAlignment=0; positionInAlignment<alignmentLength; positionInAlignment++) { + const KmerId kmerId0 = align[positionInAlignment]; + const KmerId kmerId1 = align[positionInAlignment + alignmentLength]; + + if(kmerId0 == seqanGapValue) { + if(kmerId1 == seqanGapValue) { + // Both 0 and 1 are gaps. + SHASTA_ASSERT(0); + } else { + // 0 is gap, 1 is not gap. + ++j1; + } + } else { + if(kmerId1 == seqanGapValue) { + // 0 is not gap, 1 is gap. + ++j0; + } else { + // Neither 0 nor 1 is a gap. + if(kmerId0 == kmerId1) { + // If a match, merge the disjoint sets containing these two markers. 
+ disjointSets.union_set(info0.markerInfos[j0].id, info1.markerInfos[j1].id); + if(detailedDebugOutput) { + dot << "\"" << info0.orientedReadId << "-"; + dot << info0.markerInfos[j0].ordinal << "\"--\""; + dot << info1.orientedReadId << "-"; + dot << info1.markerInfos[j1].ordinal << "\";\n"; + csv << + info0.orientedReadId << "," << + info0.markerInfos[j0].ordinal << "," << + info1.orientedReadId << "," << + info1.markerInfos[j1].ordinal << "\n"; + } + } + ++j0; + ++j1; + } + + } + } + SHASTA_ASSERT(j0 == length0); + SHASTA_ASSERT(j1 == length1); + } + } + + // Store in each MarkerInfo the id of the disjoint set it belongs to. + for(uint64_t i=0; i<orientedReadInfos.size(); i++) { + OrientedReadInfo& info = orientedReadInfos[i]; + for(MarkerInfo& markerInfo: info.markerInfos) { + markerInfo.disjointSetId = disjointSets.find_set(markerInfo.id); + } + } + + // Fill in the disjoint sets map. + disjointSetsMap.clear(); + for(uint64_t i=0; i<orientedReadInfos.size(); i++) { + const OrientedReadInfo& info = orientedReadInfos[i]; + for(uint64_t j=0; j<info.markerInfos.size(); j++) { + const MarkerInfo& markerInfo = info.markerInfos[j]; + disjointSetsMap[markerInfo.disjointSetId].push_back({i, j}); + } + } + + // Histogram disjoint sets sizes. + disjointSetsSizeHistogram.clear(); + for(const auto& p: disjointSetsMap) { + const uint64_t disjointSetSize = p.second.size(); + if(disjointSetSize >= disjointSetsSizeHistogram.size()) { + disjointSetsSizeHistogram.resize(disjointSetSize + 1, 0); + } + ++disjointSetsSizeHistogram[disjointSetSize]; + } + + + // Write the histogram of disjoint sets sizes. 
+ if(html and options.showDebugInformation) { + + html << + "<h2>Disjoint sets size histogram</h2>" + "<table>" + "<tr>" + "<th>Size" + "<th>Frequency" + "<th>Markers"; + + for(uint64_t disjointSetSize=0; disjointSetSize<disjointSetsSizeHistogram.size(); disjointSetSize++) { + const uint64_t frequency = disjointSetsSizeHistogram[disjointSetSize]; + if(frequency) { + html << + "<tr>" + "<td class=centered>" << disjointSetSize << + "<td class=centered>" << frequency << + "<td class=centered>" << frequency * disjointSetSize; + } + } + + html << "</table>"; + } + + if(detailedDebugOutput) { + dot << "}\n"; + } +} + + + +// Create vertices. Each disjoint set with at least minVertexCoverage markers +// generates a vertex. +uint64_t LocalAssembly::createVertices( + uint64_t minVertexCoverage, + double vertexSamplingRate) // Only used if minVertexCoverage is 0 +{ + LocalAssembly& graph = *this; + + // Remove all vertices and edges, just in case. + LocalAssemblyBaseClass::clear(); + vertexMap.clear(); + + // Find the disjoint sets corresponding to vertexIdA and vertexIdB. + // Those will always generate a vertex regardless of coverage. + disjointSetIdA = invalid<uint64_t>; + disjointSetIdB = invalid<uint64_t>; + for(const OrientedReadInfo& info: orientedReadInfos) { + if(info.isOnA()) { + const MarkerInfo& markerInfoA = info.markerInfos.front(); + if(disjointSetIdA == invalid<uint64_t>) { + disjointSetIdA = markerInfoA.disjointSetId; + } else { + SHASTA_ASSERT(disjointSetIdA == markerInfoA.disjointSetId); + } + } + if(info.isOnB()) { + const MarkerInfo& markerInfoB = info.markerInfos.back(); + if(disjointSetIdB == invalid<uint64_t>) { + disjointSetIdB = markerInfoB.disjointSetId; + } else { + SHASTA_ASSERT(disjointSetIdB == markerInfoB.disjointSetId); + } + } + } + + if(html) { + html << "<br>Start vertex " << disjointSetIdA << ", end vertex " << disjointSetIdB; + } + + + + // If minVertexCoverage is 0, select a value automatically. 
+ // Select a value that gives a number of vertices approximately correct given + // the estimated offset. + if(minVertexCoverage == 0) { + + // Estimate the desired number of vertices given the estimated offset. + const uint64_t totalBaseCount = assembler.assemblerInfo->baseCount * 2; // Both strands. + const uint64_t totalMarkerCount = assembler.markers.totalSize(); + const double markerDensity = double(totalMarkerCount) / double(totalBaseCount); + const uint64_t desiredVertexCount = uint64_t( + vertexSamplingRate * markerDensity * double(estimatedABOffset)); + + // Use the disjointSetsSizeHistogram to choose a value of minVertexCoverage + // that will give us approximately this number of vertices. + // Never reduce minVertexCoverage below 2. + uint64_t cumulativeDisjointSetsCount = 0; + for(minVertexCoverage = disjointSetsSizeHistogram.size()-1; minVertexCoverage>2; --minVertexCoverage) { + cumulativeDisjointSetsCount += disjointSetsSizeHistogram[minVertexCoverage]; +#if 0 + if(html and options.showDebugInformation) { + html << "<br>minVertexCoverage " << minVertexCoverage << + " would generate " << cumulativeDisjointSetsCount << + " vertices and we want " << desiredVertexCount; + } +#endif + if(cumulativeDisjointSetsCount >= desiredVertexCount) { + break; + } + } + + if(html and options.showDebugInformation) { + html << "<br>Set minVertexCoverage to " << minVertexCoverage << + " based on marker density " << markerDensity << + ", vertex sampling rate " << vertexSamplingRate << + ", desired number of vertices " << desiredVertexCount; + } + } + + + + // Loop over disjoint sets that are large enough. + // Also always include disjointSetIdA and disjointSetIdB. 
+ for(const auto& p: disjointSetsMap) { + const uint64_t disjointSetId = p.first; + const auto& disjointSet = p.second; + if(disjointSet.size() >= minVertexCoverage or + disjointSetId==disjointSetIdA or + disjointSetId==disjointSetIdB) { + + const vertex_descriptor v = add_vertex({disjointSetId}, graph); + vertexMap.insert(make_pair(disjointSetId, v)); + } + } + + if(html and options.showDebugInformation) { + html << "<br>The assembly graph has " << num_vertices(graph) << " vertices."; + } + + return minVertexCoverage; +} + + + +// Create edges by following the reads. +void LocalAssembly::createEdges() +{ + LocalAssembly& graph = *this; + + removeAllEdges(); + + // Loop over all reads. + for(uint64_t i=0; i<orientedReadInfos.size(); i++) { + const OrientedReadInfo& info = orientedReadInfos[i]; + + // Follow this read, finding the vertices it reaches. + vertex_descriptor v0 = null_vertex(); + LocalAssemblyMarkerIndexes indexes0; + for(uint64_t j=0; j<info.markerInfos.size(); j++) { + const MarkerInfo& markerInfo = info.markerInfos[j]; + const uint64_t disjointSetId = markerInfo.disjointSetId; + const auto it = vertexMap.find(disjointSetId); + + if(it != vertexMap.end()) { + const vertex_descriptor v1 = it->second; + const LocalAssemblyMarkerIndexes indexes1 = {i, j}; + if(v0 != null_vertex()) { + + // Get the edge v0->v1, creating it if necessary. + edge_descriptor e; + bool edgeExists = false; + tie(e, edgeExists) = edge(v0, v1, graph); + if(not edgeExists) { + bool edgeWasAdded = false; + tie(e, edgeWasAdded) = add_edge(v0, v1, graph); + SHASTA_ASSERT(edgeWasAdded); + } + LocalAssemblyEdge& edge = graph[e]; + + edge.markerIntervals.push_back({indexes0, indexes1}); + } + + // v1 becomes the previous vertex. 
+ v0 = v1; + indexes0 = indexes1; + + } + } + } + if(html and options.showDebugInformation) { + html << "<br>The assembly graph has " << num_edges(graph) << " edges."; + } +} + + + +void LocalAssembly::removeAllEdges() +{ + LocalAssembly& graph = *this; + BGL_FORALL_VERTICES(v, graph, LocalAssembly) { + clear_vertex(v, graph); + } +} + + + +void LocalAssembly::writeGraphviz(const string& fileName) const +{ + ofstream file(fileName); + writeGraphviz(file); +} + + + +void LocalAssembly::writeGraphviz(ostream& s) const +{ + const LocalAssembly& graph = *this; + + // S and V for edges HSV. + const double S = 0.7; + const double V = 1.; + + // Gather assembly path edges. + vector<edge_descriptor> sortedAssemblyPathEdges = assemblyPath; + sort(sortedAssemblyPathEdges.begin(), sortedAssemblyPathEdges.end()); + + s << + "digraph LocalAssembly {\n" + "mclimit=0.01;\n" // For layout speed + "edge [penwidth=6];\n" + "node [fontname=\"Courier New\"];\n" + "edge [fontname=\"Courier New\"];\n"; + + if(options.showVertices) { + if(options.showVertexLabels) { + s << "node [shape=rectangle style=filled color=black fillcolor=gray80];\n"; + } else { + s << "node [shape=point width=0.2];\n"; + } + } else { + s << "node [shape=point style=invis];\n"; + } + + // Vertices. + BGL_FORALL_VERTICES(v, graph, LocalAssembly) { + const uint64_t disjointSetId = graph[v].disjointSetId; + auto it = disjointSetsMap.find(disjointSetId); + SHASTA_ASSERT(it != disjointSetsMap.end()); + const uint64_t coverage = it->second.size(); + + const bool isA = (graph[v].disjointSetId == disjointSetIdA); + const bool isB = (graph[v].disjointSetId == disjointSetIdB); + + s << disjointSetId << "["; + + // Label. + s << "label=\""; + if(isA) { + s << "A\\n"; + } + if(isB) { + s << "B\\n"; + } + s << graph[v].disjointSetId << "\\n" << coverage; + s << "\" "; + + // Special drawing of the begin/end vertices. 
+ if(isA or isB) { + s << "shape=rectangle style=filled color=black fillcolor=cyan"; + } + + s << "];\n"; + } + + // Edges. + BGL_FORALL_EDGES(e, graph, LocalAssembly) { + const LocalAssemblyEdge& edge = graph[e]; + const vertex_descriptor v0 = source(e, graph); + const vertex_descriptor v1 = target(e, graph); + const uint64_t coverage = edge.coverage(); + + // Compute the hue based on coverage. + double H; + if(coverage >= orientedReadInfos.size()) { + H = 1./3.; + } else { + H = (double(coverage - 1) / (3. * double(orientedReadInfos.size() - 1))); + } + const string colorString = "\"" + to_string(H) + " " + to_string(S) + " " + to_string(V) + "\""; + + s << + graph[v0].disjointSetId << "->" << + graph[v1].disjointSetId << " ["; + + if(options.showEdgeLabels) { + s << "label=\"" << coverage << "\""; + } + s << " color=" << colorString; + + // Tooltip. + s << " tooltip=\""; + s << "Coverage " << coverage << "\\n"; + s << "\""; + + // If we have an assembly path and this edge is not on the assembly path, + // draw it dashed. + if(not assemblyPath.empty()) { + if(not std::binary_search(sortedAssemblyPathEdges.begin(), sortedAssemblyPathEdges.end(), e)) { + s << " style=dashed"; + } + } + + s << "];\n"; + } + + s << "}\n"; +} + + + +void LocalAssembly::writeGraph(const string& title) +{ + LocalAssembly& graph = *this; + + if(html and options.showGraph) { + html << "<h2>" << title << "</h2>"; + html << "<p>The assembly graph has " << num_vertices(graph) << + " vertices and " << num_edges(graph) << " edges."; + writeGraph(); + } +} + + + +void LocalAssembly::writeGraph() const +{ + // Write out the graph in graphviz format. + const string uuid = to_string(boost::uuids::random_generator()()); + const string dotFileName = tmpDirectory() + uuid + ".dot"; + { + ofstream dotFile(dotFileName); + writeGraphviz(dotFile); + } + + // Compute layout in svg format. 
+ const string command = "dot -O -T svg " + dotFileName; + bool timeoutTriggered = false; + bool signalOccurred = false; + int returnCode = 0; + const double timeout = 600; + runCommandWithTimeout(command, timeout, timeoutTriggered, signalOccurred, returnCode); + if(returnCode!=0 or signalOccurred) { + throw runtime_error("An error occurred while running the following command: " + command); + } + if(timeoutTriggered) { + std::filesystem::remove(dotFileName); + throw runtime_error("Timeout during graph layout computation."); + } + + // Remove the .dot file. + std::filesystem::remove(dotFileName); + + // Copy the svg file to html. + const string svgFileName = dotFileName + ".svg"; + ifstream svgFile(svgFileName); + html << "<p>" << svgFile.rdbuf(); + svgFile.close(); + + // Remove the .svg file. + std::filesystem::remove(svgFileName); +} + + + +uint64_t LocalAssembly::removeStrongComponents() +{ + LocalAssembly& graph = *this; + uint64_t removedCount = 0; + + // Map the vertices to integers. + uint64_t vertexIndex = 0; + std::map<vertex_descriptor, uint64_t> vertexMap; + BGL_FORALL_VERTICES(v, graph, LocalAssembly) { + vertexMap.insert({v, vertexIndex++}); + } + + // Compute strong components. + std::map<vertex_descriptor, uint64_t> componentMap; + boost::strong_components( + graph, + boost::make_assoc_property_map(componentMap), + boost::vertex_index_map(boost::make_assoc_property_map(vertexMap))); + + // Gather the vertices in each strong component. + std::map<uint64_t, vector<vertex_descriptor> > componentVertices; + for(const auto& p: componentMap) { + componentVertices[p.second].push_back(p.first); + } + + + + // Keep the non-trivial ones. + // A non-trivial strong component has at least one internal edge. + // This means that it either has more than one vertex, + // or it consists of a single vertex with a self-edge. + for(const auto& p: componentVertices) { + + // Figure out if it is non-trivial. 
+ bool isNonTrivial; + if(p.second.size() > 1) { + + // More than one vertex. Certainly non-trivial. + isNonTrivial = true; + } else if (p.second.size() == 1) { + + // Only one vertex. Non-trivial if self-edge present. + const vertex_descriptor v = p.second.front(); + bool selfEdgeExists = false; + tie(ignore, selfEdgeExists) = edge(v, v, graph); + isNonTrivial = selfEdgeExists; + } else { + + // Empty. This should never happen. + SHASTA_ASSERT(0); + } + + // If non-trivial, remove all of its vertices. + // But don't remove vertexIdA or vertexIdB. + if(isNonTrivial) { + for(const vertex_descriptor v: p.second) { + const LocalAssemblyVertex& vertex = graph[v]; + if(vertex.disjointSetId == disjointSetIdA or vertex.disjointSetId == disjointSetIdB) { + continue; + } + removeVertex(v); + ++removedCount; + } + } + } + + if(html and options.showDebugInformation) { + html << + "<br>Removed " << removedCount << + " vertices in non-trivial strongly connected components." + "<br>The graph has now " << num_vertices(graph) << + " vertices."; + + } + + return removedCount; +} + + + +void LocalAssembly::removeVertex(vertex_descriptor v) +{ + LocalAssembly& graph = *this; + + vertexMap.erase(graph[v].disjointSetId); + + clear_vertex(v, graph); + remove_vertex(v, graph); + +} + + + +void LocalAssembly::findAssemblyPath() +{ + const LocalAssembly& graph = *this; + assemblyPath.clear(); + + + // Find the first and last vertex of the path we are looking for. + vertex_descriptor vA = null_vertex(); + vertex_descriptor vB = null_vertex(); + BGL_FORALL_VERTICES(v, graph, LocalAssembly) { + const LocalAssemblyVertex& vertex = graph[v]; + if(vertex.disjointSetId == disjointSetIdA) { + SHASTA_ASSERT(vA == null_vertex()); + vA = v; + } + if(vertex.disjointSetId == disjointSetIdB) { + SHASTA_ASSERT(vB == null_vertex()); + vB = v; + } + } + SHASTA_ASSERT(vA != null_vertex()); + SHASTA_ASSERT(vB != null_vertex()); + + + // Main iteration loop. 
+ vertex_descriptor v = vA; + while(v != vB) { + + // Find the edge with the most coverage. + edge_descriptor eNext; + uint64_t bestCoverage = 0; + BGL_FORALL_OUTEDGES(v, e, graph, LocalAssembly) { + // Ignore a self-edge A->A. + // This can exist because we did not allow vertex A (and B) + // to be removed when removing strong components. + if(v == vA and target(e, graph) == vA) { + continue; + } + const uint64_t coverage = graph[e].coverage(); + if(coverage > bestCoverage) { + eNext = e; + bestCoverage = coverage; + } + } + if(bestCoverage == 0) { + cout << "LocalAssembly: at " << graph[v].disjointSetId << + ": no out-edge found when filling path from " << + edgeIdA << " to " << edgeIdB << endl; + } + SHASTA_ASSERT(bestCoverage > 0); + + // Store this edge. + assemblyPath.push_back(eNext); + v = target(eNext, graph); + } + + if(html and options.showDebugInformation) { + html << "<br>The assembly path has " << assemblyPath.size() << " edges."; + } +} + + + + +void LocalAssembly::assembleAssemblyPathEdges( + uint64_t maxMsaLength, + LongMsaPolicy longMsaPolicy) +{ + const LocalAssembly& graph = *this; + + for(const edge_descriptor e: assemblyPath) { + assembleEdge(maxMsaLength, longMsaPolicy, e); + } + + + + // Write a table containing a summary of edge sequences with coverage, + // and their position in assembled sequence. 
+ if(html and options.showAssemblyDetails) { + html << + "<br><table>" + "<tr>" + "<th>Source" + "<th>Target" + "<th>Begin" + "<th>End" + "<th>length" + "<th>Sequence" + ; + + uint64_t position = 0; + for(const edge_descriptor e: assemblyPath) { + const LocalAssemblyEdge& edge = graph[e]; + const vector<Base>& sequence = edge.consensusSequence; + const vector<uint64_t>& coverage = edge.consensusCoverage; + SHASTA_ASSERT(sequence.size() == coverage.size()); + + html << + "<tr>" + "<td class=centered>" << graph[source(e, graph)].disjointSetId << + "<td class=centered>" << graph[target(e, graph)].disjointSetId << + "<td class=centered>" << position << + "<td class=centered>" << position + sequence.size() << + "<td class=centered>" << sequence.size() << + "<td class=centered style='font-family:monospace'>"; + copy(sequence.begin(), sequence.end(), ostream_iterator<Base>(html)); + html << "<br>"; + for(const uint64_t c: coverage) { + writeCoverageCharacterToHtml(c); + } + + position += sequence.size(); + } + + html << "</table>"; + } +} + + + +void LocalAssembly::assembleEdge( + uint64_t maxMsaLength, + LongMsaPolicy longMsaPolicy, + edge_descriptor e) +{ + LocalAssembly& graph = *this; + LocalAssemblyEdge& edge = graph[e]; + + if(html and options.showAssemblyDetails) { + html << "<h2>Assembly details for edge " << + graph[source(e, graph)].disjointSetId << "->" << + graph[target(e, graph)].disjointSetId << "</h2>" + "<table>" + "<tr><th>Oriented<br>read<th>Sequence<br>length<th>Sequence"; + } + + const uint64_t k = assembler.assemblerInfo->k; + SHASTA_ASSERT((k % 2) == 0); + const uint64_t kHalf = k / 2; + + // Gather the sequences of the contributing oriented reads. + // Each sequence is stored with the number of distinct oriented reads that + // have that sequence. + vector< pair<vector<Base>, uint64_t> > orientedReadSequences; + + // Loop over marker intervals of this edge. 
+ vector<Base> orientedReadSequence; + for(const auto& p: edge.markerIntervals) { + + // Locate the two markers of this marker interval. + const LocalAssemblyMarkerIndexes indexes0 = p.first; + const LocalAssemblyMarkerIndexes indexes1 = p.second; + const uint64_t i0 = indexes0.i; + const uint64_t i1 = indexes1.i; + const uint64_t j0 = indexes0.j; + const uint64_t j1 = indexes1.j; + + // They must belong to the same oriented read. + SHASTA_ASSERT(i0 == i1); + const uint64_t i = i0; + const OrientedReadInfo& info = orientedReadInfos[i]; + const OrientedReadId orientedReadId = info.orientedReadId; + + const MarkerInfo& markerInfo0 = info.markerInfos[j0]; + const MarkerInfo& markerInfo1 = info.markerInfos[j1]; + + // Now we can get the contributing sequence. + const uint64_t position0 = markerInfo0.position + kHalf; + const uint64_t position1 = markerInfo1.position + kHalf; + + // Now we can get the sequence contributed by this oriented read. + orientedReadSequence.clear(); + for(uint64_t position=position0; position!=position1; position++) { + const Base base = assembler.getReads().getOrientedReadBase(orientedReadId, uint32_t(position)); + orientedReadSequence.push_back(base); + } + + if(html and options.showAssemblyDetails) { + html << + "<tr><td class=centered>" << orientedReadId << + "<td class=centered>" << orientedReadSequence.size() << + "<td class=centered style='font-family:monospace'>"; + copy(orientedReadSequence.begin(), orientedReadSequence.end(), + ostream_iterator<Base>(html)); + } + + // Store it. + bool found = false; + for(auto& p: orientedReadSequences) { + if(p.first == orientedReadSequence) { + ++p.second; + found = true; + break; + } + } + if(not found) { + orientedReadSequences.push_back(make_pair(orientedReadSequence, 1)); + } + + } + + // Sort the sequences by decreasing number of supporting reads. 
+ sort(orientedReadSequences.begin(), orientedReadSequences.end(), + OrderPairsBySecondOnlyGreater<vector<Base>, uint64_t>()); + + if(html and options.showAssemblyDetails) { + html << "</table>"; + + html << "<p><table>" + "<tr><th>Coverage<th>Sequence<br>length<th>Sequence"; + for(const auto& p: orientedReadSequences) { + const vector<Base>& sequence = p.first; + const uint64_t coverage = p.second; + html << + "<tr>" + "<td class=centered>" << coverage << + "<td class=centered>" << sequence.size() << + "<td class=centered style='font-family:monospace'>"; + copy(sequence.begin(), sequence.end(), ostream_iterator<Base>(html)); + + } + html << "</table>"; + } + + // If there is only one distinct sequence (all reads agree), + // store that one sequence as the consensus. + // This is the most common case. + if(orientedReadSequences.size() == 1) { + const auto& p = orientedReadSequences.front(); + const vector<Base>& sequence = p.first; + const uint64_t coverage = p.second; + edge.consensusSequence = sequence; + edge.consensusCoverage.clear(); + edge.consensusCoverage.resize(sequence.size(), coverage); + return; + } + + + // If getting here, we have more than one sequence, and we must + // compute a consensus via multiple sequence alignment (MSA). + + // If any of the sequences are too long, react according to longMsaPolicy. + // This can be problematic. + if(orientedReadSequences.size() > 1) { + + // Find the length of the longest sequence. 
+ uint64_t maxLength = 0; + for(const auto& p: orientedReadSequences) { + const vector<Base>& sequence = p.first; + maxLength = max(sequence.size(), maxMsaLength); + } + + if(maxLength > maxMsaLength) { + if(html and options.showDebugInformation) { + html << "<br>MSA length " << maxLength << " at " << + graph[source(e, graph)].disjointSetId << "->" << + graph[target(e, graph)].disjointSetId; + } + if(longMsaPolicy == LongMsaPolicy::throwException) { + throw runtime_error("Long MSA."); + } else { + orientedReadSequences.resize(1); + if(html and options.showDebugInformation) { + html << "<br>Assembling this edge at coverage " << orientedReadSequences.front().second; + } + } + } + } + + // Compute the MSA. + vector< vector<AlignedBase> > alignment; + globalMsaSpoa(orientedReadSequences, alignment); + SHASTA_ASSERT(alignment.size() == orientedReadSequences.size()); + + // Compute coverage at each alignment position for each of the 5 AlignedBases. + const uint64_t alignmentLength = alignment.front().size(); + vector< array<uint64_t, 5> > coverage(alignmentLength, {0, 0, 0, 0, 0}); + for(uint64_t i=0; i<orientedReadSequences.size(); i++) { + const vector<AlignedBase>& alignmentRow = alignment[i]; + SHASTA_ASSERT(alignmentRow.size() == alignmentLength); + for(uint64_t position=0; position<alignmentLength; position++) { + const AlignedBase b = alignmentRow[position]; + coverage[position][b.value] += orientedReadSequences[i].second; + } + } + + // Compute coverage-based consensus at each alignment position. + vector<AlignedBase> alignedConsensus; + vector<uint64_t> alignmentConsensusCoverage; + for(const auto& c: coverage) { + const uint64_t iBase = std::max_element(c.begin(), c.end()) - c.begin(); + alignedConsensus.push_back(AlignedBase::fromInteger(iBase)); + alignmentConsensusCoverage.push_back(c[iBase]); + } + SHASTA_ASSERT(alignedConsensus.size() == alignmentLength); + + // Store in the edge the consensus and its coverage, excluding the gaps. 
+ edge.consensusSequence.clear(); + edge.consensusCoverage.clear(); + for(uint64_t position=0; position<alignedConsensus.size(); position++) { + const AlignedBase b = alignedConsensus[position]; + if(not b.isGap()) { + edge.consensusSequence.push_back(Base(b)); + edge.consensusCoverage.push_back(alignmentConsensusCoverage[position]); + } + } + + if(html and options.showAssemblyDetails) { + + html << "<p><table>" + "<tr><th>Coverage<th>Sequence<br>length<th>Aligned<br>sequence"; + + // Write one row for each distinct sequence. + for(uint64_t i=0; i<orientedReadSequences.size(); i++) { + const auto& p = orientedReadSequences[i]; + const vector<Base>& sequence = p.first; + const uint64_t coverage = p.second; + const vector<AlignedBase>& alignedSequence = alignment[i]; + html << + "<tr>" + "<td class=centered>" << coverage << + "<td class=centered>" << sequence.size() << + "<td class=centered style='font-family:monospace'>"; + for(uint64_t position=0; position<alignedSequence.size(); position++) { + const AlignedBase b = alignedSequence[position]; + const bool isDiscordant = (b != alignedConsensus[position]); + if(isDiscordant) { + html << "<span style='background-color:LightCoral'>"; + } + html << alignedSequence[position]; + if(isDiscordant) { + html << "</span>"; + } + } + } + + // Write one row with aligned consensus. + html << + "<tr>" + "<td class=centered colspan=2>Consensus" + "<td class=centered style='font-family:monospace'>"; + copy(alignedConsensus.begin(), alignedConsensus.end(), + ostream_iterator<AlignedBase>(html)); + + // Write one row with aligned consensus coverage. + html << + "<tr>" + "<td class=centered colspan=2>Consensus coverage" + "<td class=centered style='font-family:monospace'>"; + for(uint64_t position=0; position<coverage.size(); position++) { + writeCoverageCharacterToHtml(alignmentConsensusCoverage[position]); + } + + // Write one row with aligned discordant coverage. 
+ html << + "<tr>" + "<td class=centered colspan=2>Discordant coverage" + "<td class=centered style='font-family:monospace'>"; + for(uint64_t position=0; position<coverage.size(); position++) { + writeCoverageCharacterToHtml(edge.coverage() - alignmentConsensusCoverage[position]); + } + + // Write one row with coverage for each of the 5 AlignedBases. + for(uint64_t b=0; b<5; b++) { + html << + "<tr><td colspan=2 class=centered>" << AlignedBase::fromInteger(b) << " coverage" + "<td class=centered style='font-family:monospace'>"; + for(uint64_t position=0; position<coverage.size(); position++) { + writeCoverageCharacterToHtml(coverage[position][b]); + } + } + html << "</table>"; + + // Write another table with the final, ungapped consensus and its coverage. + html << + "<p>Consensus length is " << edge.consensusSequence.size() << + "<br><table>" + "<tr><th>Consensus<td class=centered style='font-family:monospace'>"; + copy(edge.consensusSequence.begin(), edge.consensusSequence.end(), + ostream_iterator<Base>(html)); + html << "<tr><th>Consensus coverage<td class=centered style='font-family:monospace'>"; + for(const uint64_t coverage: edge.consensusCoverage) { + writeCoverageCharacterToHtml(coverage); + } + html << "<tr><th>Discordant coverage<td class=centered style='font-family:monospace'>"; + for(const uint64_t coverage: edge.consensusCoverage) { + writeCoverageCharacterToHtml(edge.coverage() - coverage); + } + html << "</table>"; + } + +} + + + +void LocalAssembly::writeCoverageCharacterToHtml(uint64_t coverage) const +{ + if(coverage == 0) { + html << " "; + } else if(coverage < 10) { + html << coverage; + } else if(coverage < 36) { + html << char((coverage - 10) + 'A'); + } else { + html << "*"; + } + +} + + +// Get the sequence between edgeIdA and edgeIdB. +// This does not include the sequences of edgeIdA and edgeIdB themselves. 
+void LocalAssembly::getSecondarySequence( + vector<Base>& sequence) const +{ + const LocalAssembly& graph = *this; + + sequence.clear(); + for(const edge_descriptor e: assemblyPath) { + const vector<Base>& edgeSequence = graph[e].consensusSequence; + copy(edgeSequence.begin(), edgeSequence.end(), back_inserter(sequence)); + } + +} + + + +// Get the complete sequence, including the sequences of edgeIdA and edgeIdB. +void LocalAssembly::getCompleteSequence( + vector<Base>& sequence) const +{ + const LocalAssembly& graph = *this; + + sequence.clear(); + + const auto edgeASequence = assembler.markerGraph.edgeSequence[edgeIdA]; + copy(edgeASequence.begin(), edgeASequence.end(), back_inserter(sequence)); + + for(const edge_descriptor e: assemblyPath) { + const vector<Base>& edgeSequence = graph[e].consensusSequence; + copy(edgeSequence.begin(), edgeSequence.end(), back_inserter(sequence)); + } + + const auto edgeBSequence = assembler.markerGraph.edgeSequence[edgeIdB]; + copy(edgeBSequence.begin(), edgeBSequence.end(), back_inserter(sequence)); + + +} + + + +// Remove vertices that are not accessible from vertexIdA +// or from which vertexIdB is not accessible. +// Returns the number of vertices that were removed. +uint64_t LocalAssembly::removeInaccessibleVertices() +{ + LocalAssembly& graph = *this; + + // Find the vertices corresponding to vertexIdA and vertexIdB. + vertex_descriptor vA = null_vertex(); + vertex_descriptor vB = null_vertex(); + BGL_FORALL_VERTICES(v, graph, LocalAssembly) { + const LocalAssemblyVertex& vertex = graph[v]; + if(vertex.disjointSetId == disjointSetIdA) { + SHASTA_ASSERT(vA == null_vertex()); + vA = v; + } + if(vertex.disjointSetId == disjointSetIdB) { + SHASTA_ASSERT(vB == null_vertex()); + vB = v; + } + } + SHASTA_ASSERT(vA != null_vertex()); + SHASTA_ASSERT(vB != null_vertex()); + + + + // Use a forward BFS to find the vertices that are accessible from vertexIdA, + // moving forward. Those vertices get their isAccessibleA flag set. 
+ { + std::queue<vertex_descriptor> q; + q.push(vA); + graph[vA].isAccessibleA = true; + while(not q.empty()) { + const vertex_descriptor v0 = q.front(); + q.pop(); + + BGL_FORALL_OUTEDGES(v0, e, graph, LocalAssembly) { + const vertex_descriptor v1 = target(e, graph); + auto& vertex1 = graph[v1]; + if(not vertex1.isAccessibleA) { + vertex1.isAccessibleA = true; + q.push(v1); + } + } + } + SHASTA_ASSERT(graph[vB].isAccessibleA); + } + + + + // Use a backward BFS to find the vertices that are accessible from vertexIdB, + // moving backward. Those vertices get their isAccessibleB flag set. + { + std::queue<vertex_descriptor> q; + q.push(vB); + graph[vB].isAccessibleB = true; + while(not q.empty()) { + const vertex_descriptor v0 = q.front(); + q.pop(); + + BGL_FORALL_INEDGES(v0, e, graph, LocalAssembly) { + const vertex_descriptor v1 = source(e, graph); + auto& vertex1 = graph[v1]; + if(not vertex1.isAccessibleB) { + vertex1.isAccessibleB = true; + q.push(v1); + } + } + } + SHASTA_ASSERT(graph[vA].isAccessibleB); + } + + + // Gather the vertices to be removed. + vector<vertex_descriptor> verticesToBeRemoved; + BGL_FORALL_VERTICES(v, graph, LocalAssembly) { + const auto& vertex = graph[v]; + if(not (vertex.isAccessibleA and vertex.isAccessibleB)) { + verticesToBeRemoved.push_back(v); + } + } + + // Remove them. + for(const vertex_descriptor v: verticesToBeRemoved) { + removeVertex(v); + } + + return verticesToBeRemoved.size(); +} + + + +// Remove all vertices and edges and clear the vertexMap and assemblyPath. +// All other data are left alone. 
+void LocalAssembly::clear() +{ + LocalAssemblyBaseClass::clear(); + vertexMap.clear(); + assemblyPath.clear(); +} + + + +void LocalAssembly::writeOrientedReadsSequences() const +{ + if(not html) { + return; + } + if(not options.showOrientedReads) { + return; + } + + const uint64_t k = assembler.assemblerInfo->k; + SHASTA_ASSERT((k % 2) == 0); + const uint64_t kHalf = k / 2; + + ofstream fasta("LocalAssembly-OrientedReadSequences.fasta"); + + for(const OrientedReadInfo& info: orientedReadInfos) { + + SHASTA_ASSERT(not info.markerInfos.empty()); + const uint64_t position0 = uint64_t(info.markerInfos.front().position) + kHalf; + const uint64_t position1 = uint64_t(info.markerInfos.back().position) + kHalf; + + fasta << + ">" << info.orientedReadId << " " << + position0 << ":" << position1 << + " length " << position1-position0 << "\n"; + for(uint64_t position=position0; position!=position1; position++) { + const Base base = assembler.getReads().getOrientedReadBase(info.orientedReadId, uint32_t(position)); + fasta << base; + } + fasta << "\n"; + } +} diff --git a/src/mode3-LocalAssembly.hpp b/src/mode3-LocalAssembly.hpp new file mode 100644 index 0000000..b52d207 --- /dev/null +++ b/src/mode3-LocalAssembly.hpp @@ -0,0 +1,345 @@ +#pragma once + +// LocalAssembly assembles the sequence between two primary marker graph edges. +// It uses a local marker graph. + +// Shasta. +#include "AssemblerOptions.hpp" +#include "Base.hpp" +#include "invalid.hpp" +#include "ReadId.hpp" +#include "shastaTypes.hpp" + +// Boost libraries. +#include <boost/graph/adjacency_list.hpp> + +// Standard library. 
+#include "utility.hpp" +#include "vector.hpp" + + + +namespace shasta { + namespace mode3 { + class LocalAssemblyVertex; + class LocalAssemblyEdge; + class LocalAssembly; + using LocalAssemblyBaseClass = boost::adjacency_list< + boost::listS, + boost::listS, + boost::bidirectionalS, + LocalAssemblyVertex, + LocalAssemblyEdge + >; + class LocalAssemblyDisplayOptions; + class LocalAssemblyMarkerIndexes; + } + class Assembler; +}; + + + +class shasta::mode3::LocalAssemblyDisplayOptions { +public: + + // If this is not open, no output takes place. + ostream& html; + + bool showGraph = false; + bool showOrientedReads = false; + bool showMarkers = false; + bool showVertices = false; + bool showVertexLabels = false; + bool showEdgeLabels = false; + bool showAssemblyDetails = false; + bool showDebugInformation = false; + + LocalAssemblyDisplayOptions(ostream& html) : html(html) {} +}; + + + +// A way to identify a marker in LocalAssembly, besides its id. +class shasta::mode3::LocalAssemblyMarkerIndexes { +public: + uint64_t i; // Index in orientedReadInfos + uint64_t j; // Index in OrientedReadInfo::markerInfos; +}; + + + +class shasta::mode3::LocalAssemblyVertex { +public: + uint64_t disjointSetId; + bool isAccessibleA = false; + bool isAccessibleB = false; +}; + + + +class shasta::mode3::LocalAssemblyEdge { +public: + + // Each marker interval is identified by the two markers. + vector< pair<LocalAssemblyMarkerIndexes, LocalAssemblyMarkerIndexes> > markerIntervals; + + uint64_t coverage() const + { + return markerIntervals.size(); + } + + // Consensus of the sequences contributes by each marker interval. + vector<Base> consensusSequence; + vector<uint64_t> consensusCoverage; +}; + + + +class shasta::mode3::LocalAssembly : public LocalAssemblyBaseClass { +public: + + // Hide class Base defined in boost::adjacency_list. + using Base = shasta::Base; + + // The oriented reads common between edgeIdA and edgeIdB are always + // used for assembly. 
The oriented reads that appear only + // on edgeIdA or edgeIdB are used for assembly under control + // of useA and useB. + // So, if useA and useB are both true (the default), the assembly uses the + // union of the oriented reads on edgeIdA and edgeIdB. + // If they are both false, the assembly uses the + // intersection of the oriented reads on edgeIdA and edgeIdB. + // If useA is true and useB is false, the assembly uses the + // oriented reads on edgeIdA, regardless of whether they appear on edgeIdB. + // If useA is false and useB is true, the assembly uses the + // oriented reads on edgeIdB, regardless of whether they appear on edgeIdA. + LocalAssembly( + const Assembler&, + MarkerGraphEdgeId edgeIdA, + MarkerGraphEdgeId edgeIdB, + uint64_t minVertexCoverage, // 0 = automatic + const LocalAssemblyDisplayOptions&, + const Mode3AssemblyOptions::LocalAssemblyOptions&, + bool useA = true, + bool useB = true); + + // Get the sequence between edgeIdA and edgeIdB. + // This does not include the sequences of edgeIdA and edgeIdB themselves. + void getSecondarySequence( + vector<Base>&) const; + + // Get the complete sequence, including the sequences of edgeIdA and edgeIdB. + void getCompleteSequence( + vector<Base>&) const; + +private: + + // Store constructor arguments. + const Assembler& assembler; + MarkerGraphEdgeId edgeIdA; + MarkerGraphEdgeId edgeIdB; + const LocalAssemblyDisplayOptions& options; + ostream& html; + + MarkerGraphVertexId vertexIdA; // The target vertex of marker graph edge edgeIdA. + MarkerGraphVertexId vertexIdB; // The target vertex of marker graph edge edgeIdA. + + void checkAssumptions() const; + + + + // A class used to store information about a marker of + // an oriented read used in this assembly. + // The ordinal and position are stored signed to facilitate manipulations + // that involve subtractions. 
+ class MarkerInfo { + public: + int64_t ordinal; + int64_t position; + KmerId kmerId; + + // An id for this marker, global to the LocalAssembly. + // This is the index of this marker in the disjoint sets data structure. + uint64_t id; + + // The id of the disjoint set this MarkerInfo belongs to. + uint64_t disjointSetId; + + }; + + + + // Information about the portion of an oriented read used in this assembly. + class OrientedReadInfo { + public: + OrientedReadId orientedReadId; + OrientedReadInfo(OrientedReadId orientedReadId) : + orientedReadId(orientedReadId) + {} + + // The ordinal of vertexIdA in this oriented read. + // Only initialized for oriented reads that appear in edgeIdA. + int64_t ordinalA = invalid<int64_t>; + bool isOnA() const + { + return ordinalA != invalid<int64_t>; + } + + // The ordinal of vertexIdB in this oriented read. + // Only initialized for oriented reads that appear in edgeIdB. + int64_t ordinalB = invalid<int64_t>; + bool isOnB() const + { + return ordinalB != invalid<int64_t>; + } + + // Note we are assuming that each oriented read appears once on edgeIdA, edgeIdB, + // and their source and target vertices. + + // Order OrientedReadInfos by OrientedReadId. + bool operator<(const OrientedReadInfo& that) const + { + return orientedReadId < that.orientedReadId; + } + + + // The ordinal offset between vertexIdA and vertexIdB. + int64_t ordinalOffset() const + { + SHASTA_ASSERT(isOnA() and isOnB()); + return ordinalB - ordinalA; + } + + // Information about the markers of this read we will use in this assembly. + // The first one is at ordinal firstOrdinal. + // The last one is at ordinal lastOrdinal. + vector<MarkerInfo> markerInfos; + + // The first and last ordinals of this oriented read used for this assembly. + // For reads on edgeIdA, firstOrdinal equals ordinalA. + // For reads on edgeIdB, lastOrdinal equals ordinalB. 
+ int64_t firstOrdinal() const // Ordinal of the first stored marker; asserts that markerInfos is not empty. + { + SHASTA_ASSERT(not markerInfos.empty()); + return markerInfos.front().ordinal; + } + int64_t lastOrdinal() const // Ordinal of the last stored marker; asserts that markerInfos is not empty. + { + SHASTA_ASSERT(not markerInfos.empty()); + return markerInfos.back().ordinal; + } + + }; + + // Get the base position of a marker in an oriented read + // given the ordinal. + int64_t basePosition(OrientedReadId, int64_t ordinal) const; + + // For assembly, we use the union of the oriented reads + // that appear in edgeIdA and edgeIdB, and that have positive ordinal offset. + // OrientedReadInfos are stored sorted by OrientedReadId. + vector<OrientedReadInfo> orientedReadInfos; + void gatherOrientedReads(bool useA, bool useB); + void writeOrientedReads() const; + void writeOrientedReadsSequences() const; + + // Estimated offset in bases between vertexIdA and vertexIdB. + // The estimate is done using the oriented reads that appear + // both in edgeIdA and edgeIdB. + // If the offset cannot be estimated because there are no + // common oriented reads between edgeIdA and edgeIdB, + // it is set to invalid<int64_t>. + // In that case, or if the offset is negative, + // the assembly fails, which results in empty secondary sequence. + int64_t estimatedABOffset; + void estimateOffset(); + + // Fill in the markerInfos vector of each read. + void gatherMarkers(double estimatedOffsetRatio); + void writeMarkers(); + + // Add the marker at given ordinal to the i-th oriented read. + void addMarkerInfo(uint64_t i, int64_t ordinal); + + // Compute alignments and use them to create the disjoint set data structure, + // from which the marker graph will be created. + // maxDrift is the maximum tolerated length drift of each read. + // Used to compute the band for banded alignments. + void alignAndDisjointSets( + uint64_t matchScore, + uint64_t mismatchScore, + uint64_t gapScore, + uint64_t maxSkipBases, + double maxDrift, + uint64_t minHalfBand, + double minScoreRatio + ); + + // This stores the markers in each disjoint set. 
+ // Each marker is stored as a LocalAssemblyMarkerIndexes (i, j) + // where i is the index of the OrientedReadInfo in orientedReadInfos + // and j is the index of the MarkerInfo in orientedReadInfo.markerInfos. + // Keyed by the disjoint set id (the same id is also stored in each marker). + std::map<uint64_t, vector<LocalAssemblyMarkerIndexes> > disjointSetsMap; + + vector<uint64_t> disjointSetsSizeHistogram; + + // Create vertices. Each disjoint set with at least minVertexCoverage markers + // generates a vertex. + // If minVertexCoverage is 0, a suitable value is computed. + // This returns the value of minVertexCoverage actually used. + uint64_t createVertices( + uint64_t minVertexCoverage, + double vertexSamplingRate); // Only used if minVertexCoverage is 0. + void removeVertex(vertex_descriptor); + + // The disjoint sets corresponding to vertexIdA and vertexIdB. + // Those will always generate a vertex regardless of coverage. + uint64_t disjointSetIdA = invalid<uint64_t>; + uint64_t disjointSetIdB = invalid<uint64_t>; + + // Map that gives the vertex descriptor corresponding to a disjoint set id, if any. + std::map<uint64_t, vertex_descriptor> vertexMap; + + // Create edges by following the reads. + void createEdges(); + void removeAllEdges(); + + // Remove strongly connected components. + // Returns the number of vertices removed. + uint64_t removeStrongComponents(); + + // Remove vertices that are not accessible from vertexIdA + // or from which vertexIdB is not accessible. + // Returns the number of vertices that were removed. + uint64_t removeInaccessibleVertices(); + + // Possible courses of action when a long MSA is encountered. + enum class LongMsaPolicy { + throwException, + assembleAtLowCoverage + }; + + // The assembly path, beginning at vertexIdA and ending at vertexIdB. + // This means that the sequences of edgeIdA and edgeIdB are not included. 
+ vector<edge_descriptor> assemblyPath; + void findAssemblyPath(); + void assembleAssemblyPathEdges(uint64_t maxMsaLength, LongMsaPolicy); + void assembleEdge( + uint64_t maxMsaLength, + LongMsaPolicy, + edge_descriptor); + + // Graphviz output. + void writeGraph() const; + void writeGraph(const string& title); + void writeGraphviz(const string& fileName) const; + void writeGraphviz(ostream&) const; + + void writeCoverageCharacterToHtml(uint64_t coverage) const; + + // Remove all vertices and edges and clear the vertexMap and assemblyPath. + // All other data are left alone. + void clear(); +}; + diff --git a/src/mode3-LocalAssemblyGraph.cpp b/src/mode3-LocalAssemblyGraph.cpp deleted file mode 100644 index 7798147..0000000 --- a/src/mode3-LocalAssemblyGraph.cpp +++ /dev/null @@ -1,1576 +0,0 @@ -// Shasta. -#include "mode3-LocalAssemblyGraph.hpp" -#include "mode3-AssemblyPath.hpp" -#include "mode3-SegmentPairInformation.hpp" -#include "computeLayout.hpp" -#include "html.hpp" -#include "HttpServer.hpp" -#include "MarkerGraph.hpp" -#include "MurmurHash2.hpp" -#include "writeGraph.hpp" -using namespace shasta; -using namespace mode3; - -// Boost libraries. -#include <boost/geometry/algorithms/make.hpp> -#include <boost/geometry/algorithms/length.hpp> -#include <boost/graph/adjacency_list.hpp> -#include <boost/graph/iteration_macros.hpp> -#include <boost/graph/fruchterman_reingold.hpp> -#include <boost/graph/random_layout.hpp> -#include <boost/graph/topology.hpp> - -// Standard library. 
-#include <map> -#include <queue> -#include "tuple.hpp" - - - -// Create the LocalAssemblyGraph using a BFS -// that starts at the specified vertex and moves away -// (in both directions) up to the specified distance -mode3::LocalAssemblyGraph::LocalAssemblyGraph( - const MarkerGraph& markerGraph, - const AssemblyGraph& assemblyGraph, - uint64_t startSegmentId, - uint64_t maxDistance) : - markerGraph(markerGraph), - assemblyGraph(assemblyGraph), - maxDistance(maxDistance) -{ - LocalAssemblyGraph& localAssemblyGraph= *this; - - // The BFS queue. - std::queue<uint64_t> q; - - // Map segments in the AssemblyGraph to vertices in - // the LocalAssemblyGraph. - std::map<uint64_t, vertex_descriptor> segmentMap; - - // Initialize the BFS. - if(maxDistance > 0) { - q.push(startSegmentId); - } - const vertex_descriptor vStart = addVertex(startSegmentId, 0); - segmentMap.insert(make_pair(startSegmentId, vStart)); - - - - // BFS. - while(not q.empty()) { - - // Dequeue a segment. - const uint64_t segmentId0 = q.front(); - q.pop(); - const vertex_descriptor v0 = segmentMap[segmentId0]; - const uint64_t distance0 = localAssemblyGraph[v0].distance; - const uint64_t distance1 = distance0 + 1; - - // Loop over children. - for(const uint64_t linkId: assemblyGraph.linksBySource[segmentId0]) { - const mode3::AssemblyGraph::Link& link = assemblyGraph.links[linkId]; - const uint64_t segmentId1 = link.segmentId1; - if(segmentMap.find(segmentId1) != segmentMap.end()) { - // We already encountered this segment. - continue; - } - const vertex_descriptor v1 = addVertex(segmentId1, distance1); - segmentMap.insert(make_pair(segmentId1, v1)); - if(distance1 < maxDistance) { - q.push(segmentId1); - } - } - - // Loop over parents. 
- for(const uint64_t linkId: assemblyGraph.linksByTarget[segmentId0]) { - const mode3::AssemblyGraph::Link& link = assemblyGraph.links[linkId]; - const uint64_t segmentId1 = link.segmentId0; - if(segmentMap.find(segmentId1) != segmentMap.end()) { - // We already encountered this segment. - continue; - } - const vertex_descriptor v1 = addVertex(segmentId1, distance1); - segmentMap.insert(make_pair(segmentId1, v1)); - if(distance1 < maxDistance) { - q.push(segmentId1); - } - } - } - - - - // Add the edges. - for(const auto& p: segmentMap) { - const uint64_t segmentId0 = p.first; - const vertex_descriptor v0 = p.second; - - for(const uint64_t linkId: assemblyGraph.linksBySource[segmentId0]) { - const mode3::AssemblyGraph::Link& link = assemblyGraph.links[linkId]; - const uint64_t segmentId1 = link.segmentId1; - const auto it1 = segmentMap.find(segmentId1); - if(it1 == segmentMap.end()) { - continue; - } - const vertex_descriptor v1 = it1->second; - boost::add_edge(v0, v1, LocalAssemblyGraphEdge(linkId), localAssemblyGraph); - } - } - -} - - - -mode3::LocalAssemblyGraphVertex::LocalAssemblyGraphVertex( - uint64_t segmentId, - uint64_t distance) : - segmentId(segmentId), - distance(distance) -{ -} - - - -mode3::LocalAssemblyGraphVertex::LocalAssemblyGraphVertex() : - segmentId(0), - distance(0) -{ -} - - - -mode3::LocalAssemblyGraph::vertex_descriptor mode3::LocalAssemblyGraph::addVertex( - uint64_t segmentId, - uint64_t distance) -{ - return add_vertex(LocalAssemblyGraphVertex(segmentId, distance), *this); -} - - - -void mode3::LocalAssemblyGraph::writeHtml(ostream& html, const SvgOptions& options) const -{ - // Write the svg object. - html << "<div style='display: inline-block; vertical-align:top'>"; - vector<mode3::AssemblyGraph::AnalyzeSubgraphClasses::Cluster> clusters; - writeSvg(html, options, clusters); - html << "</div>"; - addSvgDragAndZoom(html); - - // Side panel. - html << "<div style='display: inline-block'>"; - - - - // Highlight a segment. 
- html << R"stringDelimiter( - <script> - function highlightSegment() - { - // Get the segment id from the input field. - inputField = document.getElementById("highlightInputField"); - segmentId = inputField.value; - inputField.value = ""; - - // Make it dashed and wider. - var element = document.getElementById("Segment-" + segmentId); - var thickness = element.getAttribute("stroke-width"); - element.style.strokeDasharray = 0.2 * thickness; - element.setAttribute("stroke-width", 2. * thickness); - } - </script> - Highlight segment - <input id=highlightInputField type=text onchange="highlightSegment()" size=10> - )stringDelimiter"; - - - - // Zoom to a segment. - html << R"stringDelimiter( - <script> - function zoomToSegment() - { - // Get the segment id from the input field. - inputField = document.getElementById("zoomInputField"); - segmentId = inputField.value; - inputField.value = ""; - - zoomToGivenSegment(segmentId); - } - - function zoomToGivenSegment(segmentId) - { - - // Find the bounding box and its center. - var element = document.getElementById("Segment-" + segmentId); - var box = element.getBBox(); - var xCenter = box.x + 0.5 * box.width; - var yCenter = box.y + 0.5 * box.height; - - // Change the viewbox of the svg to be a bit larger than a square - // containing the bounding box. - var enlargeFactor = 5.; - var size = enlargeFactor * Math.max(box.width, box.height); - width = size; - height = size; - x = xCenter - 0.5 * size; - y = yCenter - 0.5 * size; - var svg = document.querySelector('svg'); - svg.setAttribute('viewBox', `${x} ${y} ${size} ${size}`); - ratio = size / svg.getBoundingClientRect().width; - - } - </script> - <p>Zoom to segment - <input id=zoomInputField type=text onchange="zoomToSegment()" size=10> - )stringDelimiter"; - - - - // Initial zoom to segment of interest. 
- if(options.segmentColoring == "path") { - html << "\n<script>zoomToGivenSegment(" << options.pathStart << ");</script>\n"; - } - if( - options.segmentColoring == "byCommonReads" or - options.segmentColoring == "byJaccard" or - options.segmentColoring == "byRawJaccard" or - options.segmentColoring == "byUnexplainedFractionOnReferenceSegment" or - options.segmentColoring == "byUnexplainedFractionOnDisplayedSegment" - ) { - html << "\n<script>zoomToGivenSegment(" << options.referenceSegmentId << ");</script>\n"; - } - - - - // Tables that will be automatically updated when the mouse is on a segment. - html << R"zzz( -<p> -Hover on a segment to populate the tables below. -<p> -<table style='font-size:9'> -<tr><th class='left'>Segment id<td id='segmentIdCell' class=centered style='width:8em'> -<tr><th class='left'>Distance from start segment<td id='distanceCell' class=centered style='width:8em'> -<tr><th class='left'>Path length<td id='pathLengthCell' class=centered style='width:8em'> -<tr><th class='left'>Average edge coverage<td id='coverageCell' class=centered style='width:8em'> -<tr><th class='left'>Cluster id<td id='clusterIdCell' class=centered style='width:8em'> -</table> -<p> -Comparison of read compositions -<p> -<table> - -<tr> -<td> -<th>Reference<br>segment -<th>Displayed<br>segment - -<tr> -<th class='left'>Total -<th id='totalReferenceCell'> -<th id='totalDisplayedCell'> - -<tr> -<th class='left'>Common -<th id='commonReferenceCell'> -<th id='commonDisplayedCell'> - -<tr> -<th class='left'>Short -<th id='shortReferenceCell'> -<th id='shortDisplayedCell'> - -<tr> -<th class='left'>Jaccard -<th id='jaccardReferenceCell'> -<th id='jaccardDisplayedCell'> - -<tr> -<th class='left'>Raw Jaccard -<th id='rawJaccardReferenceCell'> -<th id='rawJaccardDisplayedCell'> - -<tr> -<th class='left'>Unexplained -<th id='unexplainedReferenceCell'> -<th id='unexplainedDisplayedCell'> - -<tr> -<th class='left'>Unexplained fraction -<th id='unexplainedFractionReferenceCell'> 
-<th id='unexplainedFractionDisplayedCell'> - -</table> - -<script> -function onMouseEnterSegment(id, distance, pathLength, coverage, clusterId, - totalReference, totalDisplayed, - shortReference, shortDisplayed, - common, - unexplainedReference, unexplainedDisplayed) -{ - document.getElementById('segmentIdCell').innerHTML = id; - document.getElementById('distanceCell').innerHTML = distance; - document.getElementById('pathLengthCell').innerHTML = pathLength; - document.getElementById('coverageCell').innerHTML = coverage; - if(clusterId != 18446744073709551615) { - document.getElementById('clusterIdCell').innerHTML = clusterId; - } - - document.getElementById('totalReferenceCell').innerHTML = totalReference; - document.getElementById('totalDisplayedCell').innerHTML = totalDisplayed; - document.getElementById('commonReferenceCell').innerHTML = common; - document.getElementById('commonDisplayedCell').innerHTML = common; - - if(common > 0) { - document.getElementById('shortReferenceCell').innerHTML = shortReference; - document.getElementById('shortDisplayedCell').innerHTML = shortDisplayed; - jaccard = (common / (common + unexplainedReference + unexplainedDisplayed)).toFixed(2); - rawJaccard = (common / (totalReference + totalDisplayed - common)).toFixed(2); - document.getElementById('jaccardReferenceCell').innerHTML = jaccard; - document.getElementById('jaccardDisplayedCell').innerHTML = jaccard; - document.getElementById('rawJaccardReferenceCell').innerHTML = rawJaccard; - document.getElementById('rawJaccardDisplayedCell').innerHTML = rawJaccard; - document.getElementById('unexplainedReferenceCell').innerHTML = unexplainedReference; - document.getElementById('unexplainedDisplayedCell').innerHTML = unexplainedDisplayed; - document.getElementById('unexplainedFractionReferenceCell').innerHTML = - (unexplainedReference / (common + unexplainedReference)).toFixed(2); - document.getElementById('unexplainedFractionDisplayedCell').innerHTML = - (unexplainedDisplayed / (common 
+ unexplainedDisplayed)).toFixed(2); - } -} -function onMouseExitSegment() -{ - document.getElementById('segmentIdCell').innerHTML = ''; - document.getElementById('distanceCell').innerHTML = ''; - document.getElementById('pathLengthCell').innerHTML = ''; - document.getElementById('coverageCell').innerHTML = ''; - document.getElementById('clusterIdCell').innerHTML = ''; - - document.getElementById('totalReferenceCell').innerHTML = ''; - document.getElementById('totalDisplayedCell').innerHTML = ''; - document.getElementById('shortReferenceCell').innerHTML = ''; - document.getElementById('shortDisplayedCell').innerHTML = ''; - document.getElementById('commonReferenceCell').innerHTML = ''; - document.getElementById('commonDisplayedCell').innerHTML = ''; - document.getElementById('jaccardReferenceCell').innerHTML = ''; - document.getElementById('jaccardDisplayedCell').innerHTML = ''; - document.getElementById('unexplainedReferenceCell').innerHTML = ''; - document.getElementById('unexplainedDisplayedCell').innerHTML = ''; - document.getElementById('unexplainedFractionReferenceCell').innerHTML = ''; - document.getElementById('unexplainedFractionDisplayedCell').innerHTML = ''; -} -</script> - )zzz"; - - - - // Change segment thickness - html << R"stringDelimiter( - <p><table> - <tr><th class=left>Segment thickness<td> - <button type='button' onClick='segmentThickness(0.1)' style='width:3em'>---</button> - <button type='button' onClick='segmentThickness(0.5)' style='width:3em'>--</button> - <button type='button' onClick='segmentThickness(0.8)' style='width:3em'>-</button> - <button type='button' onClick='segmentThickness(1.25)' style='width:3em'>+</button> - <button type='button' onClick='segmentThickness(2.)' style='width:3em'>++</button> - <button type='button' onClick='segmentThickness(10.)' style='width:3em'>+++</button> - <script> - function segmentThickness(factor) - { - const group = document.getElementById('LocalAssemblyGraph-segments'); - descendants = 
group.querySelectorAll("path"); - for (let i=0; i<descendants.length; i++) { - path = descendants[i]; - path.setAttribute('stroke-width', factor * path.getAttribute('stroke-width')); - } - } - </script> - )stringDelimiter"; - - - - // Change link thickness - html << R"stringDelimiter( - <tr><th class=left>Link thickness<td> - <button type='button' onClick='linkThickness(0.1)' style='width:3em'>---</button> - <button type='button' onClick='linkThickness(0.5)' style='width:3em'>--</button> - <button type='button' onClick='linkThickness(0.8)' style='width:3em'>-</button> - <button type='button' onClick='linkThickness(1.25)' style='width:3em'>+</button> - <button type='button' onClick='linkThickness(2.)' style='width:3em'>++</button> - <button type='button' onClick='linkThickness(10.)' style='width:3em'>+++</button> - <script> - function linkThickness(factor) - { - const group1 = document.getElementById('LocalAssemblyGraph-links'); - for (let i=0; i<group1.children.length; i++) { - group2 = group1.children[i]; - if(group2.tagName == 'g') { - for (let j=0; j<group2.children.length; j++) { - path = group2.children[j]; - if(path.tagName == 'path') { - path.setAttribute('stroke-width', factor * path.getAttribute('stroke-width')); - } - } - } - } - } - </script> - )stringDelimiter"; - - - - // Zoom buttons. 
- html << R"stringDelimiter( - <tr title='Or use the mouse wheel.'><th class=left>Zoom<td> - <button type='button' onClick='zoomSvg(0.1)' style='width:3em'>---</button> - <button type='button' onClick='zoomSvg(0.5)' style='width:3em'>--</button> - <button type='button' onClick='zoomSvg(0.8)' style='width:3em'>-</button> - <button type='button' onClick='zoomSvg(1.25)' style='width:3em'>+</button> - <button type='button' onClick='zoomSvg(2.)' style='width:3em'>++</button> - <button type='button' onClick='zoomSvg(10.)' style='width:3em'>+++</button> - </table> - )stringDelimiter"; - - - // Code to display one local cluster at a time, with a button - // to cycle through them. - if(options.segmentColoring == "byLocalCluster") { - html << - "<br>Found " << clusters.size() << " clusters. " - "Displaying cluster <span id='currentCluster'></span>" - "<br><button onClick='previousCluster()'>Previous<br>cluster</button>" - "<button onClick='nextCluster()'>Next<br>cluster</button>" - "<script>\n" - "var clusters = ["; - for(uint64_t i=0; i<clusters.size(); i++) { - html << "["; - const auto & cluster = clusters[i]; - for(uint64_t j=0; j<cluster.segments.size(); j++) { - html << cluster.segments[j].first; - if(j != cluster.segments.size() - 1) { - html << ","; - } - } - html << "]"; - if(i != clusters.size() -1) { - html << ","; - } - } - html << "];\n"; - - html << R"stringDelimiter( - - function clusterColor(clusterId) - { - var ratio = clusterId / clusters.length; - return 'hsl(' + Math.round(360*ratio) + ', 85%, 70%)'; - } - - function highlightCluster(clusterId, color) - { - for(i=0; i<clusters[clusterId].length; i++) { - segmentId = clusters[clusterId][i]; - document.getElementById("Segment-" + segmentId).style.stroke = color; - document.getElementById("marker" + segmentId).style.fill = color; - } - } - var currentCluster = 0; - highlightCluster(currentCluster, clusterColor(currentCluster)); - document.getElementById("currentCluster").innerHTML = currentCluster; - 
function nextCluster() - { - highlightCluster(currentCluster, "Black"); - currentCluster = currentCluster + 1; - if(currentCluster == clusters.length) { - currentCluster = 0; - } - highlightCluster(currentCluster, clusterColor(currentCluster)); - document.getElementById("currentCluster").innerHTML = currentCluster; - } - function previousCluster() - { - highlightCluster(currentCluster, "Black"); - if(currentCluster == 0) { - currentCluster = clusters.length; - } - currentCluster = currentCluster - 1; - highlightCluster(currentCluster, clusterColor(currentCluster)); - document.getElementById("currentCluster").innerHTML = currentCluster; - } - </script> - - )stringDelimiter"; - } - - // End of side panel. - html << "</div>"; - -} - - - -void mode3::LocalAssemblyGraph::writeSvg( - const string& fileName, - const SvgOptions& options, - vector<mode3::AssemblyGraph::AnalyzeSubgraphClasses::Cluster>& clusters) const -{ - ofstream svg(fileName); - writeSvg(svg, options, clusters); -} -void mode3::LocalAssemblyGraph::writeSvg( - ostream& svg, - const SvgOptions& options, - vector<mode3::AssemblyGraph::AnalyzeSubgraphClasses::Cluster>& clusters - ) const -{ - const LocalAssemblyGraph& localAssemblyGraph = *this; - - - // If necessary, compute a map containing a SegmentPairInformation object - // containing pair information between the reference segment - // and each segment in the local assembly graph. - const bool doSegmentPairComputations = true; - std::map<vertex_descriptor, SegmentPairInformation> segmentPairInformationTable; - mode3::AssemblyGraph::SegmentOrientedReadInformation referenceSegmentInfo; - if(doSegmentPairComputations) { - - // Find oriented reads in the reference segment. - assemblyGraph.getOrientedReadsOnSegment(options.referenceSegmentId, referenceSegmentInfo); - - // Loop over segments in the localAssemblyGraph. 
- BGL_FORALL_VERTICES(v, localAssemblyGraph, LocalAssemblyGraph){ - mode3::AssemblyGraph::SegmentOrientedReadInformation segmentInfo; - assemblyGraph.getOrientedReadsOnSegment( - localAssemblyGraph[v].segmentId, segmentInfo); - - SegmentPairInformation segmentPairInformation; - assemblyGraph.analyzeSegmentPair( - options.referenceSegmentId, localAssemblyGraph[v].segmentId, - referenceSegmentInfo, segmentInfo, - assemblyGraph.markers, segmentPairInformation); - - segmentPairInformationTable.insert(make_pair(v, segmentPairInformation)); - } - } - - - std::map<uint64_t, vector<pair<uint64_t, bool> > > pathSegments; // map(segmentId, (positionsInPath, is referenceSegment)). - AssemblyPath path; - if(options.segmentColoring == "path") { - if(options.pathDirection=="forward" or options.pathDirection=="backward") { - // Forward or backward. - assemblyGraph.createAssemblyPath(options.pathStart, - (options.pathDirection == "forward") ? 0 : 1, path); - if(options.pathDirection == "backward") { - reverse(path.segments.begin(), path.segments.end()); - } - } else { - // Bidirectional. - AssemblyPath forwardPath; - AssemblyPath backwardPath; - assemblyGraph.createAssemblyPath(options.pathStart, 0, forwardPath); - assemblyGraph.createAssemblyPath(options.pathStart, 1, backwardPath); - // Stitch them together, making sure not to repeat the starting segment. 
- path.segments.clear(); - copy(backwardPath.segments.rbegin(), backwardPath.segments.rend(), back_inserter(path.segments)); - copy(forwardPath.segments.begin() + 1, forwardPath.segments.end(), back_inserter(path.segments)); - } - for(uint64_t position=0; position<path.segments.size(); position++) { - const AssemblyPathSegment& segment = path.segments[position]; - const uint64_t segmentId = segment.id; - pathSegments[segmentId].push_back(make_pair(position, segment.isPrimary)); - } - svg << "\nPath of length " << path.segments.size() << " starting at segment " << path.segments.front().id << - " and ending at segment " << path.segments.back().id << "<br>"; - - ofstream csv("Path.csv"); - csv << "Position,SegmentId,Reference\n"; - for(uint64_t position=0; position<path.segments.size(); position++) { - const AssemblyPathSegment& segment = path.segments[position]; - csv << position << "," << segment.id << "," << int(segment.isPrimary) << "\n"; - } - - // If requested, assemble path sequence. - if(options.assemblePathSequence) { - path.assemble(assemblyGraph); - } - } - - - - // If coloring by local cluster, call mode3::AssemblyGraph::analyzeSubgraph, - // passing as input all the segments in the LocalAssemblyGraph - // except those at maximum distance. - if(options.segmentColoring == "byLocalCluster") { - vector<uint64_t> segmentIds; - BGL_FORALL_VERTICES(v, localAssemblyGraph, LocalAssemblyGraph) { - const LocalAssemblyGraphVertex& vertex = localAssemblyGraph[v]; - if(vertex.distance != maxDistance) { - segmentIds.push_back(vertex.segmentId); - } - } - assemblyGraph.analyzeSubgraph(segmentIds, clusters, true); - - } - - - - // If coloring by cluster id only some clusters, create a color map - // for the clusters to be colored. 
- std::map<uint64_t, string> clusterColorMap; - if(options.segmentColoring == "byCluster") { - const uint64_t clusterCount = options.clustersToBeColored.size(); - if(clusterCount > 0) { - for(uint64_t i=0; i<clusterCount; i++) { - const uint64_t hue = uint64_t(std::round(double(i) * 360. / double(clusterCount))); - const string color = "hsl(" + to_string(hue) + ",100%, 50%)"; - const uint64_t clusterId = options.clustersToBeColored[i]; - clusterColorMap.insert(make_pair(clusterId, color)); - } - } - } - - - - using boost::geometry::add_point; - using boost::geometry::expand; - using boost::geometry::make_inverse; - using boost::geometry::multiply_value; - using boost::geometry::subtract_point; - using Box = boost::geometry::model::box<Point>; - - // Compute the view box. - Box box = make_inverse<Box>(); - BGL_FORALL_VERTICES(v, localAssemblyGraph, LocalAssemblyGraph) { - const LocalAssemblyGraphVertex& vertex = localAssemblyGraph[v]; - SHASTA_ASSERT(vertex.position.size() >= 2); - const Point& p1 = vertex.position.front(); - const Point& p2 = vertex.position.back(); - - expand(box, p1); - expand(box, p2); - } - Point minCorner = box.min_corner(); - Point maxCorner = box.max_corner(); - - // Add a bit of extra space. - Point delta = maxCorner; - subtract_point(delta, minCorner); - multiply_value(delta, 0.05); - subtract_point(minCorner, delta); - add_point(maxCorner, delta); - - - - // Figure out the required size of the viewbox. - Point diagonal = maxCorner; - subtract_point(diagonal, minCorner); - - // Begin the svg. - const string svgId = "LocalAssemblyGraph"; - svg << "\n<svg id='" << svgId << - "' width='" << options.sizePixels << - "' height='" << options.sizePixels << - "' viewbox='" << minCorner.x() << " " << minCorner.y() << " " << - diagonal.x() << " " << - diagonal.y() << "'" - " style='border-style:solid;border-color:Black;'" - ">\n"; - - - - // Write the links first, so they don't overwrite the segments. 
- svg << "<g id='" << svgId << "-links'>\n"; - BGL_FORALL_EDGES(e, localAssemblyGraph, LocalAssemblyGraph) { - const uint64_t linkId = localAssemblyGraph[e].linkId; - const AssemblyGraph::Link& link = assemblyGraph.links[linkId]; - - // Access the LocalAssemblyGraph vertices corresponding to - // the two segments of this Link and extract some information - // from them. - const vertex_descriptor v1 = source(e, localAssemblyGraph); - const vertex_descriptor v2 = target(e, localAssemblyGraph); - const LocalAssemblyGraphVertex& vertex1 = localAssemblyGraph[v1]; - const LocalAssemblyGraphVertex& vertex2 = localAssemblyGraph[v2]; - const uint64_t segmentId1 = vertex1.segmentId; - const uint64_t segmentId2 = vertex2.segmentId; - - // Get the positions of the ends of this link. - SHASTA_ASSERT(vertex1.position.size() >= 2); - SHASTA_ASSERT(vertex2.position.size() >= 2); - const Point& p1 = vertex1.position.back(); - const Point& p2 = vertex2.position.front(); - const double length = boost::geometry::distance(p1, p2); - - // Get the tangents and compute the control points. - const double controlPointDistance = 0.25 * length; - const Point& t1 = vertex1.t2; - const Point& t2 = vertex2.t1; - Point q1 = t1; - multiply_value(q1, controlPointDistance); - add_point(q1, p1); - Point q2 = t2; - multiply_value(q2, controlPointDistance); - add_point(q2, p2); - - const double linkThickness = - options.minimumLinkThickness + - options.additionalLinkThicknessPerRead * double(assemblyGraph.linkCoverage(linkId) - 1); - - const string dash = - link.segmentsAreAdjacent ? "" : - " stroke-dasharray='0 " + to_string(1.5 * linkThickness) + "'"; - - // If the link participates in a path, color it consistently with the - // segments is joins. 
- string linkColor = options.linkColor; - if(options.segmentColoring == "path") { - const auto it1 = pathSegments.find(segmentId1); - if(it1 != pathSegments.end()) { - const auto positions1 = it1->second; - SHASTA_ASSERT(not positions1.empty()); - const auto it2 = pathSegments.find(segmentId2); - if(it2 != pathSegments.end()) { - const auto positions2 = it2->second; - SHASTA_ASSERT(not positions2.empty()); - if(positions1.size()==1 and positions2.size()==1) { - const uint64_t position1 = positions1.front().first; - const uint64_t position2 = positions2.front().first; - if(position2 == position1 + 1) { - const uint32_t hue = uint32_t( - std::round(120. * double(position1 + position2) / double(path.segments.size()))); - linkColor = "hsl(" + to_string(hue) + ",100%, 20%)"; - } - } else { - linkColor = "Fuchsia"; - } - } - } - } - - svg << - "<g>" - // "<a href='exploreMode3AssemblyGraphLink?linkId=" << linkId << "'>" - "<title>" - "Link " << linkId << - " from segment " << segmentId1 << - " to segment " << segmentId2 << - ", coverage " << assemblyGraph.linkCoverage(linkId) << - ", separation " << link.separation << - "</title>" - "<path d=" - "'M " << p1.x() << " " << p1.y() << - " C " << q1.x() << " " << q1.y() << ", " - << q2.x() << " " << q2.y() << "," - << p2.x() << " " << p2.y() << "'" - " stroke='" << linkColor << "'" << - dash << - " stroke-width='" << linkThickness << "'" - " stroke-linecap='round'" - " fill='transparent'" - // " vector-effect='non-scaling-stroke'" - " onclick='if(event.ctrlKey) {location.href=\"exploreMode3AssemblyGraphLink?linkId=" << linkId << "\";}'" - "/>" - // "</a>" - "</g>\n"; - - } - svg << "</g>\n"; - - - - // Write the segments. - svg << "<g id='" << svgId << "-segments'>\n"; - BGL_FORALL_VERTICES(v, localAssemblyGraph, LocalAssemblyGraph) { - const LocalAssemblyGraphVertex& vertex = localAssemblyGraph[v]; - const uint64_t distance = localAssemblyGraph[v].distance; - - // Get the positions of the ends of this segment. 
- SHASTA_ASSERT(vertex.position.size() >= 2); - const Point& p1 = vertex.position.front(); - const Point& p2 = vertex.position.back(); - const double length = boost::geometry::distance(p1, p2); - - // Get the tangents and compute the control points. - const double controlPointDistance = 0.25 * length; - const Point& t1 = vertex.t1; - const Point& t2 = vertex.t2; - Point q1 = t1; - multiply_value(q1, -controlPointDistance); - add_point(q1, p1); - Point q2 = t2; - multiply_value(q2, -controlPointDistance); - add_point(q2, p2); - - const uint64_t segmentId = localAssemblyGraph[v].segmentId; - - - - // Decide the color for this segment. - string color; - if(distance == maxDistance) { - color = options.segmentAtMaxDistanceColor; - } else { - if(options.segmentColoring == "random") { - color = randomSegmentColor(segmentId); - } else if(options.segmentColoring == "uniform") { - color = options.segmentColor; - } else if(options.segmentColoring == "byCommonReads") { - const uint64_t commonCount = segmentPairInformationTable[v].commonCount; - double fraction; - if(options.greenThreshold) { - fraction = min(1., double(commonCount) / double(options.greenThreshold)); - } else { - fraction = double(commonCount) / double(referenceSegmentInfo.infos.size()); - } - const uint64_t hue = uint64_t(std::round(fraction * 120.)); - color = "hsl(" + to_string(hue) + ",100%, 50%)"; - } else if(options.segmentColoring == "byJaccard") { - const auto& pairInfo = segmentPairInformationTable[v]; - if(pairInfo.commonCount > 0) { - const double jaccard = pairInfo.jaccard(); - const uint64_t hue = uint64_t(std::round(jaccard * 120.)); - color = "hsl(" + to_string(hue) + ",100%, 50%)"; - } else { - color = "blue"; - } - } else if(options.segmentColoring == "byRawJaccard") { - const auto& pairInfo = segmentPairInformationTable[v]; - if(pairInfo.commonCount > 0) { - const double rawJaccard = pairInfo.rawJaccard(); - const uint64_t hue = uint64_t(std::round(rawJaccard * 120.)); - color = "hsl(" + 
to_string(hue) + ",100%, 50%)"; - } else { - color = "blue"; - } - } else if(options.segmentColoring == "byUnexplainedFractionOnReferenceSegment") { - const auto& pairInfo = segmentPairInformationTable[v]; - if(pairInfo.commonCount > 0) { - const double fraction = 1. - pairInfo.unexplainedFraction(0); - const uint64_t hue = uint64_t(std::round(fraction * 120.)); - color = "hsl(" + to_string(hue) + ",100%, 50%)"; - } else { - color = "blue"; - } - } else if(options.segmentColoring == "byUnexplainedFractionOnDisplayedSegment") { - const auto& pairInfo = segmentPairInformationTable[v]; - if(pairInfo.commonCount > 0) { - const double fraction = 1. - pairInfo.unexplainedFraction(1); - const uint64_t hue = uint64_t(std::round(fraction * 120.)); - color = "hsl(" + to_string(hue) + ",100%, 50%)"; - } else { - color = "blue"; - } - } else if(options.segmentColoring == "byCluster") { - const uint64_t clusterId = assemblyGraph.clusterIds[segmentId]; - if(clusterId == std::numeric_limits<uint64_t>::max()) { - color = "Gray"; - } else { - if(options.clustersToBeColored.empty()) { - // We are coloring all cluster. Use a hash function to decide the color. - const uint32_t hashValue = MurmurHash2(&clusterId, sizeof(clusterId), uint32_t(options.hashSeed)); - const uint32_t hue = hashValue % 360; - color = "hsl(" + to_string(hue) + ",100%, 50%)"; - } else { - // We are only coloring some segments. - auto it = clusterColorMap.find(clusterId); - if(it == clusterColorMap.end()) { - color = "Black"; - } else { - color = it->second; - } - } - } - } else if(options.segmentColoring == "path") { - auto it = pathSegments.find(segmentId); - if(it == pathSegments.end()) { - color = "Black"; - } else { - const auto positions = it->second; - SHASTA_ASSERT(not positions.empty()); - if(positions.size() == 1) { - const auto& p = positions.front(); - const uint64_t positionInPath = p.first; - const bool isReferenceSegment = p.second; - const uint32_t hue = uint32_t( - std::round(240. 
* double(positionInPath) / double(path.segments.size()))); - color = "hsl(" + to_string(hue) + ",100%, " + (isReferenceSegment ? "40%" : "70%") + ")"; - } else { - // This segment appears more than once on the path. - color = "Fuchsia"; - } - } - } else { - color = "Black"; - } - } - - - - // Get the oriented reads and average edge coverage. - vector<OrientedReadId> orientedReadIds; - const double averageEdgeCoverage = assemblyGraph.findOrientedReadsOnSegment(segmentId, orientedReadIds); - - // Create a marker to show the arrow for this segment. - const string arrowMarkerName = "arrow" + to_string(segmentId); - svg << - "<defs>\n" - "<marker id='" << arrowMarkerName << - "' viewBox='0 0 0.6 1'\n" - "refX='0.1' refY='0.5'\n" - "markerUnits='strokeWidth'\n" - "markerWidth='0.6' markerHeight='1'\n" - "orient='auto'>\n" - "<path id='marker" << segmentId << "' d='M 0 0 L 0.1 0 L 0.6 0.5 L 0.1 1 L 0 1 z' " - "fill='" << color << "' " - "/>\n" - "</marker>\n" - "</defs>\n"; - - // Add this segment to the svg. 
- const auto& segmentPairInfo = segmentPairInformationTable[v]; - const auto oldPrecision = svg.precision(1); - const auto oldFlags = svg.setf(std::ios_base::fixed, std::ios_base::floatfield); - - if(options.segmentColoring == "path") { - svg << "<g>"; - auto it = pathSegments.find(segmentId); - if(it != pathSegments.end()) { - const auto positions = it->second; - SHASTA_ASSERT(not positions.empty()); - svg << "<title>"; - for(const auto& p: positions) { - svg << p.first << " "; - } - svg << "</title>"; - } - } - - /* - svg << - "<g>" - // "<a href='exploreMode3AssemblyGraphSegment?segmentId=" << segmentId << "'>" - "<title>" - "Segment " << segmentId << - ", distance from start segment " << distance << - ", path length " << assemblyGraph.paths.size(segmentId) << - ", average marker graph edge coverage " << averageEdgeCoverage << - ", number of distinct oriented reads " << orientedReadIds.size(); - if(doSegmentPairComputations) { - svg << ", number of common oriented reads " << segmentPairInfo.commonOrientedReadCount << - " of " << referenceSegmentInfo.infos.size(); - } - */ - svg << - // "</title>" - "<path id='Segment-" << segmentId << "'" - " onmouseenter='onMouseEnterSegment(" << - segmentId << "," << - distance << "," << - assemblyGraph.markerGraphPaths.size(segmentId) << "," << - averageEdgeCoverage << "," << - assemblyGraph.clusterIds[segmentId] << "," << - segmentPairInfo.totalCount[0] << "," << - segmentPairInfo.totalCount[1] << "," << - segmentPairInfo.shortCount[0] << "," << - segmentPairInfo.shortCount[1] << "," << - segmentPairInfo.commonCount << "," << - segmentPairInfo.unexplainedCount[0] << "," << - segmentPairInfo.unexplainedCount[1] << ")'" << - " onmouseleave='onMouseExitSegment()'" << - -#if 0 - // Old code that displays the segment as a cubic spline. - // This can create artifacts when the segment is very thick. 
- "' d='M " << - p1.x() << " " << p1.y() << " C " << - q1.x() << " " << q1.y() << ", " << - q2.x() << " " << q2.y() << ", " << - p2.x() << " " << p2.y() << "'" << -#endif - - " d='M " << - p1.x() << " " << p1.y() << " L " << - p2.x() << " " << p2.y() << "'" << - " stroke='" << color << "'" - " stroke-width='" << - options.minimumSegmentThickness + averageEdgeCoverage * options.additionalSegmentThicknessPerUnitCoverage << "'" - " fill='none'" - " marker-end='url(#" << - arrowMarkerName << - ")'" - " onclick='if(event.ctrlKey) {" - "location.href=\"exploreMode3AssemblyGraphSegment?segmentId=" << segmentId << - "&showSequence=on\";}'" - "/>" - // "</a>" - // "</g>" - "\n"; - svg.precision(oldPrecision); - svg.flags(oldFlags); - if(options.segmentColoring == "path") { - svg << "</g>"; - } - } - svg << "</g>\n"; - - - - // End the svg. - svg << "</svg>\n"; -} - - - -void mode3::LocalAssemblyGraph::computeLayout( - const SvgOptions& options, - double timeout) -{ - LocalAssemblyGraph& localAssemblyGraph = *this; - - - // Create an auxiliary graph with two vertices for each segment. - using G = boost::adjacency_list<boost::vecS, boost::vecS, boost::undirectedS>; - G g; - std::map<vertex_descriptor, array<G::vertex_descriptor, 2> > vertexMap; - std::map<G::edge_descriptor, double> edgeLengthMap; - BGL_FORALL_VERTICES(v, localAssemblyGraph, LocalAssemblyGraph) { - const uint64_t segmentId = localAssemblyGraph[v].segmentId; - - const uint64_t pathLength = assemblyGraph.markerGraphPaths.size(segmentId); - const double displayLength = - options.minimumSegmentLength + - double(pathLength - 1) * options.additionalSegmentLengthPerMarker; - - // Add the auxiliary vertices. - array<G::vertex_descriptor, 2>& auxiliaryVertices = vertexMap[v]; - for(uint64_t i=0; i<2; i++) { - auxiliaryVertices[i] = boost::add_vertex(g); - } - - // Add the edge between these auxiliary vertices. 
- G::edge_descriptor e; - tie(e, ignore) = boost::add_edge(auxiliaryVertices[0], auxiliaryVertices[1], g); - edgeLengthMap.insert(make_pair(e, displayLength)); - } - - - - // Add auxiliary graph edges between vertices corresponding to different - // LocalAssemblyGraph vertices. - BGL_FORALL_EDGES(e, localAssemblyGraph, LocalAssemblyGraph) { - const vertex_descriptor v1 = source(e, localAssemblyGraph); - const vertex_descriptor v2 = target(e, localAssemblyGraph); - const LocalAssemblyGraphEdge& edge = localAssemblyGraph[e]; - const uint64_t linkId = edge.linkId; - const AssemblyGraph::Link& link = assemblyGraph.links[linkId]; - - double edgeLength; - if(link.segmentsAreAdjacent) { - edgeLength = options.minimumLinkLength; - } else { - const int32_t linkSeparation = max(link.separation, 0); - edgeLength = 3. * options.minimumLinkLength + double(linkSeparation) * options.additionalLinkLengthPerMarker; - } - G::edge_descriptor eAuxiliary; - tie(eAuxiliary, ignore) = add_edge( - vertexMap[v1].back(), - vertexMap[v2].front(), - g); - edgeLengthMap.insert(make_pair(eAuxiliary, edgeLength)); - } - - - - // Compute the layout of the auxiliary graph. - std::map<G::vertex_descriptor, array<double, 2> > positionMap; - ComputeLayoutReturnCode returnCode = ComputeLayoutReturnCode::Success; - if(options.layoutMethod == "neato") { - returnCode = shasta::computeLayoutGraphviz(g, "neato", timeout, positionMap, "", &edgeLengthMap); - } else if(options.layoutMethod == "custom") { - returnCode = shasta::computeLayoutCustom(g, edgeLengthMap, positionMap, timeout); - } else { - throw runtime_error("Invalid layout method specified: " + options.layoutMethod); - } - if(returnCode == ComputeLayoutReturnCode::Timeout) { - throw runtime_error("Graph layout took too long. 
" - "Increase the timeout or decrease the maximum distance."); - } - if(returnCode != ComputeLayoutReturnCode::Success) { - throw runtime_error("Graph layout failed."); - } - - - - // Store the layout in the vertices of the localAssemblyGraph. - BGL_FORALL_VERTICES(v, localAssemblyGraph, LocalAssemblyGraph) { - LocalAssemblyGraphVertex& vertex = localAssemblyGraph[v]; - vertex.position.clear(); - - // Locate the auxiliary vertices corresponding to this segment. - auto it = vertexMap.find(v); - SHASTA_ASSERT(it != vertexMap.end()); - const array<G::vertex_descriptor, 2>& auxiliaryVertices = it->second; - - // Loop over the auxiliary vertices. - for(const G::vertex_descriptor u: auxiliaryVertices) { - auto jt = positionMap.find(u); - SHASTA_ASSERT(jt != positionMap.end()); - const array<double, 2>& p = jt->second; - vertex.position.push_back(Point(p[0], p[1])); - } - } -} - - - -void LocalAssemblyGraph::computeSegmentTangents() -{ - LocalAssemblyGraph& localAssemblyGraph = *this; - BGL_FORALL_VERTICES(v, localAssemblyGraph, LocalAssemblyGraph) { - computeSegmentTangents(v); - } -} - - - - -void LocalAssemblyGraph::computeSegmentTangents(vertex_descriptor v0) -{ - LocalAssemblyGraph& localAssemblyGraph = *this; - LocalAssemblyGraphVertex& vertex0 = localAssemblyGraph[v0]; - SHASTA_ASSERT(vertex0.position.size() >= 2); - const Point& vertex0Start = vertex0.position.front(); - const Point& vertex0End = vertex0.position.back(); - - Point t = vertex0End; - boost::geometry::subtract_point(t, vertex0Start); - const double length = sqrt(t.x() * t.x() + t.y() * t.y()); - boost::geometry::multiply_value(t, 1. / length); - vertex0.t2 = t; - boost::geometry::multiply_value(t, -1.); - vertex0.t1 = t; - - -#if 0 - // This is used if we display segments as Bezier cubics. - - - // To compute t1, average the unit vectors of the backward links. 
- array<double, 2> direction = {0., 0.}; - uint64_t n = 0; - BGL_FORALL_INEDGES(v0, e, localAssemblyGraph, LocalAssemblyGraph) { - const vertex_descriptor v1 = source(e, localAssemblyGraph); - LocalAssemblyGraphVertex& vertex1 = localAssemblyGraph[v1]; - SHASTA_ASSERT(vertex1.position.size() >= 2); - const Point& vertex1Start = vertex1.position.front(); - - const double dx = vertex1Start.x() - vertex0End.x(); - const double dy = vertex1Start.y() - vertex0End.y(); - const double d = sqrt(dx * dx + dy * dy); - if(d == 0.) { - continue; - } - - // Accumulate the unit vector. - ++n; - direction[0] += dx / d; - direction[1] += dy / d; - } - // Compute the average,normalized direction. - double dLength = sqrt(direction[0] * direction[0] + direction[1] * direction[1]); - if(dLength == 0.) { - direction[0] = vertex0Start.x() - vertex0End.x(); - direction[1] = vertex0Start.y() - vertex0End.y(); - dLength = sqrt(direction[0] * direction[0] + direction[1] * direction[1]); - } - direction[0] /= dLength; - direction[1] /= dLength; - - vertex0.t1.x(direction[0]); - vertex0.t1.y(direction[1]); - - - - // To compute the second control point, q2, - // average the unit vectors of the forward links. - direction = {0., 0.}; - n = 0; - BGL_FORALL_OUTEDGES(v0, e, localAssemblyGraph, LocalAssemblyGraph) { - const vertex_descriptor v1 = target(e, localAssemblyGraph); - LocalAssemblyGraphVertex& vertex1 = localAssemblyGraph[v1]; - SHASTA_ASSERT(vertex1.position.size() >= 2); - const Point& vertex1Start = vertex1.position.front(); - - const double dx = vertex1Start.x() - vertex0End.x(); - const double dy = vertex1Start.y() - vertex0End.y(); - const double d = sqrt(dx * dx + dy * dy); - if(d == 0.) { - continue; - } - - // Accumulate the unit vector. - ++n; - direction[0] += dx / d; - direction[1] += dy / d; - } - // Compute the average,normalized direction. - dLength = sqrt(direction[0] * direction[0] + direction[1] * direction[1]); - if(dLength == 0.) 
{ - direction[0] = vertex0End.x() - vertex0Start.x(); - direction[1] = vertex0End.y() - vertex0Start.y(); - dLength = sqrt(direction[0] * direction[0] + direction[1] * direction[1]); - } - direction[0] /= dLength; - direction[1] /= dLength; - - vertex0.t2.x(direction[0]); - vertex0.t2.y(direction[1]); -#endif -} - - - -// Return the svg color for a segment. -string LocalAssemblyGraph::randomSegmentColor(uint64_t segmentId) -{ - const uint32_t hue = MurmurHash2(&segmentId, sizeof(segmentId), 231) % 360; - return "hsl(" + to_string(hue) + ",50%,50%)"; -} - - - -// Find out if the paths of two segments are consecutive. -bool LocalAssemblyGraph::haveConsecutivePaths( - vertex_descriptor v0, - vertex_descriptor v1 -) const -{ - const LocalAssemblyGraph& localAssemblyGraph = *this; - - const LocalAssemblyGraphVertex& vertex0 = localAssemblyGraph[v0]; - const LocalAssemblyGraphVertex& vertex1 = localAssemblyGraph[v1]; - - const uint64_t segmentId0 = vertex0.segmentId; - const uint64_t segmentId1 = vertex1.segmentId; - - const auto path0 = assemblyGraph.markerGraphPaths[segmentId0]; - const auto path1 = assemblyGraph.markerGraphPaths[segmentId1]; - - const MarkerGraphEdgeId edgeId0 = path0.back(); - const MarkerGraphEdgeId edgeId1 = path1.front(); - - const MarkerGraph::Edge& edge0 = markerGraph.edges[edgeId0]; - const MarkerGraph::Edge& edge1 = markerGraph.edges[edgeId1]; - - return edge0.target == edge1.source; -} - - - -// Return the average link separation for the Link -// described by an edge. -int32_t LocalAssemblyGraph::linkSeparation(edge_descriptor e) const -{ - const LocalAssemblyGraph& localAssemblyGraph = *this; - const uint64_t linkId = localAssemblyGraph[e].linkId; - return assemblyGraph.links[linkId].separation; -} - - - -// Construct the svg options from an html request. 
-LocalAssemblyGraph::SvgOptions::SvgOptions(const vector<string>& request) -{ - // The initial layout method if set to "custom" if - // command "customLayout" is available, "neato" otherwise. - static bool firstTime = true; - static string layoutDefaultMethod = "neato"; - if(firstTime) { - firstTime = false; - const string command = "which customLayout"; - const int returnCode = system(command.c_str()); - if(returnCode == 0) { - layoutDefaultMethod = "custom"; - } - } - layoutMethod = layoutDefaultMethod; - - HttpServer::getParameterValue(request, "sizePixels", sizePixels); - HttpServer::getParameterValue(request, "layoutMethod", layoutMethod); - - // Segment length and thickness. - HttpServer::getParameterValue(request, "minimumSegmentLength", minimumSegmentLength); - HttpServer::getParameterValue(request, "additionalSegmentLengthPerMarker", additionalSegmentLengthPerMarker); - HttpServer::getParameterValue(request, "minimumSegmentThickness", minimumSegmentThickness); - HttpServer::getParameterValue(request, "additionalSegmentThicknessPerUnitCoverage", additionalSegmentThicknessPerUnitCoverage); - - // Segment coloring - HttpServer::getParameterValue(request, "segmentColoring", segmentColoring); - HttpServer::getParameterValue(request, "segmentColor", segmentColor); - HttpServer::getParameterValue(request, "greenThreshold", greenThreshold); - HttpServer::getParameterValue(request, "referenceSegmentId", referenceSegmentId); - HttpServer::getParameterValue(request, "hashSeed", hashSeed); - HttpServer::getParameterValue(request, "pathStart", pathStart); - HttpServer::getParameterValue(request, "pathDirection", pathDirection); - - string clustersToBeColoredString; - HttpServer::getParameterValue(request, "clustersToBeColored", clustersToBeColoredString); - clustersToBeColored.clear(); - if(not clustersToBeColoredString.empty()) { - vector<string> tokens; - boost::algorithm::split(tokens, clustersToBeColoredString, boost::algorithm::is_any_of(",")); - for(const string& 
token: tokens) { - try { - const uint64_t clusterId =std::stoi(token); - clustersToBeColored.push_back(clusterId); - } catch(const std::exception&) { - // Neglect it. - } - } - } - - // Flag to turn on sequence assembly when coloring a path. - string assemblePathSequenceString; - assemblePathSequence = HttpServer::getParameterValue(request, "assemblePathSequence", assemblePathSequenceString); - - // Link length and thickness. - HttpServer::getParameterValue(request, "minimumLinkLength", minimumLinkLength); - HttpServer::getParameterValue(request, "additionalLinkLengthPerMarker", additionalLinkLengthPerMarker); - HttpServer::getParameterValue(request, "minimumLinkThickness", minimumLinkThickness); - HttpServer::getParameterValue(request, "additionalLinkThicknessPerRead", additionalLinkThicknessPerRead); -} - - - -// Add rows to the html request form. -void LocalAssemblyGraph::SvgOptions::addFormRows(ostream& html) -{ - html << - "<tr>" - "<td>Graphics size in pixels" - "<td class=centered><input type=text name=sizePixels size=8 style='text-align:center'" - " value='" << sizePixels << - "'>" - - "<tr>" - "<td>Graph layout method" - "<td class=left>" - "<input type=radio name=layoutMethod value=neato" - << (layoutMethod=="neato" ? " checked=checked" : "") << - ">Graphviz neato (slow for large graphs)<br>" - "<input type=radio name=layoutMethod value=custom" - << (layoutMethod=="custom" ? 
" checked=checked" : "") << - ">Custom (user-provided command <code>customLayout</code>)<br>" - - "<tr>" - "<td>Segments" - "<td class=centered>" - "<table>" - "<tr><td class=left>" - "Minimum display length " - "<td><input type=text name=minimumSegmentLength size=8 style='text-align:center'" - " value='" << minimumSegmentLength << "'>" - "<tr><td class=left>" - "Additional display length per marker" - "<td><input type=text name=additionalSegmentLengthPerMarker size=8 style='text-align:center'" - " value='" << additionalSegmentLengthPerMarker << "'>" - "<tr>" - "<td class=left>Minimum thickness" - "<td class=centered><input type=text name=minimumSegmentThickness size=8 style='text-align:center'" - " value='" << minimumSegmentThickness << - "'>" - "<tr>" - "<td class=left>Additional thickness per unit coverage" - "<td class=centered><input type=text name=additionalSegmentThicknessPerUnitCoverage size=8 style='text-align:center'" - " value='" << additionalSegmentThicknessPerUnitCoverage << - "'>" - - - - // Segment coloring. - "<tr>" - "<td class = left>Color" - "<td class=left>" - - // Random segment coloring. - "<input type=radio name=segmentColoring value=random" - << (segmentColoring=="random" ? " checked=checked" : "") << - ">Random<hr>" - - // Uniform segment coloring. - "<input type=radio name=segmentColoring value=uniform" - << (segmentColoring=="uniform" ? " checked=checked" : "") << - ">" - "<input type=text name=segmentColor size=8 style='text-align:center'" - " value='" << segmentColor << "'>" - "<hr>" - - // Segment coloring by Jaccard similarity with the reference segment. - "<input type=radio name=segmentColoring value=byJaccard" - << (segmentColoring=="byJaccard" ? " checked=checked" : "") << - ">By Jaccard similarity with reference segment, without counting short reads" - "<br>" - - // Segment coloring by raw Jaccard similarity with the reference segment. 
- "<input type=radio name=segmentColoring value=byRawJaccard" - << (segmentColoring=="byRawJaccard" ? " checked=checked" : "") << - ">By raw Jaccard similarity with reference segment (no special treatment of short reads)" - "<br>" - - // Segment coloring by number of common reads with the reference segment. - "<input type=radio name=segmentColoring value=byCommonReads" - << (segmentColoring=="byCommonReads" ? " checked=checked" : "") << - ">By number of common supporting oriented reads with reference segment" - "<div style='text-indent:3em'>" - "Green if at least " - "<input type=text name=greenThreshold size=4 style='text-align:center'" - " value='" << greenThreshold << - "'>" " common reads (0 = automatic)" - "</div>" - - // Segment coloring by unexplained fraction on the reference segment. - "<input type=radio name=segmentColoring value=byUnexplainedFractionOnReferenceSegment" - << (segmentColoring=="byUnexplainedFractionOnReferenceSegment" ? " checked=checked" : "") << - ">By unexplained fraction on the reference segment" - "<br>" - - // Segment coloring by unexplained fraction on the displayed segment. - "<input type=radio name=segmentColoring value=byUnexplainedFractionOnDisplayedSegment" - << (segmentColoring=="byUnexplainedFractionOnDisplayedSegment" ? " checked=checked" : "") << - ">By unexplained fraction on the displayed segment" - "<br>" - - "Reference segment <input type=text name=referenceSegmentId size=8 style='text-align:center'" - " value='" << referenceSegmentId << "'><hr>" - - // Segment coloring by cluster id. - "<input type=radio name=segmentColoring value=byCluster" - << (segmentColoring=="byCluster" ? 
" checked=checked" : "") << - ">By cluster" - "<br>" - "Hash seed <input type=text name=hashSeed size=8 style='text-align:center'" - " value='" << hashSeed << "'><br>" - "Only color clusters <input type=text name=clustersToBeColored size=8 style='text-align:center'" - " value='"; - for(const uint64_t clusterId: clustersToBeColored) { - html << clusterId << ","; - } - html << "'><hr>" - - // Segment coloring by local cluster - // (computed by analyzeSubgraph using as input only the segments at - // distance less than maxDistance). - "<input type=radio name=segmentColoring value=byLocalCluster" - << (segmentColoring=="byLocalCluster" ? " checked=checked" : "") << - ">By local cluster" - "<br>"; - - // Segment coloring using a path. - html << - "<hr>" - "<input type=radio name=segmentColoring value=path" - << (segmentColoring=="path" ? " checked=checked" : "") << - ">Color an assembly path" - "<br>" - "Start the path at segment <input type=text name=pathStart size=8 style='text-align:center'" - " value='" << pathStart << "'>" - "<br><input type=radio name=pathDirection value=forward" << - (pathDirection=="forward" ? " checked=checked" : "") << "> Forward" - "<br><input type=radio name=pathDirection value=backward" << - (pathDirection=="backward" ? " checked=checked" : "") << "> Backward" - "<br><input type=radio name=pathDirection value=bidirectional" << - (pathDirection=="bidirectional" ? " checked=checked" : "") << "> Both directions" << - "<br><input type=checkbox name=assemblePathSequence" << - (assemblePathSequence ? 
" checked=checked" : "") << - "> Assemble path sequence."; - - - html << "</table>" - - - - "<tr>" - "<td>Links" - "<td class=centered>" - "<table>" - "<tr><td class=left>" - "Minimum display length " - "<td><input type=text name=minimumLinkLength size=8 style='text-align:center'" - " value='" << minimumLinkLength << "'>" - "<tr><td class=left>" - "Additional display length per marker" - "<td><input type=text name=additionalLinkLengthPerMarker size=8 style='text-align:center'" - " value='" << additionalLinkLengthPerMarker << "'>" - "<tr>" - "<td class=left>Minimum thickness" - "<td class=centered><input type=text name=minimumLinkThickness size=8 style='text-align:center'" - " value='" << minimumLinkThickness << - "'>" - "<tr>" - "<td class=left>Additional thickness per read" - "<td class=centered><input type=text name=additionalLinkThicknessPerRead size=8 style='text-align:center'" - " value='" << additionalLinkThicknessPerRead << - "'>" - "</table>" - - "</table>" - - - ; - -} - - - -// Return true if there were no changes in the options -// that affect graph layout changed, compared to another -// SvgOptions object. -bool LocalAssemblyGraph::SvgOptions::hasSameLayoutOptions(const SvgOptions& that) const -{ - return - (layoutMethod == that.layoutMethod) and - (minimumSegmentLength == that.minimumSegmentLength) and - (additionalSegmentLengthPerMarker == that.additionalSegmentLengthPerMarker) and - (minimumLinkLength == that.minimumLinkLength) and - (additionalLinkLengthPerMarker == that.additionalLinkLengthPerMarker) - ; -} - - - -// Write the local assembly graph in gfa format. -void LocalAssemblyGraph::writeGfa(const string& fileName) const -{ - ofstream gfa(fileName); - writeGfa(gfa); -} -void LocalAssemblyGraph::writeGfa(ostream& gfa) const -{ - const LocalAssemblyGraph& localAssemblyGraph = *this; - - // Write the header. - gfa << "H\tVN:Z:1.0\n"; - - // Write the segments. 
- BGL_FORALL_VERTICES(v, localAssemblyGraph, LocalAssemblyGraph) { - const uint64_t segmentId = localAssemblyGraph[v].segmentId; - const auto path = assemblyGraph.markerGraphPaths[segmentId]; - gfa << - "S\t" << segmentId << "\t" << - "*\tLN:i:" << path.size() << "\n"; - } - - - // Write the links. - BGL_FORALL_EDGES(e, localAssemblyGraph, LocalAssemblyGraph) { - const uint64_t linkId = localAssemblyGraph[e].linkId; - const mode3::AssemblyGraph::Link& link = assemblyGraph.links[linkId]; - gfa << "L\t" << - link.segmentId0 << "\t+\t" << - link.segmentId1 << "\t+\t0M\n"; - } - -} - diff --git a/src/mode3-LocalAssemblyGraph.hpp b/src/mode3-LocalAssemblyGraph.hpp deleted file mode 100644 index a42cc82..0000000 --- a/src/mode3-LocalAssemblyGraph.hpp +++ /dev/null @@ -1,186 +0,0 @@ -#ifndef SHASTA_MODE3_LOCAL_ASSEMBLY_GRAPH_HPP -#define SHASTA_MODE3_LOCAL_ASSEMBLY_GRAPH_HPP - -// Shasta. -#include "mode3.hpp" - -// Boost libraries. -#include <boost/geometry/geometries/point_xy.hpp> -#include <boost/geometry/algorithms/distance.hpp> -#include <boost/geometry/arithmetic/arithmetic.hpp> -#include <boost/graph/adjacency_list.hpp> - - - -namespace shasta { - namespace mode3 { - - class LocalAssemblyGraph; - class LocalAssemblyGraphEdge; - class LocalAssemblyGraphVertex; - - using Point = boost::geometry::model::d2::point_xy<double>; - } - -} - - -// Classes used to display in the http server a local portion of the AssemblyGraph. -class shasta::mode3::LocalAssemblyGraphVertex { -public: - uint64_t segmentId; - uint64_t distance; // From the start vertex. - LocalAssemblyGraphVertex( - uint64_t segmentId, - uint64_t distance); - LocalAssemblyGraphVertex(); - - // The positions of the auxiliary graph vertices corresponding - // to this segment. - vector<Point> position; - - // Unit vectors for the outward pointing tangents at the two ends of the segment. - // The are computed as averages of the directions of the - // incoming/outgoing links. 
- // They are used to display the segment as a cubic spline. - Point t1; - Point t2; -}; - - - -class shasta::mode3::LocalAssemblyGraphEdge { -public: - uint64_t linkId; - LocalAssemblyGraphEdge(uint64_t linkId=0) : - linkId(linkId) - {} -}; - - - -class shasta::mode3::LocalAssemblyGraph : - public boost::adjacency_list<boost::listS, boost::listS, boost::bidirectionalS, - LocalAssemblyGraphVertex, LocalAssemblyGraphEdge> { -public: - - LocalAssemblyGraph( - const MarkerGraph&, - const AssemblyGraph&, - uint64_t startSegmentId, - uint64_t maxDistance); - - const MarkerGraph& markerGraph; - const AssemblyGraph& assemblyGraph; - uint64_t maxDistance; - - vertex_descriptor addVertex( - uint64_t segmentId, - uint64_t distance); - - - - class SvgOptions { - public: - - double sizePixels = 600.; - string layoutMethod; - - - - // Segment length and thickness. - - // The display length of a segment is computed as - // minimumSegmentLength + (n-1) * additionalSegmentLengthPerMarker - // where n is the path length of the segment, in markers. - double minimumSegmentLength = 1.; - double additionalSegmentLengthPerMarker = 0.2; - - // The thickness of a segment is computed as - // minimumSegmentThickness + coverage * additionalSegmentThicknessPerUnitCoverage - // where coverage is average marker graph edge coverage on the segment path. - double minimumSegmentThickness = 0.3; - double additionalSegmentThicknessPerUnitCoverage = 0.005; - - // Segment coloring - string segmentColoring = "random"; - string segmentColor = "Green"; // Only used if segmentColoring is "uniform" - uint64_t greenThreshold = 0; // Minimum number of common reads to color green (0=automatic). 
- uint64_t referenceSegmentId = 0;// Only used if segmentColoring is "byCommonReads" - uint64_t hashSeed = 0; // Only used if segmentCooring is "byClusterId" - uint64_t pathStart = 0; // Only used is segmentColoring is "path" - string pathDirection = "forward"; // Only used is segmentColoring is "path" - - // Clusters to be colored, if coloring by cluster id. - // If empty, all clusters are colored. - vector<uint64_t> clustersToBeColored; - - // Flag to turn on sequence assembly when coloring a path. - bool assemblePathSequence = false; - - // Link length and thickness. - - // The display length of a link is computed as follows: - // - For a link between segments that are consecutive in the marker graph: - // linkLength = minimumLinkLength - // - For a link between segments that are not consecutive in the marker graph: - // linkLength = 3 * minimumLinkLength + linkSeparation * additionalLinkLengthPerMarker - // (with the linkSeperation replaced with zero if it is negative). - double minimumLinkLength = 1; - double additionalLinkLengthPerMarker = 0.2; - - // The display thickness of a link is computed as - // minimumLinkThickness + (n-1) * additionalSegmentLengthPerMarker - // where n is the path length of the segment, in markers. - double minimumLinkThickness = 0.05; - double additionalLinkThicknessPerRead = 0.005; - - - - // Colors. - string segmentAtMaxDistanceColor = "LightGray"; - string linkColor = "Black"; - - // Construct the options from an html request. - SvgOptions(const vector<string>& request); - - // Add rows to the html request form. - void addFormRows(ostream& html); - - // Return true if there were no changes in the options - // that affect graph layout changed, compared to another - // SvgOptions object. 
- bool hasSameLayoutOptions(const SvgOptions& that) const; - }; - void writeHtml(ostream& html, const SvgOptions&) const; - void writeSvg( - const string& fileName, - const SvgOptions&, - vector<mode3::AssemblyGraph::AnalyzeSubgraphClasses::Cluster>&) const; - void writeSvg( - ostream&, - const SvgOptions&, - vector<mode3::AssemblyGraph::AnalyzeSubgraphClasses::Cluster>&) const; - void computeLayout(const SvgOptions&, double timeout); - void computeSegmentTangents(); - void computeSegmentTangents(vertex_descriptor); - - // Return the random svg color for a segment. - static string randomSegmentColor(uint64_t segmentId); - - - - bool haveConsecutivePaths( - vertex_descriptor v1, - vertex_descriptor v2) const; - - // Return the average link separation for the Link - // described by an edge. - int32_t linkSeparation(edge_descriptor) const; - - // Write the local assembly graph in gfa format. - void writeGfa(const string& fileName) const; - void writeGfa(ostream&) const; -}; -#endif - diff --git a/src/mode3-PathGraph.cpp b/src/mode3-PathGraph.cpp deleted file mode 100644 index 200f44b..0000000 --- a/src/mode3-PathGraph.cpp +++ /dev/null @@ -1,1393 +0,0 @@ -// Shasta. -#include "mode3-PathGraph.hpp" -#include "findLinearChains.hpp" -#include "MurmurHash2.hpp" -#include "orderPairs.hpp" -#include "transitiveReduction.hpp" -using namespace shasta; -using namespace mode3; - -// Boost libraries. -#include <boost/graph/iteration_macros.hpp> -#include <boost/graph/strong_components.hpp> -#include <boost/icl/interval_set.hpp> - -// Standard library. -#include <bitset> -#include "fstream.hpp" -#include "iostream.hpp" -#include <queue> -#include <stack> - -#include "MultithreadedObject.tpp" -template class MultithreadedObject<mode3::PathGraph>; - - - -// Create the PathGraph from the AssemblyGraph. -// Start with a single segment for each vertex -// (that is, paths of length 1). 
-PathGraph::PathGraph(const AssemblyGraph& assemblyGraph) : - MultithreadedObject<PathGraph>(*this), - assemblyGraph(assemblyGraph) -{ - // HARDWIRED CONSTANTS TO BE EXPOSED WHEN CODE STABILIZES. - const uint64_t minCoverage = 3; - const uint64_t partitionMaxDistance = 10; - const uint64_t minSubgraphSize = 8; - - // Create initial vertices from the AssemblyGraph. - PathGraph& pathGraph = *this; - createVertices(); - - // Detangle iteration. - // At the beginning of each iteration we only have vertices. - for(uint64_t iteration=0; iteration<6; iteration++) { - - createEdges(minCoverage); - cout << "The path graph at iteration " << iteration << " has " << num_vertices(pathGraph) << - " vertices and " << num_edges(pathGraph) << " edges." << endl; - - // Compute oriented read journeys. - computeJourneys(); - // writeJourneys("PathGraphJourneys.csv"); - - // Partition the PathGraph into subgraphs. - partition(partitionMaxDistance, minSubgraphSize); - writeGfa("PathGraph-" + to_string(iteration)); - writeCsvDetailed("PathGraphDetailed-" + to_string(iteration) + ".csv"); - - // Interactive local detangling, without modifying the PathGraph. - // Turn this on for debugging. - while(false) { - int64_t subgraphId; - cout << "Enter a subgraph to detangle interactively, -1 to quit, or -2 to continue with detangle:" << endl; - cin >> subgraphId; - if(not cin) { - return; - } - if(subgraphId == -1) { - return; - } - if(subgraphId == -2) { - break; - } - vector<PathGraphVertex> newVertices; - detangleSubgraph(uint64_t(subgraphId), newVertices, true); - cout << "Detangling subgraph " << subgraphId << - " generated " << newVertices.size() << " new vertices." << endl; - } - - // Detangle. - vector<PathGraphVertex> newVertices; - detangle(newVertices); - - // Recreate the vertices. - clear(); - createVertices(newVertices); - } -} - - - -// Initial creation of the vertices. -// Start with a single segment for each vertex -// (that is, paths of length 1). 
-void PathGraph::createVertices() { - - PathGraph& pathGraph = *this; - - - // Create a vertex for each segment in the AssemblyGraph. - for(uint64_t segmentId=0; segmentId<assemblyGraph.markerGraphPaths.size(); segmentId++) { - - // Create the vertex. - const vertex_descriptor v = add_vertex(pathGraph); - PathGraphVertex& vertex = pathGraph[v]; - vertex.id = nextVertexId++; - - // Store the path. - vertex.path.push_back(segmentId); - - // Store the AssemblyGraphJourneyInterval's. - const span<const pair<OrientedReadId, uint64_t> > journeyInfos = - assemblyGraph.assemblyGraphJourneyInfos[segmentId]; - for(const pair<OrientedReadId, uint64_t>& p: journeyInfos) { - const OrientedReadId orientedReadId = p.first; - const uint64_t position = p.second; - AssemblyGraphJourneyInterval interval; - interval.orientedReadId = orientedReadId; - interval.first = position; - interval.last = position; - vertex.journeyIntervals.push_back( - make_pair(interval, std::numeric_limits<uint64_t>::max())); - } - } - -} - - -// Creation of vertices after a detangle iteration. -void PathGraph::createVertices(const vector<PathGraphVertex>& newVertices) -{ - PathGraph& pathGraph = *this; - - nextVertexId = 0; - for(const PathGraphVertex& newVertex: newVertices) { - const vertex_descriptor v = boost::add_vertex(newVertex, pathGraph); - PathGraphVertex& vertex = pathGraph[v]; - vertex.id = nextVertexId++; - } -} - - - -// Recreate all edges from scratch, using only the -// information stored in the vertices. -void PathGraph::createEdges(uint64_t minCoverage) -{ - PathGraph& pathGraph = *this; - - // Gather AssemblyGraphJourneyInterval's for all oriented reads. 
- vector< vector<pair<AssemblyGraphJourneyInterval, vertex_descriptor> > > - journeyIntervals(2 * assemblyGraph.readCount()); - BGL_FORALL_VERTICES(v, pathGraph, PathGraph) { - for(const auto& p: pathGraph[v].journeyIntervals) { - const AssemblyGraphJourneyInterval& interval = p.first; - journeyIntervals[interval.orientedReadId.getValue()].push_back( - make_pair(interval, v)); - } - } - for(auto& v: journeyIntervals) { - sort(v.begin(), v.end(), - OrderPairsByFirstOnly<AssemblyGraphJourneyInterval, vertex_descriptor>()); - } - - // Create the edges. - for(const auto& orientedReadJourneyIntervals: journeyIntervals) { - - for(uint64_t i=1; i<orientedReadJourneyIntervals.size(); i++) { - const vertex_descriptor v0 = orientedReadJourneyIntervals[i-1].second; - const vertex_descriptor v1 = orientedReadJourneyIntervals[i ].second; - - if(v0 != v1) { - edge_descriptor e; - bool edgeExists = false; - tie(e, edgeExists) = edge(v0, v1, pathGraph); - if(not edgeExists) { - tie(e, edgeExists) = add_edge(v0, v1, pathGraph); - SHASTA_ASSERT(edgeExists); - } - ++pathGraph[e].coverage; - } - } - } - - - - // Remove the low coverage edges. - vector<edge_descriptor> edgesToBeRemoved; - BGL_FORALL_EDGES(e, pathGraph, PathGraph) { - if(pathGraph[e].coverage < minCoverage) { - edgesToBeRemoved.push_back(e); - } - } - for(const edge_descriptor e: edgesToBeRemoved) { - boost::remove_edge(e, pathGraph); - } -} - - - -// Compute the journeys of all oriented reads in the PathGraph. -// The journey of an oriented read in the PathGraph is -// a sequence of vertex descriptors which is not necessarily a path. -// Indexed by OrientedReadId::getValue(); -void PathGraph::computeJourneys() -{ - PathGraph& pathGraph = *this; - const ReadId readCount = ReadId(assemblyGraph.readCount()); - - // First create, for each oriented read, a vector - // of pairs (AssemblyGraphJourneyInterval, vertex_descriptor). 
- vector< vector< pair<AssemblyGraphJourneyInterval, vertex_descriptor> > > - journeyTable(2 * readCount); - BGL_FORALL_VERTICES(v, pathGraph, PathGraph) { - for(const auto& p: pathGraph[v].journeyIntervals) { - const AssemblyGraphJourneyInterval& journeyInterval = p.first; - journeyTable[journeyInterval.orientedReadId.getValue()].push_back(make_pair(journeyInterval, v)); - } - } - - // Sort them and sanity check. - for(vector< pair<AssemblyGraphJourneyInterval, vertex_descriptor> >& v: journeyTable) { - sort(v.begin(), v.end()); - - // Sanity check. - if(v.size() > 1) { - for(uint64_t i=1; i<v.size(); i++) { - const AssemblyGraphJourneyInterval& previous = v[i-1].first; - const AssemblyGraphJourneyInterval& current = v[i].first; - SHASTA_ASSERT(previous.last < current.first); - } - } - } - - - // Store what we got. - BGL_FORALL_VERTICES(v, pathGraph, PathGraph) { - pathGraph[v].journeyIntervals.clear(); - } - journeys.clear(); - journeys.resize(2 * readCount); - for(ReadId readId=0; readId<readCount; readId++) { - for(Strand strand=0; strand<2; strand++) { - const OrientedReadId orientedReadId(readId, strand); - const uint64_t index = orientedReadId.getValue(); - for(uint64_t position=0; position<journeyTable[index].size(); position++) { - const auto& p = journeyTable[index][position]; - const AssemblyGraphJourneyInterval& interval = p.first; - const vertex_descriptor v = p.second; - journeys[index].push_back(v); - pathGraph[v].journeyIntervals.push_back(make_pair(interval, position)); - } - } - } -} - - - -void PathGraph::writeJourneys(const string& fileName) const -{ - const PathGraph& pathGraph = *this; - ofstream csv(fileName); - - // Loop over all oriented reads. 
- const ReadId readCount = ReadId(assemblyGraph.readCount()); - for(ReadId readId=0; readId<readCount; readId++) { - for(Strand strand=0; strand<2; strand++) { - const OrientedReadId orientedReadId(readId, strand); - csv << orientedReadId << ","; - - // Write the journey of this oriented read in the PathGraph. - const auto journey = journeys[orientedReadId.getValue()]; - for(const vertex_descriptor v: journey) { - csv << pathGraph[v].id << ","; - } - csv << "\n"; - } - } -} - - - -// Partition the PathGraph into subgraphs. -void PathGraph::partition( - uint64_t maxDistance, - uint64_t minSubgraphSize) -{ - PathGraph& pathGraph = *this; - - // Mark all vertices as not assigned to any partition. - BGL_FORALL_VERTICES(v, pathGraph, PathGraph) { - pathGraph[v].subgraphId = noSubgraph; - } - - // Start at all vertices with zero in-degree, - // plus the boundary vertices we find that way. - vector<vertex_descriptor> boundaryVertices; - std::stack<vertex_descriptor> s; - BGL_FORALL_VERTICES(v, pathGraph, PathGraph) { - if(in_degree(v, pathGraph) == 0) { - s.push(v); - } - } - uint64_t subgraphId = 0; - while(not s.empty()) { - const vertex_descriptor v = s.top(); - s.pop(); - - if(pathGraph[v].subgraphId == noSubgraph) { - partitionIteration(v, maxDistance, subgraphId++, boundaryVertices); - for(const vertex_descriptor v: boundaryVertices) { - s.push(v); - } - } - } - - - - // In exceptional cases, the above procedure might not assign all - // vertices to a subgraph. - // This code takes care of that. - BGL_FORALL_VERTICES(v, pathGraph, PathGraph) { - if(pathGraph[v].subgraphId == noSubgraph) { - partitionIteration(v, maxDistance, subgraphId++, boundaryVertices); - } - } - - - - // Combine small subgraphs with adjacent subgraphs, if possible. - // This can leave subgraphs with size 0, but we don't worry about that. - while(true) { - - // Gather the subgraphs based on the current settings of - // the vertices subgraphId. 
- gatherSubgraphs(); - - // Find the small subgraphs. - std::set<uint64_t> smallSubgraphs; - for(uint64_t subgraphId=0; subgraphId<subgraphs.size(); subgraphId++) { - const vector<vertex_descriptor>& subgraph = subgraphs[subgraphId]; - const uint64_t subgraphSize = subgraph.size(); - if((subgraphSize != 0) and (subgraph.size() < minSubgraphSize)) { - smallSubgraphs.insert(subgraphId); - } - } - - - - // Try and merge small subgraphs with adjacent subgraphs. - - // Loop over small subgraphs. - bool changesWereMade = false; - for(uint64_t subgraphId0: smallSubgraphs) { - const vector<vertex_descriptor>& subgraph0 = subgraphs[subgraphId0]; - const uint64_t subgraph0Size = subgraph0.size(); - SHASTA_ASSERT(subgraph0Size < minSubgraphSize); - - // Find adjacent subgraphs and their sizes. - vector< pair<uint64_t, uint64_t> > adjacentSubgraphsTable; // (size, subgraphId) of adjacent. - for(const vertex_descriptor v0: subgraph0) { - BGL_FORALL_OUTEDGES(v0, e, pathGraph, PathGraph) { - const vertex_descriptor v1 = target(e, pathGraph); - const uint64_t subgraphId1 = pathGraph[v1].subgraphId; - if(subgraphId1 != subgraphId0){ - adjacentSubgraphsTable.push_back(make_pair(subgraphs[subgraphId1].size(), subgraphId1)); - } - } - BGL_FORALL_INEDGES(v0, e, pathGraph, PathGraph) { - const vertex_descriptor v1 = source(e, pathGraph); - const uint64_t subgraphId1 = pathGraph[v1].subgraphId; - if(subgraphId1 != subgraphId0){ - adjacentSubgraphsTable.push_back(make_pair(subgraphs[subgraphId1].size(), subgraphId1)); - } - } - } - if(adjacentSubgraphsTable.empty()) { - continue; - } - sort(adjacentSubgraphsTable.begin(), adjacentSubgraphsTable.end()); - - // Merge it with the smallest adjacent subgraph. 
- const uint64_t subgraphId1 = adjacentSubgraphsTable.front().second; - smallSubgraphs.erase(subgraphId1); - for(const vertex_descriptor v0: subgraph0) { - pathGraph[v0].subgraphId = subgraphId1; - } - changesWereMade = true; - } - - if(not changesWereMade) { - break; - } - } - - - // Sort the vertex descriptors in each subgraph. - for(vector<vertex_descriptor>& subgraph: subgraphs) { - sort(subgraph.begin(), subgraph.end(), PathGraphOrderVerticesById(pathGraph)); - } - - - - // Subgraph statistics. - cout << "Partitioned the path graph into " << subgraphs.size() << " subgraphs." << endl; - histogramSubgraphs(); - - // Count the edges across subgraphs. - uint64_t crossEdgeCount = 0; - BGL_FORALL_EDGES(e, pathGraph, PathGraph) { - const vertex_descriptor v0 = source(e, pathGraph); - const vertex_descriptor v1 = target(e, pathGraph); - if(pathGraph[v0].subgraphId != pathGraph[v1].subgraphId) { - ++crossEdgeCount; - } - } - cout << "Number of edges across subgraphs is " << crossEdgeCount << endl; -} - - - -// A partition iteration does a single BFS starting at v. -// It moves forward from v, avoiding vertices already -// assigned to a subgraph, and up to maxDistance from v. -// It also returns the boundaryVertices, that is the -// vertices found in the process that are at distance maxDistance+1 -// from v and are not yet assigned to a subgraph. -// These can then used as starting points new partition iterations. -void PathGraph::partitionIteration( - vertex_descriptor v, - uint64_t maxDistance, - uint64_t subgraphId, - vector<vertex_descriptor>& boundaryVertices) -{ - PathGraph& pathGraph = *this; - - boundaryVertices.clear(); - - // Initialize the BFS. - std::queue<vertex_descriptor> q; - q.push(v); - PathGraphVertex& vertex = pathGraph[v]; - SHASTA_ASSERT(vertex.subgraphId == noSubgraph); - vertex.subgraphId = subgraphId; - vertex.distance = 0; - - // BFS loop. 
- while(not q.empty()) { - const vertex_descriptor v0 = q.front(); - q.pop(); - - const uint64_t distance0 = pathGraph[v0].distance; - const uint64_t distance1 = distance0 + 1; - SHASTA_ASSERT(distance0 <= maxDistance); - - // Loop over edges starting at v0. - BGL_FORALL_OUTEDGES(v0, e01, pathGraph, PathGraph) { - const vertex_descriptor v1 = target(e01, pathGraph); - PathGraphVertex& vertex1 = pathGraph[v1]; - - // If v1 is already in a subgraph, skip it. - if(vertex1.subgraphId != noSubgraph) { - continue; - } - - // Assign v1 to this subgraph, if it is within maxDistance. - if(distance1 <= maxDistance) { - vertex1.subgraphId = subgraphId; - vertex1.distance = distance1; - } - - // Queue it or add it to the boundary vertices. - if(distance1 <= maxDistance) { - q.push(v1); - } else { - SHASTA_ASSERT(distance1 == maxDistance + 1); - boundaryVertices.push_back(v1); - } - - } - - } -} - - - -// Gather subgraphs using the subgraphId stored in each vertex. -void PathGraph::gatherSubgraphs() -{ - PathGraph& pathGraph = *this; - - subgraphs.clear(); - BGL_FORALL_VERTICES(v, pathGraph, PathGraph) { - const uint64_t subgraphId = pathGraph[v].subgraphId; - SHASTA_ASSERT(subgraphId != noSubgraph); - - if(subgraphId >= subgraphs.size()) { - subgraphs.resize(subgraphId + 1); - } - - subgraphs[subgraphId].push_back(v); - } -} - - - -void PathGraph::histogramSubgraphs() -{ - vector<uint64_t> histogram; - for(const vector<vertex_descriptor>& subgraph: subgraphs) { - const uint64_t subgraphSize = subgraph.size(); - if(subgraphSize >= histogram.size()) { - histogram.resize(subgraphSize + 1, 0); - } - ++histogram[subgraphSize]; - } - - ofstream csv("PathGraphSubgraphHistogram.csv"); - csv << "Size,Frequency,Vertices\n"; - for(uint64_t subgraphSize=0; subgraphSize<histogram.size(); subgraphSize++) { - const uint64_t frequency = histogram[subgraphSize]; - csv << subgraphSize << ","; - csv << frequency << ","; - csv << subgraphSize*frequency << "\n"; - } -} - - - - -void 
PathGraph::writeGfa(const string& baseName) const -{ - const PathGraph& pathGraph = *this; - - // Open the gfa and write the header. - ofstream gfa(baseName + ".gfa"); - gfa << "H\tVN:Z:1.0\n"; - - // Open the csv and write the header. - ofstream csv(baseName + ".csv"); - csv << "PathGraph-VertexId,Color,SubgraphId\n"; - - // Write each vertex as a segment in the gfa. - // Note these segments are different from assembly graph segments: - // here each segment represents a vertex of the path graph. - BGL_FORALL_VERTICES(v, pathGraph, PathGraph) { - gfa << - "S\t" << - pathGraph[v].id << "\t" // Segment name - "*" // Segment length - "\n"; - - - // Color based on the subgraphId. - const uint64_t subgraphId = pathGraph[v].subgraphId; - string color = "LightGrey"; - if(subgraphId != noSubgraph) { - const uint64_t r = MurmurHash2(&subgraphId, sizeof(subgraphId), 231) &255; - const uint64_t g = MurmurHash2(&subgraphId, sizeof(subgraphId), 233) &255; - const uint64_t b = MurmurHash2(&subgraphId, sizeof(subgraphId), 235) &255; - - std::ostringstream s; - s.fill('0'); - s << "#"; - s << hex << std::setw(2) << r; - s << hex << std::setw(2) << g; - s << hex << std::setw(2) << b; - color = s.str(); - } - - csv << pathGraph[v].id << "," << color << "," << subgraphId << "\n"; - - } - - // Write each edge as a link. - BGL_FORALL_EDGES(e, pathGraph, PathGraph) { - const vertex_descriptor v0 = source(e, pathGraph); - const vertex_descriptor v1 = target(e, pathGraph); - gfa << - "L\t" << - pathGraph[v0].id << "\t+\t" << - pathGraph[v1].id << "\t+\t0M\n"; - } - -} - - - -void PathGraph::writeCsvDetailed(const string& fileName) const -{ - const PathGraph& pathGraph = *this; - ofstream csv(fileName); - csv << "PathGraph-VertexId,SubgraphId,SegmentId\n"; - - // Loop over vertices of the PathGraph. - BGL_FORALL_VERTICES(v, pathGraph, PathGraph) { - const PathGraphVertex& vertex = pathGraph[v]; - - // Write the AssemblyGraph path corresponding to this vertex. 
- for(const uint64_t segmentId: vertex.path) { - csv << vertex.id << ","; - if(vertex.subgraphId != invalid<uint64_t>) { - csv << vertex.subgraphId; - } - csv << ","; - csv << segmentId << "\n"; - } - } -} - - - -// Detangling of a subgraph. -// Returns new vertices for the next detangle iteration. -// The new vertices can only be used in a new PathGraph -// created from scratch. -// Only the path and journeyIntervals are filled in. -void PathGraph::detangleSubgraph( - uint64_t subgraphId, - vector<PathGraphVertex>& newVertices, - bool debug -) const -{ - const vector<vertex_descriptor>& subgraph = subgraphs[subgraphId]; - - if(subgraph.empty()) { - newVertices.clear(); - if(debug) { - cout << "The subgraph to be detangled is empty." << endl; - } - return; - } - - // Call the templated function appropriate for the - // size of this subgraph. This way we use the shortest possible - // bitmap (with size multiple of 64). - if(subgraph.size() <= 64) { - detangleSubgraphTemplate<64>(subgraphId, newVertices, debug); - } else if(subgraph.size() <= 128) { - detangleSubgraphTemplate<128>(subgraphId, newVertices, debug); - } else if(subgraph.size() <= 192) { - detangleSubgraphTemplate<192>(subgraphId, newVertices, debug); - } else if(subgraph.size() <= 256) { - detangleSubgraphTemplate<256>(subgraphId, newVertices, debug); - } else if(subgraph.size() <= 320) { - detangleSubgraphTemplate<320>(subgraphId, newVertices, debug); - } else if(subgraph.size() <= 384) { - detangleSubgraphTemplate<384>(subgraphId, newVertices, debug); - } else if(subgraph.size() <= 448) { - detangleSubgraphTemplate<448>(subgraphId, newVertices, debug); - } else if(subgraph.size() <= 512) { - detangleSubgraphTemplate<512>(subgraphId, newVertices, debug); - } else { - SHASTA_ASSERT(0); - } -} - - -// This code is similar to mode3::AssemblyGraph::analyzeSubgraphTemplate -// but it operates on a subgraph of the PathGraph, not of the AssemblyGraph. 
-template<uint64_t N> void PathGraph::detangleSubgraphTemplate( - uint64_t subgraphId, - vector<PathGraphVertex>& newVertices, - bool debug -) const -{ - // EXPOSE WHEN CODE STABILIZES. - const double fractionThreshold = 0.05; - const uint64_t minVertexCoverage = 6; - const uint64_t minClusterCoverage = 6; - - const PathGraph& pathGraph = *this; - const vector<vertex_descriptor>& subgraph = subgraphs[subgraphId]; - - // The bitmap type used to store which vertices are visited - // by each journey snippet. - using BitVector = std::bitset<N>; - SHASTA_ASSERT(subgraph.size() <= N); - - if(debug) { - cout << "Detangling a PathGraph subgraph consisting of the following " << - subgraph.size() << " vertices:" << endl; - for(const vertex_descriptor v: subgraph) { - cout << pathGraph[v].id << " "; - } - cout << endl; - } - - // Sanity check: we expect the vertices in the subgraph to be sorted by vertex id. - SHASTA_ASSERT(std::is_sorted(subgraph.begin(), subgraph.end(), - PathGraphOrderVerticesById(pathGraph))); - - // For vertices in the subgraph, gather triplets - // (orientedReadId, position in path graph journey, vertex_descriptor). - using Triplet = tuple<OrientedReadId, uint64_t, vertex_descriptor>; - vector<Triplet> triplets; - for(const vertex_descriptor v: subgraph) { - const PathGraphVertex& vertex = pathGraph[v]; - - // Loop over oriented reads that visit this vertex. - for(const pair<AssemblyGraphJourneyInterval, uint64_t>& p: vertex.journeyIntervals) { - const AssemblyGraphJourneyInterval& assemblyGraphJourneyInterval = p.first; - const uint64_t position = p.second; - const OrientedReadId orientedReadId = assemblyGraphJourneyInterval.orientedReadId; - triplets.push_back(Triplet(orientedReadId, position, v)); - } - } - sort(triplets.begin(), triplets.end()); - - // Write the triplets. 
- if(debug) { - ofstream csv("Triplets.csv"); - for(const Triplet& triplet: triplets) { - csv << get<0>(triplet) << ","; - csv << get<1>(triplet) << ","; - csv << pathGraph[get<2>(triplet)].id << "\n"; - } - } - - - - // Find streaks for the same OrientedReadId where the position - // increases by 1 each time. - // Each streak generates a PathGraphJourneySnippet. - vector<PathGraphJourneySnippet> snippets; - for(uint64_t i=0; i<triplets.size(); /* Increment later */) { - const OrientedReadId orientedReadId = get<0>(triplets[i]); - - // Find this streak. - uint64_t streakBegin = i; - uint64_t streakEnd = streakBegin + 1; - for(; streakEnd<triplets.size(); streakEnd++) { - if(get<0>(triplets[streakEnd]) != orientedReadId) { - break; - } - if(get<1>(triplets[streakEnd]) != get<1>(triplets[streakEnd-1]) + 1) { - break; - } - } - - // Add a snippet. - PathGraphJourneySnippet snippet; - snippet.orientedReadId = orientedReadId; - snippet.firstPosition = get<1>(triplets[streakBegin]); - for(uint64_t j=streakBegin; j!=streakEnd; ++j) { - snippet.vertices.push_back(get<2>(triplets[j])); - } - snippets.push_back(snippet); - - // Prepare to process the next streak. - i = streakEnd; - } - - - - - // Write the snippets. - if(debug) { - ofstream csv("PathGraphJourneySnippets.csv"); - csv << "SnippetIndex,OrientedReadId,First position,LastPosition,Vertices\n"; - for(uint64_t snippetIndex=0; snippetIndex<snippets.size(); snippetIndex++) { - const PathGraphJourneySnippet& snippet = snippets[snippetIndex]; - csv << snippetIndex << ","; - csv << snippet.orientedReadId << ","; - csv << snippet.firstPosition << ","; - csv << snippet.lastPosition() << ","; - for(const vertex_descriptor v: snippet.vertices) { - csv << pathGraph[v].id << ","; - } - csv << "\n"; - } - } - - - - // For each snippet, create a BitVector that describes the segments - // the snippet visits. 
- const uint64_t snippetCount = snippets.size(); - vector<BitVector> bitVectors(snippetCount); - vector<uint64_t> bitVectorsPopCount(snippetCount); // The number of bits set in each of the bit vectors. - for(uint64_t snippetIndex=0; snippetIndex<snippetCount; snippetIndex++) { - const PathGraphJourneySnippet& snippet = snippets[snippetIndex]; - BitVector& bitVector = bitVectors[snippetIndex]; - - for(const vertex_descriptor v: snippet.vertices) { - auto it = lower_bound(subgraph.begin(), subgraph.end(), v, PathGraphOrderVerticesById(pathGraph)); - SHASTA_ASSERT(it != subgraph.end()); - SHASTA_ASSERT(*it == v); - const uint64_t bitIndex = it - subgraph.begin(); - bitVector.set(bitIndex); - } - bitVectorsPopCount[snippetIndex] = bitVector.count(); - } - - - - if(debug) { - ofstream csv("SnippetBitVector.csv"); - csv << "Snippet,OrientedReadId,"; - for(uint64_t i=0; i<subgraph.size(); i++) { - const vertex_descriptor v = subgraph[i]; - csv << pathGraph[v].id << ","; - } - csv << "\n"; - for(uint64_t snippetIndex=0; snippetIndex<snippetCount; snippetIndex++) { - const PathGraphJourneySnippet& snippet = snippets[snippetIndex]; - csv << snippetIndex << ","; - csv << snippet.orientedReadId << ","; - const BitVector& bitVector = bitVectors[snippetIndex]; - for(uint64_t i=0; i<subgraph.size(); i++) { - csv << bitVector[i] << ","; - } - csv << "\n"; - } - } - - - - // Create the SnippetGraph. - // A vertex represents a set of snippets and stores - // the corresponding snippet indexes. - // An edge x->y is created if there is at least one snippet in y - // that is an approximate subset of a snippet in x. - // We express this condition as |y-x| < fractionThreshold * |y| - // We start with one snippet per vertex. 
- SnippetGraph snippetGraph; - vector<SnippetGraph::vertex_descriptor> vertexTable; - std::map<SnippetGraph::vertex_descriptor, uint64_t> vertexMap; - for(uint64_t snippetIndex=0; snippetIndex<snippetCount; snippetIndex++) { - const auto v = add_vertex(SnippetGraphVertex(snippetIndex), snippetGraph); - vertexTable.push_back(v); - vertexMap.insert(make_pair(v, snippetIndex)); - } - for(uint64_t iy=0; iy<snippetCount; iy++) { - const BitVector& y = bitVectors[iy]; - const uint64_t threshold = uint64_t(std::round(fractionThreshold * double(bitVectorsPopCount[iy]))); - const SnippetGraph::vertex_descriptor vy = vertexTable[iy]; - for(uint64_t ix=0; ix<snippetCount; ix++) { - if(ix == iy) { - continue; - } - const BitVector& x = bitVectors[ix]; - - // Compute z = y-x. - BitVector z = y; - z &= (~x); - - if(z.count() <= threshold) { - const SnippetGraph::vertex_descriptor vx = vertexTable[ix]; - add_edge(vx, vy, snippetGraph); - } - } - } - if(debug) { - snippetGraph.writeGraphviz("SnippetGraph-Initial.dot"); - } - - - - // Compute strongly connected components of the SnippetGraph. - std::map<SnippetGraph::vertex_descriptor, uint64_t> componentMap; - const uint64_t componentCount = boost::strong_components( - snippetGraph, - boost::make_assoc_property_map(componentMap), - boost::vertex_index_map(boost::make_assoc_property_map(vertexMap))); - // cout << "Found " << componentCount << " strongly connected components." << endl; - - // Gather the vertices of each strongly connected component. 
- vector< vector<SnippetGraph::vertex_descriptor> > components(componentCount); - BGL_FORALL_VERTICES_T(v, snippetGraph, SnippetGraph) { - const uint64_t componentId = componentMap[v]; - SHASTA_ASSERT(componentId < componentCount); - components[componentId].push_back(v); - } - if(false) { - cout << "Strongly connected components:\n"; - for(uint64_t componentId=0; componentId<componentCount; componentId++) { - cout << componentId << ": "; - for(const SnippetGraph::vertex_descriptor v: components[componentId]) { - cout << vertexMap[v] << " "; - } - cout << "\n"; - } - } - - - - // Condense the strongly connected components. - // After this, the SnippetGraph is guaranteed to be acyclic. - for(const vector<SnippetGraph::vertex_descriptor>& component: components) { - if(component.size() == 1) { - continue; - } - - // Create a new vertex to represent this component. - const auto vNew = add_vertex(snippetGraph); - vector<uint64_t>& snippetsNew = snippetGraph[vNew].snippetIndexes; - for(const vertex_descriptor v: component) { - const vector<uint64_t>& snippets = snippetGraph[v].snippetIndexes; - SHASTA_ASSERT(snippets.size() == 1); - snippetsNew.push_back(snippets.front()); - } - - // Create the new edges. - for(const vertex_descriptor v0: component) { - - // Out-edges. - BGL_FORALL_OUTEDGES_T(v0, e01, snippetGraph, SnippetGraph) { - const vertex_descriptor v1 = target(e01, snippetGraph); - if(v1 != vNew) { - add_edge(vNew, v1, snippetGraph); - } - } - - // In-edges. - BGL_FORALL_INEDGES_T(v0, e10, snippetGraph, SnippetGraph) { - const vertex_descriptor v1 = source(e10, snippetGraph); - if(v1 != vNew) { - add_edge(v1, vNew, snippetGraph); - } - } - } - - // Remove the old vertices and their edges. - for(const vertex_descriptor v: component) { - clear_vertex(v, snippetGraph); - remove_vertex(v, snippetGraph); - } - } - - - - // Compute which maximal vertices each vertex is a descendant of. 
- std::map<SnippetGraph::vertex_descriptor, vector<SnippetGraph::vertex_descriptor> > ancestorMap; - BGL_FORALL_VERTICES_T(v0, snippetGraph, SnippetGraph) { - if(in_degree(v0, snippetGraph) != 0) { - continue; // Not a maximal vertex. - } - - // Find the descendants of this maximal vertex. - vector<vertex_descriptor> descendants; - snippetGraph.findDescendants(v0, descendants); - - // Update the ancestor map. - for(const vertex_descriptor v1: descendants) { - ancestorMap[v1].push_back(v0); - } - } - - - - // Each maximal vertex generates a cluster consisting of the vertices - // that descend from it and from no other maximal vertex. - // Gather the vertices in each cluster. - std::map<SnippetGraph::vertex_descriptor, vector<SnippetGraph::vertex_descriptor> > clusterMap; - uint64_t unclusterVertexCount = 0; - BGL_FORALL_VERTICES_T(v1, snippetGraph, SnippetGraph) { - const vector<SnippetGraph::vertex_descriptor>& ancestors = ancestorMap[v1]; - if(ancestors.size() == 1) { - const vertex_descriptor v0 = ancestors.front(); - clusterMap[v0].push_back(v1); - } else { - ++unclusterVertexCount; - } - } - if(debug or unclusterVertexCount>0) { - cout << "Subgraph " << subgraphId << " has " << unclusterVertexCount << - " unclustered snippets out of " << snippetCount << " total." << endl; - } - - - - // Gather the snippets in each cluster. - vector<PathGraphJourneySnippetCluster> clusters; - for(const auto& p: clusterMap) { - const vector<SnippetGraph::vertex_descriptor>& clusterVertices = p.second; - clusters.resize(clusters.size() + 1); - PathGraphJourneySnippetCluster& cluster = clusters.back(); - - vector<uint64_t> clusterSnippetIndexes; // Only used for debug output. 
- for(const SnippetGraph::vertex_descriptor v: clusterVertices) { - const vector<uint64_t>& snippetIndexes = snippetGraph[v].snippetIndexes; - for(const uint64_t snippetIndex: snippetIndexes) { - cluster.snippets.push_back(snippets[snippetIndex]); - clusterSnippetIndexes.push_back(snippetIndex); - } - } - cluster.constructVertices(pathGraph); - cluster.cleanupVertices(minVertexCoverage); - if(debug) { - cout << "Found a cluster candidate with " << - clusterVertices.size() << " vertices and " << - cluster.snippets.size() << " snippets:" << endl; - for(const uint64_t snippetIndex: clusterSnippetIndexes) { - cout << snippetIndex << " "; - } - cout << endl; - } - - // If coverage on this cluster is too low, discard it. - if(cluster.coverage() < minClusterCoverage) { - clusters.resize(clusters.size() - 1); - if(debug) { - cout << "This cluster candidate was discarded because of low coverage." << endl; - } - continue; - } - - // This cluster will be stored and is assigned this clusterId; - const uint64_t clusterId = clusters.size() - 1; - - if(debug) { - - cout << "This cluster was stored as cluster " << clusterId << endl; - cout << "Vertex(coverage) for this cluster:\n"; - for(const auto& p: cluster.vertices) { - cout << pathGraph[p.first].id << "(" << p.second << ") "; - } - cout << endl; - } - - // Mark the vertices of this cluster. - for(const SnippetGraph::vertex_descriptor v: clusterVertices) { - snippetGraph[v].clusterId = clusterId; - } - } - snippetGraph.clusterCount = clusters.size(); - - - - - // Write out the SnippetGraph. - if(debug) { - snippetGraph.writeGraphviz("SnippetGraph.dot"); - } - - - - // Find the paths of each cluster. - // Each of these paths generates a new vertex for the next detangle iteration. - newVertices.clear(); - if(debug) { - cout << "Kept " << clusters.size() << " clusters." 
<< endl; - } - for(uint64_t clusterId=0; clusterId<clusters.size(); clusterId++) { - PathGraphJourneySnippetCluster& cluster = clusters[clusterId]; - vector< vector<vertex_descriptor> > paths; - ofstream graphOut; - if(debug) { - graphOut.open("Cluster-" + to_string(clusterId) + ".dot"); - cout << "Finding paths generated by cluster " << clusterId << endl; - } - findClusterPaths(cluster, paths, debug ? &graphOut : 0, debug); - - // Construct the clusterSet for this cluster. - // It is set of all pairs (orientedReadId, vertex) covered by this cluster. - cluster.createClusterSet(); - - // For each path, generate a new vertex for the next detangle iteration. - for(const vector<vertex_descriptor>& path: paths) { - newVertices.emplace_back(); - PathGraphVertex& newVertex = newVertices.back(); - - // Construct the assembly graph path for the new vertex. - for(const vertex_descriptor v: path) { - const PathGraphVertex& vertex = pathGraph[v]; - copy(vertex.path.begin(), vertex.path.end(), back_inserter(newVertex.path)); - } - - // Intersect the clusterSet of this cluster with this path. - std::set<vertex_descriptor> pathVertices; - for(const vertex_descriptor v: path) { - pathVertices.insert(v); - } - std::set< pair<OrientedReadId, vertex_descriptor> > pathSet; - for(const auto& p: cluster.clusterSet) { - if(pathVertices.contains(p.second)) { - pathSet.insert(p); - } - } - - // Write out this pathSet. 
- if(debug) { - cout << "pathSet for this path:" << endl; - for(const auto& p: pathSet) { - const OrientedReadId orientedReadId = p.first; - const vertex_descriptor v = p.second; - const PathGraphVertex& vertex = pathGraph[v]; - for(const pair<AssemblyGraphJourneyInterval, uint64_t>& p: vertex.journeyIntervals) { - const AssemblyGraphJourneyInterval& interval = p.first; - if(interval.orientedReadId == orientedReadId) { - cout << orientedReadId << " " << interval.first << " " << interval.last << endl; - } - } - } - } - - // Describe the pathSet as an interval map for each oriented read. - std::map< OrientedReadId, boost::icl::interval_set<uint64_t> > pathSetMap; - for(const auto& p: pathSet) { - const OrientedReadId orientedReadId = p.first; - const vertex_descriptor v = p.second; - const PathGraphVertex& vertex = pathGraph[v]; - for(const pair<AssemblyGraphJourneyInterval, uint64_t>& p: vertex.journeyIntervals) { - const AssemblyGraphJourneyInterval& assemblyGraphJourneyInterval = p.first; - if(assemblyGraphJourneyInterval.orientedReadId == orientedReadId) { - auto interval = boost::icl::interval<uint64_t>::right_open( - assemblyGraphJourneyInterval.first, - assemblyGraphJourneyInterval.last + 1); - pathSetMap[orientedReadId].insert(interval); - } - } - } - if(debug) { - cout << "pathSetMap:" << endl; - for(const auto& p: pathSetMap) { - const OrientedReadId orientedReadId = p.first; - const boost::icl::interval_set<uint64_t>& intervals = p.second; - for(const auto& interval: intervals) { - cout << orientedReadId << " " << interval.lower() << " " << interval.upper() << endl; - } - } - } - - // With this information we can construct the AssemblyGraphJourneyInterval's for the new vertex. 
- for(const auto& p: pathSetMap) { - const OrientedReadId orientedReadId = p.first; - const boost::icl::interval_set<uint64_t>& intervals = p.second; - for(const auto& interval: intervals) { - AssemblyGraphJourneyInterval assemblyGraphJourneyInterval; - assemblyGraphJourneyInterval.orientedReadId = orientedReadId; - assemblyGraphJourneyInterval.first = interval.lower(); - assemblyGraphJourneyInterval.last = interval.upper() - 1; - newVertex.journeyIntervals.push_back(make_pair(assemblyGraphJourneyInterval, invalid<uint64_t>)); - } - } - } - } -} - - - -// Detangle all the subgraphs. -// This does not modify the PathGraph. -// Instead, it creates vertices to be used for next detangle iteration. -void PathGraph::detangle(vector<PathGraphVertex>& allNewVertices) const -{ - allNewVertices.clear(); - vector<PathGraphVertex> newVertices; - for(uint64_t subgraphId=0; subgraphId<subgraphs.size(); subgraphId++) { - detangleSubgraph(subgraphId, newVertices, false); - copy(newVertices.begin(), newVertices.end(), back_inserter(allNewVertices)); - } -} - - - -// Construct a set of all pairs (orientedReadId, vertex) covered by this cluster. -void PathGraphJourneySnippetCluster::createClusterSet() -{ - clusterSet.clear(); - for(const PathGraphJourneySnippet& snippet: snippets) { - for(const PathGraphBaseClass::vertex_descriptor v: snippet.vertices) { - clusterSet.insert(make_pair(snippet.orientedReadId, v)); - } - } -} - - - -void SnippetGraph::findDescendants( - const vertex_descriptor vStart, - vector<vertex_descriptor>& descendants) const -{ - const SnippetGraph& graph = *this; - - // Initialize the BFS. - std::queue<vertex_descriptor> q; - q.push(vStart); - std::set<vertex_descriptor> descendantsSet; - descendantsSet.insert(vStart); - - // BFS loop. 
- while(not q.empty()) { - const vertex_descriptor v0 = q.front(); - q.pop(); - - BGL_FORALL_OUTEDGES(v0, e01, graph, SnippetGraph) { - const vertex_descriptor v1 = target(e01, graph); - if(descendantsSet.find(v1) == descendantsSet.end()) { - q.push(v1); - descendantsSet.insert(v1); - } - } - } - - descendants.clear(); - copy(descendantsSet.begin(), descendantsSet.end(), back_inserter(descendants)); -} - - - -void SnippetGraph::writeGraphviz( - const string& fileName) const -{ - const SnippetGraph& graph = *this; - - ofstream dot(fileName); - dot << "digraph SnippetGraph{\n" - "node [shape=rectangle];\n"; - BGL_FORALL_VERTICES(v, graph, SnippetGraph) { - dot << "\"" << v << "\" [label=\""; - const vector<uint64_t>& snippetIndexes = graph[v].snippetIndexes; - for(const uint64_t snippetIndex: snippetIndexes) { - dot << snippetIndex; - dot << "\\n"; - } - dot << "\""; - const uint64_t clusterId = graph[v].clusterId; - if(clusterId != invalid<uint64_t>) { - dot << " style=filled fillcolor=\"" << - float(clusterId)/float(clusterCount) << - ",0.3,1\""; - } - dot << "];\n"; - } - BGL_FORALL_EDGES(e, graph, SnippetGraph) { - const vertex_descriptor vx = source(e, graph); - const vertex_descriptor vy = target(e, graph); - dot << "\"" << vx << "\"->\"" << vy << "\";\n"; - } - dot << "}\n"; - -} - - - -vector<PathGraphBaseClass::vertex_descriptor> PathGraphJourneySnippetCluster::getVertices() const -{ - vector<PathGraphBaseClass::vertex_descriptor> v; - for(const auto& p: vertices) { - v.push_back(p.first); - } - return v; -} - - - -void PathGraphJourneySnippetCluster::cleanupVertices(uint64_t minVertexCoverage) -{ - vector< pair<PathGraphBaseClass::vertex_descriptor, uint64_t > > newVertices; - for(const auto& p: vertices) { - if(p.second >= minVertexCoverage) { - newVertices.push_back(p); - } - } - vertices.swap(newVertices); -} - - - -void PathGraphJourneySnippetCluster::constructVertices(const PathGraph& pathGraph) -{ - // A map with Key=vertex_descriptor, value = 
coverage. - auto vertexMap = std::map<PathGraphBaseClass::vertex_descriptor, uint64_t, PathGraphOrderVerticesById>( - PathGraphOrderVerticesById(pathGraph)); - - for(const PathGraphJourneySnippet& snippet: snippets) { - for(const PathGraphBaseClass::vertex_descriptor v: snippet.vertices) { - auto it = vertexMap.find(v); - if(it == vertexMap.end()) { - vertexMap.insert(make_pair(v, 1)); - } else { - ++(it->second); - } - } - } - - vertices.clear(); - copy(vertexMap.begin(), vertexMap.end(), back_inserter(vertices)); -} - - - -// Given a PathGraphJourneySnippetCluster, find a plausible -// path for it in the PathGraph. -void PathGraph::findClusterPaths( - const PathGraphJourneySnippetCluster& cluster, - vector< vector<vertex_descriptor> >& paths, - ostream* graphOut, - bool debug) const -{ - const PathGraph& pathGraph = *this; - - // Map vertex descriptors to indexes in cluster.vertices. - std::map<vertex_descriptor, uint64_t> vertexMap; - for(uint64_t i=0; i<cluster.vertices.size(); i++) { - const vertex_descriptor v = cluster.vertices[i].first; - vertexMap.insert(make_pair(v, i)); - } - - // Construct the subgraph induced by the vertices of the cluster. - using Subgraph = boost::adjacency_list<boost::listS, boost::vecS, boost::bidirectionalS>; - Subgraph subgraph(vertexMap.size()); - for(const auto& p: vertexMap) { - const vertex_descriptor v0 = p.first; - const uint64_t i0 = p.second; - BGL_FORALL_OUTEDGES(v0, e, pathGraph, PathGraph) { - const vertex_descriptor v1 = target(e, pathGraph); - const auto it = vertexMap.find(v1); - if(it == vertexMap.end()) { - continue; - } - const uint64_t i1 = it->second; - add_edge(i0, i1, subgraph); - } - } - - // Compute strong connected components of this subgraph. 
- const auto indexMap = get(boost::vertex_index, subgraph); - vector<uint64_t> strongComponent(num_vertices(subgraph)); - boost::strong_components( - subgraph, - boost::make_iterator_property_map(strongComponent.begin(), indexMap)); - - // Remove edges internal to strong components. - vector<Subgraph::edge_descriptor> edgesToBeRemoved; - BGL_FORALL_EDGES(e, subgraph, Subgraph) { - const uint64_t i0 = source(e, subgraph); - const uint64_t i1 = target(e, subgraph); - if(strongComponent[i0] == strongComponent[i1]) { - edgesToBeRemoved.push_back(e); - } - } - for(const Subgraph::edge_descriptor e: edgesToBeRemoved) { - boost::remove_edge(e, subgraph); - } - - // Transitive reduction. - transitiveReduction(subgraph); - - - // Write it out. - if(graphOut) { - (*graphOut) << "digraph cluster {\n"; - for(uint64_t i=0; i<vertexMap.size(); i++) { - const auto& p = cluster.vertices[i]; - const vertex_descriptor v = p.first; - const uint64_t coverage = p.second; - (*graphOut) << pathGraph[v].id; - (*graphOut) << " [label=\"" << pathGraph[v].id << "\\n" << coverage << "\"]"; - (*graphOut) << ";\n"; - } - BGL_FORALL_EDGES(e, subgraph, Subgraph) { - const uint64_t i0 = source(e, subgraph); - const uint64_t i1 = target(e, subgraph); - const vertex_descriptor v0 = cluster.vertices[i0].first; - const vertex_descriptor v1 = cluster.vertices[i1].first; - (*graphOut) << pathGraph[v0].id << "->" << pathGraph[v1].id; - (*graphOut) << ";\n"; - } - (*graphOut) << "}\n"; - - } - - - // Find linear chains of vertices. - vector< vector<Subgraph::vertex_descriptor> > chains; - findLinearVertexChains(subgraph, chains); - if(debug) { - cout << "Found the following paths:" << endl; - for(const vector<Subgraph::vertex_descriptor>& chain: chains) { - for(const Subgraph::vertex_descriptor v: chain) { - const PathGraph::vertex_descriptor u = cluster.vertices[v].first; - cout << pathGraph[u].id << " "; - } - cout << endl; - } - } - - // Store a path for each chain. 
- paths.clear(); - for(const vector<Subgraph::vertex_descriptor>& chain: chains) { - vector<PathGraph::vertex_descriptor> path; - for(const Subgraph::vertex_descriptor v: chain) { - const PathGraph::vertex_descriptor u = cluster.vertices[v].first; - path.push_back(u); - } - paths.push_back(path); - } - -} diff --git a/src/mode3-PathGraph.hpp b/src/mode3-PathGraph.hpp deleted file mode 100644 index d5181f2..0000000 --- a/src/mode3-PathGraph.hpp +++ /dev/null @@ -1,286 +0,0 @@ -#ifndef SHASTA_MODE3_PATH_GRAPH_HPP -#define SHASTA_MODE3_PATH_GRAPH_HPP - -/******************************************************************************* - -The mode3::PathGraph is a directed graph in which each vertex represents -a path in the mode3::AssemblyGraph. - -*******************************************************************************/ - -// Shasta. -#include "mode3.hpp" -#include "MultithreadedObject.hpp" - -// Boost libraries. -#include <boost/graph/adjacency_list.hpp> - -// Standard libraries. -#include <limits> -#include "vector.hpp" - -namespace shasta { - namespace mode3 { - class PathGraph; - class PathGraphVertex; - class PathGraphEdge; - class PathGraphOrderVerticesById; - class PathGraphJourneySnippet; - class PathGraphJourneySnippetCluster; - class SnippetGraph; - class SnippetGraphVertex; - - using PathGraphBaseClass = boost::adjacency_list< - boost::listS, - boost::listS, - boost::bidirectionalS, - PathGraphVertex, PathGraphEdge>; - using SnippetGraphBaseClass = - boost::adjacency_list<boost::setS, boost::listS, boost::bidirectionalS, SnippetGraphVertex>; - - } - - extern template class MultithreadedObject<mode3::PathGraph>; -} - - - -// A PathGraphJourneySnippet describes a sequence of consecutive positions -// of the path graph journey of an oriented read. -// An OrientedReadId can have than more one PathGraphJourneySnippet in a given subgraph, -// but this is not common. It can happen if the PathGraph contains a cycle. 
-class shasta::mode3::PathGraphJourneySnippet { -public: - - // The OrientedReadId this refers to. - OrientedReadId orientedReadId; - - // The sequence of vertices encountered. - vector<PathGraphBaseClass::vertex_descriptor> vertices; - - // The first and last position of this snippet - // in the path graph journey of this OrientedReadId. - uint64_t firstPosition; - uint64_t lastPosition() const - { - return firstPosition + vertices.size() - 1; - } -}; - - - -class shasta::mode3::PathGraphJourneySnippetCluster { -public: - - // The snippets in this cluster. - vector<PathGraphJourneySnippet> snippets; - uint64_t coverage() const - { - return snippets.size(); - } - - // The PathGraph vertices visited by the snippets of this cluster, - // each stored with its coverage (number of snippets); - vector< pair<PathGraphBaseClass::vertex_descriptor, uint64_t > > vertices; - vector<PathGraphBaseClass::vertex_descriptor> getVertices() const; - - // Remove vertices with coverage less than the specified value. - void cleanupVertices(uint64_t minClusterCoverage); - - // Construct the vertices given the snippets. - void constructVertices(const PathGraph&); - - // Construct a set of all pairs (orientedReadId, vertex) covered by this cluster. - std::set< pair<OrientedReadId, PathGraphBaseClass::vertex_descriptor> > clusterSet; - void createClusterSet(); -}; - - - -// The SnippetGraph is used by PathGraph::detangleSubgraph. -// A vertex represents a set of snippets and stores -// the corresponding snippet indexes. -// An edge x->y is created if there is at least one snippet in y -// that is an approximate subset of a snippet in x. -// Strongly connected components are condensed, so after that -// the graph is guaranteed to have no cycles. 
-class shasta::mode3::SnippetGraphVertex { - public: - vector<uint64_t> snippetIndexes; - uint64_t clusterId = std::numeric_limits<uint64_t>::max(); - SnippetGraphVertex() {} - SnippetGraphVertex(uint64_t snippetIndex) : - snippetIndexes(1, snippetIndex) {} - }; -class shasta::mode3::SnippetGraph : public SnippetGraphBaseClass { -public: - uint64_t clusterCount = 0; - void findDescendants(const vertex_descriptor, vector<vertex_descriptor>&) const; - void writeGraphviz(const string& fileName) const; -}; - - - -// Each vertex of the PathGraph describes a path -// in the mode3::AssemblyGraph. -class shasta::mode3::PathGraphVertex { -public: - - // The segment ids of the mode3::AssemblyGraph path - // that this vertex describes. - vector<uint64_t> path; - - // We also store the assembly graph journey intervals - // for the oriented reads that are believed to follow this path. - // Note that an oriented read can have more than one journey interval - // (e. g. if it goes around in a cycle). - // The second item in the pair is the ordinal - // of this vertex in the path graph journey of the oriented read. - // It is filled in by computeJourneys. - vector<pair<AssemblyGraphJourneyInterval, uint64_t> > journeyIntervals; - - // The vertex id is only used to help keep track of vertices - // for testing and debugging. - uint64_t id; - - // The partition this vertex was assigned to. - uint64_t subgraphId = invalid<uint64_t>; - - // Distance from the start vertex of the BFS. - // Only used during the BFS. - uint64_t distance = 0; -}; - - - -class shasta::mode3::PathGraphEdge { -public: - uint64_t coverage = 0; -}; - - - -class shasta::mode3::PathGraph : - public PathGraphBaseClass, - public MultithreadedObject<PathGraph> { -public: - - // Create the PathGraph from the AssemblyGraph. - PathGraph(const AssemblyGraph&); - - // This writes a GFA representation of the PathGraph, - // with one GFA segment per vertex. 
- // It also writes an accompanying csv file that can be loaded in Bandage. - void writeGfa(const string& baseName) const; - - // This writes a detailed csv file containing the path corresponding - // to each vertex. - void writeCsvDetailed(const string& fileName) const; - -private: - - // The AssemblyGraph this PathGraph refers to. - const AssemblyGraph& assemblyGraph; - - // Initial creation of the vertices. - // Start with a single segment for each vertex - // (that is, paths of length 1). - void createVertices(); - - // Creation of vertices after a detangle iteration. - void createVertices(const vector<PathGraphVertex>&); - - // Recreate all edges from scratch, using only the - // information stored in the vertices. - void createEdges(uint64_t minCoverage); - - // The id of the next vertex to be added. - // Vertex ids are only used to help keep track of vertices - // for testing and debugging. - uint64_t nextVertexId = 0; - - // The journeys of all oriented reads in the PathGraph. - // The journey of an oriented read in the PathGraph is - // a sequence of vertex descriptors which is not necessarily a path. - // Indexed by OrientedReadId::getValue(); - vector< vector<vertex_descriptor> > journeys; - void computeJourneys(); - void writeJourneys(const string& fileName) const; - - // Partition the PathGraph into subgraphs. - void partition( - uint64_t maxDistance, - uint64_t minSubgraphSize); - static const uint64_t noSubgraph = std::numeric_limits<uint64_t>::max(); - - // Gather subgraphs using the subgraphId stored in each vertex. - // A subgraph can have size 0, and in that case it should be ignored. - void gatherSubgraphs(); - void histogramSubgraphs(); - vector< vector<vertex_descriptor> > subgraphs; - - // A partition iteration does a single BFS starting at v. - // It moves forward from v, avoiding vertices already - // assigned to a subgraph, and up to maxDistance from v. 
- // It also returns the boundaryVertices, that is the - // vertices found in the process that are at distance maxDistance+1 - // from v and are nto yet assigned to a subgraph. - // These can then used as starting points new partition iterations. - void partitionIteration( - vertex_descriptor v, - uint64_t maxDistance, - uint64_t subgraphId, - vector<vertex_descriptor>& boundaryVertices); - - - - // Detangling of a subgraph. - // Returns new vertices for the next detangle iteration. - // The new vertices can only be used in a new PathGraph - // created from scratch. - void detangleSubgraph( - uint64_t subgraphId, - vector<PathGraphVertex>& newVertices, - bool debug - ) const; - template<uint64_t N> void detangleSubgraphTemplate( - uint64_t subgraphId, - vector<PathGraphVertex>& newVertices, - bool debug - ) const; - - // Detangle all the subgraphs. - // This does not modify the PathGraph. - // Instead, it creates vertices to be used for next detangle iteration. - void detangle(vector<PathGraphVertex>& newVertices) const; - - // Given a PathGraphJourneySnippetCluster, find plausible - // paths for it in the PathGraph. - void findClusterPaths( - const PathGraphJourneySnippetCluster&, - vector< vector<vertex_descriptor> >& path, - ostream*, - bool debug) const; -}; - - - -// Class used to order/sort PathGraph vertex descriptors -// by increasing vertex id. 
-class shasta::mode3::PathGraphOrderVerticesById { -public: - PathGraphOrderVerticesById(const PathGraph& pathGraph) : - pathGraph(pathGraph) {} - const PathGraph& pathGraph; - - bool operator()( - PathGraph::vertex_descriptor v0, - PathGraph::vertex_descriptor v1) const - { - return pathGraph[v0].id < pathGraph[v1].id; - } -}; - - - -#endif diff --git a/src/mode3-PhasedComponent.cpp b/src/mode3-PhasedComponent.cpp new file mode 100644 index 0000000..92698cd --- /dev/null +++ b/src/mode3-PhasedComponent.cpp @@ -0,0 +1,31 @@ +#include "mode3-PhasedComponent.hpp" +#include "orderPairs.hpp" +#include "SHASTA_ASSERT.hpp" +using namespace shasta; +using namespace mode3; + +#include "algorithm.hpp" +#include "limits" + + + +void PhasedComponent::sort() +{ + SHASTA_ASSERT(size() > 1); + std::sort(begin(), end(), OrderPairsByFirstOnly<uint64_t, int64_t>()); + minPositionInBubbleChain = front().first; + maxPositionInBubbleChain = back().first; +} + + + +void PhasedComponent::computePositionRange() +{ + minPositionInBubbleChain = std::numeric_limits<uint64_t>::max(); + maxPositionInBubbleChain = 0; + for(const auto& p: *this) { + const uint64_t positionInBubbleChain = p.first; + minPositionInBubbleChain = min(minPositionInBubbleChain, positionInBubbleChain); + maxPositionInBubbleChain = max(maxPositionInBubbleChain, positionInBubbleChain); + } +} diff --git a/src/mode3-PhasedComponent.hpp b/src/mode3-PhasedComponent.hpp new file mode 100644 index 0000000..f4b0e87 --- /dev/null +++ b/src/mode3-PhasedComponent.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include "cstdint.hpp" +#include "utility.hpp" +#include "vector.hpp" + +namespace shasta { + namespace mode3 { + class PhasedComponent; + } +} + + + +// A PhasedComponent is a set of phased diploid bubbles +// in a BubbleChain. +// It is a vector of (bubble position in bubble chain, phase), +// sorted by bubble position in bubble chain. +// The phase can be -1 or +1. 
+// PhasedComponents are created in such a way that their position ranges +// in the bubble chain are not overlapping. +class shasta::mode3::PhasedComponent : public vector< pair<uint64_t, int64_t> > { +public: + uint64_t minPositionInBubbleChain; + uint64_t maxPositionInBubbleChain; + void sort(); + void computePositionRange(); +}; + diff --git a/src/mode3-PhasingTable.cpp b/src/mode3-PhasingTable.cpp new file mode 100644 index 0000000..fd1c937 --- /dev/null +++ b/src/mode3-PhasingTable.cpp @@ -0,0 +1,1258 @@ +// Shasta. + +#include "Assembler.hpp" +#include "bits/stdint-uintn.h" +#include "mode3-AssemblyGraph.hpp" +#include "mode3-PhasingTable.hpp" +#include "MarkerGraph.hpp" +#include "MarkerInterval.hpp" +#include "orderPairs.hpp" +#include "PngImage.hpp" +#include "shastaTypes.hpp" +#include "SHASTA_ASSERT.hpp" +using namespace shasta; +using namespace mode3; + +// Boost libraries +#include <boost/graph/iteration_macros.hpp> +#include <boost/graph/graph_traits.hpp> +#include <boost/multi_index/detail/bidir_node_iterator.hpp> +#include <boost/multi_index/detail/ord_index_impl.hpp> +#include <boost/operators.hpp> + +// Standard library. +#include "algorithm.hpp" +#include "filesystem.hpp" +#include "iostream.hpp" +#include <limits> +#include <set> +#include "stdexcept.hpp" +#include "string.hpp" +#include "tuple.hpp" +#include "utility.hpp" +#include "vector.hpp" + + + +void AssemblyGraph::writeBubbleChainsPhasingTables( + const string& fileNamePrefix, + double phaseErrorThreshold) const +{ + const AssemblyGraph& cGraph = *this; + + const string directoryName = fileNamePrefix + "-PhasingTables"; + if(not std::filesystem::create_directory(directoryName)) { + throw runtime_error("Could not create directory " + directoryName); + } + + + // Loop over all BubbleChains. + BGL_FORALL_EDGES(ce, cGraph, AssemblyGraph) { + const AssemblyGraphEdge& edge = cGraph[ce]; + const BubbleChain& bubbleChain = edge; + + // Create the phasing table for this bubble chain. 
+ PhasingTable phasingTable(bubbleChain, assembler.markerGraph, phaseErrorThreshold); + + if(phasingTable.empty()) { + continue; + } + if(phasingTable.bubbleCount() < 2) { + continue; + } + + cout << "Phasing table for " << bubbleChainStringId(ce) << + " has " << phasingTable.entryCount() << + " entries (of which " << phasingTable.ambiguousEntryCount() << + " ambiguous) for " << + phasingTable.bubbleCount() << " bubbles and " << + phasingTable.orientedReadCount() << " oriented reads." << endl; + + const string fileNamePrefix = directoryName + "/" + bubbleChainStringId(ce); + phasingTable.writeCsv(fileNamePrefix); + phasingTable.writePng(fileNamePrefix + "-RelativePhase.png", + PhasingTable::ColoringMethod::byRelativePhase); + phasingTable.writePng(fileNamePrefix + "-DiscreteRelativePhase.png", + PhasingTable::ColoringMethod::byDiscreteRelativePhase); + + phasingTable.greedyPhasing(); + phasingTable.writePng(fileNamePrefix + "-Consistency.png", + PhasingTable::ColoringMethod::byConsistency); + +#if 0 + for(uint64_t i=0; i<6; i++) { + cout << "Discordant count before sweep " << i << " = " << phasingTable.discordantCount() << endl; + phasingTable.flipSweep(); + } + cout << "Final discordant count = " << phasingTable.discordantCount() << endl; + phasingTable.writePng(directoryName + "/" + bubbleChainStringId(ce) + "-sweep.png", false); + phasingTable.writePng(directoryName + "/" + bubbleChainStringId(ce) + "-sweep-byType.png", true); +#endif + } +} + + +PhasingTable::PhasingTable( + const BubbleChain& bubbleChain, + const MarkerGraph& markerGraph, + double phaseErrorThreshold) +{ + fill(bubbleChain, markerGraph, phaseErrorThreshold); + gatherOrientedReads(); + gatherBubbles(); + fillIndexes(); +} + + + +void PhasingTable::fill( + const BubbleChain& bubbleChain, + const MarkerGraph& markerGraph, + double phaseErrorThreshold) +{ + clear(); + + // Loop over the bubbles in this bubble chain. 
+ for(uint64_t positionInBubbleChain=0; positionInBubbleChain<bubbleChain.size(); positionInBubbleChain++) { + const mode3::Bubble& bubble = bubbleChain[positionInBubbleChain]; + + // If this bubble is not diploid, skip it. + if(not bubble.isDiploid()) { + continue; + } + + // Loop over the two chains of this diploid bubble. + for(uint64_t chainIndexInBubble=0; chainIndexInBubble<bubble.size(); chainIndexInBubble++) { + SHASTA_ASSERT(chainIndexInBubble < 2); + const Chain& chain = bubble[chainIndexInBubble]; + + + // Loop over marker graph edges of this chain, excluding the terminal ones. + SHASTA_ASSERT(chain.size() >= 2); + for(uint64_t i=1; i<chain.size()-1; i++) { + const MarkerGraphEdgeId markerGraphEdgeId = chain[i]; + + // Loop over MarkerIntervals of this marker graph edge. + const span<const MarkerInterval> markerIntervals = markerGraph.edgeMarkerIntervals[markerGraphEdgeId]; + for(const MarkerInterval& markerInterval: markerIntervals) { + const OrientedReadId orientedReadId = markerInterval.orientedReadId; + + // Access the PhasingTableEntry for this OrientedReadId and + // position in the bubble chain, creating it if necessary. + auto it = indexByBoth().find(make_tuple(orientedReadId, positionInBubbleChain)); + if(it == indexByBoth().end()) { + tie(it, ignore) = insert(PhasingTableEntry(orientedReadId, positionInBubbleChain)); + } + // Access it as non-const so we can update the frequency array. + // We can do a const_cast because we only update the frequency, + // which does not participate in any field used to index the PhasingTable. + PhasingTableEntry& entry = const_cast<PhasingTableEntry&>(*it); + + // Increment the PhasingTableEntry for this OrientedReadId and positionInBubbleChain. + ++entry.frequency[chainIndexInBubble]; + } + } + } + } + + // Compute the relative phase of all PhasingTableEntries. 
+ for(const PhasingTableEntry& phasingTableEntry: indexByBoth()) { + PhasingTableEntry& nonConstPhasingTableEntry = const_cast<PhasingTableEntry&>(phasingTableEntry); + nonConstPhasingTableEntry.storeRelativePhase(phaseErrorThreshold); + } +} + + + +void PhasingTable::gatherOrientedReads() +{ + + // Gather the distinct OrientedReadIds that appear in this PhasingTable. + std::set<OrientedReadId> orientedReadIds; + for(const PhasingTableEntry& phasingTableEntry: indexByBoth()) { + orientedReadIds.insert(phasingTableEntry.orientedReadId); + } + + // Store them in the orientedReads vector. + orientedReads.clear(); + for(const OrientedReadId orientedReadId: orientedReadIds) { + OrientedRead orientedRead; + orientedRead.id = orientedReadId; + orientedReads.push_back(orientedRead); + } + + // Fill in the min/max positions in the bubble chain. + for(OrientedRead& orientedRead: orientedReads) { + orientedRead.minPositionInBubbleChain = std::numeric_limits<uint64_t>::max(); + orientedRead.maxPositionInBubbleChain = 0; + for(auto it=indexByOrientedReadId().find(orientedRead.id); + it!=indexByOrientedReadId().end() and it->orientedReadId == orientedRead.id; ++it) { + const uint64_t positionInBubbleChain = it->positionInBubbleChain; + orientedRead.minPositionInBubbleChain = min(orientedRead.minPositionInBubbleChain, positionInBubbleChain); + orientedRead.maxPositionInBubbleChain = max(orientedRead.maxPositionInBubbleChain, positionInBubbleChain); + } + } + + // Sort the orientedReads vector by average position. 
+ vector< pair<uint64_t, uint64_t> > orientedReadsTable; // (index, minPosition + maxPosition) + for(uint64_t i=0; i<orientedReads.size(); i++) { + const OrientedRead& orientedRead = orientedReads[i]; + orientedReadsTable.push_back({i, orientedRead.minPositionInBubbleChain + orientedRead.maxPositionInBubbleChain}); + } + sort(orientedReadsTable.begin(), orientedReadsTable.end(), + OrderPairsBySecondOnly<uint64_t, uint64_t>()); + vector<OrientedRead> sortedOrientedReads; + for(const auto& p: orientedReadsTable) { + sortedOrientedReads.push_back(orientedReads[p.first]); + } + orientedReads.swap(sortedOrientedReads); + + // Fill in the orientedReadIdsMap map. + orientedReadsMap.clear(); + for(uint64_t i=0; i<orientedReads.size(); i++) { + orientedReadsMap.insert({orientedReads[i].id, i}); + } +} + + + +void PhasingTable::gatherBubbles() +{ + + // Gather the positions in the bubble chains of the diploid bubbles + // that the oriented reads appear in. + std::set<uint64_t> positionsInBubbleChain; + for(const PhasingTableEntry& phasingTableEntry: indexByBoth()) { + positionsInBubbleChain.insert(phasingTableEntry.positionInBubbleChain); + } + + // Store them in the bubbles vector. + bubbles.clear(); + for(const uint64_t positionInBubbleChain: positionsInBubbleChain) { + bubbles.push_back({positionInBubbleChain}); + } + + // Check that the bubbles are sorted by position. + for(uint64_t i1=1; i1<bubbles.size(); i1++) { + const uint64_t i0 = i1 - 1; + const Bubble& bubble0 = bubbles[i0]; + const Bubble& bubble1 = bubbles[i1]; + SHASTA_ASSERT(bubble0.positionInBubbleChain < bubble1.positionInBubbleChain); + } + + // Fill in the bubble map. + bubblesMap.clear(); + for(uint64_t i=0; i<bubbles.size(); i++) { + bubblesMap.insert({bubbles[i].positionInBubbleChain, i}); + } + +} + + + +// Fill the orientedReadIndex and bubbleIndex in all PhasingTableEntries. +// This can only be done after gatherOrientedReads and gatherBubbles +// have been called. 
+void PhasingTable::fillIndexes() +{ + for(const PhasingTableEntry& phasingTableEntry: indexByBoth()) { + + // Access the PhasingTableEntry as a non-const reference. + // This is ok because we will not modify the fields that participate + // in the PhasingTable indexes. + PhasingTableEntry& entry = const_cast<PhasingTableEntry&>(phasingTableEntry); + + entry.orientedReadIndex = orientedReadsMap[entry.orientedReadId]; + entry.bubbleIndex = bubblesMap[entry.positionInBubbleChain]; + } + +} + +#if 0 + +void PhasingTable::write(const string& fileNamePrefix) const +{ + writeCsv(fileNamePrefix); + writeHtml(fileNamePrefix); + writePng(fileNamePrefix + ".png", true); +} +#endif + + + +void PhasingTable::writeCsv(const string& fileNamePrefix) const +{ + writeOrientedReadsCsv(fileNamePrefix); + writeBubblesCsv(fileNamePrefix, true); + writeDetailsCsv(fileNamePrefix); +} + + + +void PhasingTable::writeOrientedReadsCsv(const string& fileNamePrefix) const +{ + ofstream csv(fileNamePrefix + "-OrientedReads.csv"); + csv << "OrientedReadId,Min position in bubble chain,Max position in bubble chain," + "Oriented read index,Min bubble index,Max bubble Index,\n"; + + for(uint64_t i=0; i<orientedReads.size(); i++) { + const OrientedRead& orientedRead = orientedReads[i]; + csv << orientedRead.id << ","; + csv << orientedRead.minPositionInBubbleChain << ","; + csv << orientedRead.maxPositionInBubbleChain << ","; + csv << i << ","; + csv << bubblesMap.find(orientedRead.minPositionInBubbleChain)->second << ","; + csv << bubblesMap.find(orientedRead.maxPositionInBubbleChain)->second << ","; + csv << "\n"; + } +} + + + +void PhasingTable::writeBubblesCsv( + const string& fileNamePrefix, + bool writePhasingInformation) const +{ + ofstream csv(fileNamePrefix + "-Bubbles.csv"); + csv << "Position in bubble chain,Bubble index,Unambiguous,Ambiguous,"; + if(writePhasingInformation) { + csv << "Consistent,Inconsistent,Error rate,"; + } + csv << "\n"; + + for(uint64_t i=0; i<bubbles.size(); i++) { 
+ csv << bubbles[i].positionInBubbleChain << ","; + csv << i << ","; + + uint64_t unambiguous; + uint64_t ambiguous; + tie(unambiguous, ambiguous) = countEntriesForBubble(bubbles[i].positionInBubbleChain); + csv << unambiguous << ","; + csv << ambiguous << ","; + + if(writePhasingInformation) { + uint64_t consistent; + uint64_t inconsistent; + tie(consistent, inconsistent) = countConsistentEntriesForBubble(bubbles[i].positionInBubbleChain); + csv << consistent << ","; + csv << inconsistent << ","; + csv << double(inconsistent) / double(consistent + inconsistent) << ","; + } + + csv << "\n"; + } +} + + + +void PhasingTable::writeDetailsCsv(const string& fileNamePrefix) const +{ + ofstream csv(fileNamePrefix + "-Details.csv"); + + csv << "Position in bubble chain,OrientedReadId,Bubble index,Oriented read index,Frequency0,Frequency1," + "Relative phase,DiscreteRelative phase\n"; + + for(const OrientedRead& orientedRead: orientedReads) { + const OrientedReadId orientedReadId = orientedRead.id; + for(auto it=indexByOrientedReadId().find(orientedReadId); + it!=indexByOrientedReadId().end() and it->orientedReadId == orientedReadId; ++it) { + const PhasingTableEntry& phasingTableEntry = *it; + phasingTableEntry.writeCsv(csv); + csv << "\n"; + } + } +} + + + +void PhasingTableEntry::writeCsv(ostream& csv) const +{ + csv << positionInBubbleChain << ","; + csv << orientedReadId << ","; + csv << bubbleIndex << ","; + csv << orientedReadIndex << ","; + csv << frequency[0] << ","; + csv << frequency[1] << ","; + csv << relativePhase << ","; + csv << discreteRelativePhase << ","; +} + + + +void PhasingTable::writePng(const string& fileName, ColoringMethod coloringMethod) const +{ + PngImage image{int(bubbleCount()), int(orientedReadCount())}; + for(uint64_t x=0; x<bubbleCount(); x++) { + for(uint64_t y=0; y<orientedReadCount(); y++) { + image.setPixel(int(x), int(y), 255, 255, 255); + } + } + + for(const PhasingTableEntry& entry: indexByBoth()) { + + int r, g, b; + 
if(coloringMethod == ColoringMethod::byDiscreteRelativePhase) { + switch(entry.discreteRelativePhase) { + case 0: + // Ambiguous: black + r = 0; + g = 0; + b = 0; + break; + case +1: + // In-phase: red. + r = 255; + g = 0; + b = 0; + break; + case -1: + // Out-of-phase: blue. + r = 0; + g = 0; + b = 255; + break; + default: + SHASTA_ASSERT(0); + } + + } else if(coloringMethod == ColoringMethod::byRelativePhase) { + + // Compute (r, g, b) values that give: + // - Green if relativePhase is 1 (in-phase). + // - Red if relativePhase is -1 (out-of-phase). + if(entry.relativePhase >= 0.) { + r = 255; + g = 0; + b = int(std::round((1. - entry.relativePhase) * 255.)); + } else { + r = int(std::round((1. + entry.relativePhase) * 255.)); + g = 0; + b = 255; + } + } else if(coloringMethod == ColoringMethod::byConsistency) { + const int64_t state = consistencyState(entry); + switch(state) { + case +1: + r = 0; + g = 255; + b = 0; + break; + case -1: + r = 255; + g = 0; + b = 0; + break; + case 0: + r = 255; + g = 255; + b = 0; + break; + default: + SHASTA_ASSERT(0); + } + + } else { + SHASTA_ASSERT(0); + } + + image.setPixel(int(entry.bubbleIndex), int(entry.orientedReadIndex), r, g, b); + } + + image.write(fileName); +} + + + +uint64_t PhasingTable::unambiguousEntryCount() const +{ + const auto& indexByBoth = get<0>(); + + uint64_t n = 0; + for(const PhasingTableEntry& entry: indexByBoth) { + if(entry.discreteRelativePhase != 0) { + ++n; + } + } + return n; +} + + + +uint64_t PhasingTable::ambiguousEntryCount() const +{ + const auto& indexByBoth = get<0>(); + + uint64_t n = 0; + for(const PhasingTableEntry& entry: indexByBoth) { + if(entry.discreteRelativePhase == 0) { + ++n; + } + } + return n; +} + + + +// Compute the consistency state of a PhasingTableEntry relative +// to the current phases of its oriented read and bubble. +// It can be +1 (consistent), -1 (inconsistent), or 0 (unassigned or ambiguous). 
+int64_t PhasingTable::consistencyState(const PhasingTableEntry& entry) const +{ + if(entry.discreteRelativePhase == 0) { + return 0; + } + + const int64_t orientedReadPhase = orientedReads[entry.orientedReadIndex].phase; + if(orientedReadPhase == 0) { + return 0; + } + + const int64_t bubblePhase = bubbles[entry.bubbleIndex].phase; + if(bubblePhase == 0) { + return 0; + } + + if(entry.discreteRelativePhase == 1) { + if(orientedReadPhase == bubblePhase) { + return +1; + } else { + return -1; + } + } else { + if(orientedReadPhase == bubblePhase) { + return -1; + } else { + return +1; + } + } +} + + + +// Count the number of (consistent,inconsistent) PhasingTableEntries +// for an oriented read based on the phases currently assigned +// to bubbles and oriented reads. +pair<uint64_t, uint64_t> PhasingTable::countConsistentEntriesForOrientedRead( + OrientedReadId orientedReadId) const +{ + uint64_t consistentCount = 0; + uint64_t inconsistentCount = 0; + + for(auto it=indexByOrientedReadId().find(orientedReadId); + it!=indexByOrientedReadId().end() and it->orientedReadId == orientedReadId; ++it) { + const PhasingTableEntry& entry = *it; + + const int64_t s = consistencyState(entry); + switch(s) { + case +1: + ++consistentCount; + break; + case -1: + ++inconsistentCount; + break; + case 0: + break; + default: + SHASTA_ASSERT(0); + } + } + + return {consistentCount, inconsistentCount}; +} + + + +// Count the number of (consistent,inconsistent) PhasingTableEntries +// for the bubble at a given bubble chain position based on the phases currently assigned +// to bubbles and oriented reads. 
+pair<uint64_t, uint64_t> PhasingTable::countConsistentEntriesForBubble(uint64_t positionInBubbleChain) const +{ + uint64_t consistentCount = 0; + uint64_t inconsistentCount = 0; + + for(auto it=indexByPositionInBubbleChain().find(positionInBubbleChain); + it!=indexByPositionInBubbleChain().end() and it->positionInBubbleChain == positionInBubbleChain; ++it) { + const PhasingTableEntry& entry = *it; + + const int64_t s = consistencyState(entry); + switch(s) { + case +1: + ++consistentCount; + break; + case -1: + ++inconsistentCount; + break; + case 0: + break; + default: + SHASTA_ASSERT(0); + } + } + + return {consistentCount, inconsistentCount}; + +} + + + +pair<uint64_t, uint64_t> PhasingTable::countEntriesForBubble(uint64_t positionInBubbleChain) const +{ + uint64_t unambiguous = 0; + uint64_t ambiguous = 0; + + for(auto it=indexByPositionInBubbleChain().find(positionInBubbleChain); + it!=indexByPositionInBubbleChain().end() and it->positionInBubbleChain == positionInBubbleChain; ++it) { + const PhasingTableEntry& entry = *it; + + if(entry.discreteRelativePhase == 0) { + ++ambiguous; + } else { + ++unambiguous; + } + } + + return {unambiguous, ambiguous}; + +} + + +// Count the number of (consistent,inconsistent) PhasingTableEntries +// based on the phases currently assigned +// to bubbles and oriented reads. +pair<uint64_t, uint64_t> PhasingTable::countConsistentEntries() const +{ + uint64_t consistentCount = 0; + uint64_t inconsistentCount = 0; + + for(const PhasingTableEntry& entry: indexByBoth()) { + + const int64_t s = consistencyState(entry); + switch(s) { + case +1: + ++consistentCount; + break; + case -1: + ++inconsistentCount; + break; + case 0: + break; + default: + SHASTA_ASSERT(0); + } + } + + return {consistentCount, inconsistentCount}; + +} + + + +// Iteratively optimize the phases of the oriented reads and of the bubbles. +// Experimental. Do not use. 
+void PhasingTable::simpleIterativePhasing1() +{ + // Start with the phases of all oriented reads and bubbles set to +1. + for(OrientedRead& orientedRead: orientedReads) { + orientedRead.phase = +1; + } + for(Bubble& bubble: bubbles) { + bubble.phase = +1; + } + + + // Iteration loop. + uint64_t consistentCount; + uint64_t inconsistentCount; + tie(consistentCount, inconsistentCount) = countConsistentEntries(); + const uint64_t unassignedCount = size() - (consistentCount + inconsistentCount); + uint64_t oldInconsistentCount = inconsistentCount; + cout << "Initial consistency statistics: consistent " << consistentCount << + ", inconsistent " << inconsistentCount << + ", unassigned " << unassignedCount << endl; + for(uint64_t iteration=0; ; iteration++) { + + // Set the oriented read phases based on the current bubble phases. + for(OrientedRead& orientedRead: orientedReads) { + + // Count the number of consistent/inconsistent PhasingTableEntries + // for this bubble. + tie(consistentCount, inconsistentCount) = + countConsistentEntriesForOrientedRead(orientedRead.id); + + // Set the phase of this oriented read accordingly. + if(consistentCount >= inconsistentCount) { + // Do nothing. + } else { + // Flip it. + orientedRead.phase = - orientedRead.phase; + } + } + + // Set the bubble phases based on the current oriented read phases. + for(Bubble& bubble: bubbles) { + + // Count the number of consistent/inconsistent PhasingTableEntries + // for this bubble. + tie(consistentCount, inconsistentCount) = + countConsistentEntriesForBubble(bubble.positionInBubbleChain); + + const double consistentFraction = double(consistentCount) / double(consistentCount + inconsistentCount); + + // Set the phase of this bubble accordingly. + if(consistentFraction > 0.2) { + // Do nothing. + } else { + // Flip it. 
+ bubble.phase = - bubble.phase; + } + } + + tie(consistentCount, inconsistentCount) = countConsistentEntries(); + const uint64_t unassignedCount = size() - (consistentCount + inconsistentCount); + cout << "Consistency statistics after phasing iteration " << iteration << + ": consistent " << consistentCount << + ", inconsistent " << inconsistentCount << + ", unassigned " << unassignedCount << endl; + SHASTA_ASSERT(inconsistentCount <= oldInconsistentCount); + if(inconsistentCount == oldInconsistentCount) { + break; + } + oldInconsistentCount = inconsistentCount; + } +} + + + +// Iteratively optimize the phases of the oriented reads and of the bubbles. +// Experimental. Do not use. +void PhasingTable::simpleIterativePhasing2() +{ + // Start with the phases of all oriented reads and bubbles set to +1. + for(OrientedRead& orientedRead: orientedReads) { + orientedRead.phase = +1; + } + for(Bubble& bubble: bubbles) { + bubble.phase = +1; + } + + + // Iteration loop. + uint64_t consistentCount; + uint64_t inconsistentCount; + tie(consistentCount, inconsistentCount) = countConsistentEntries(); + const uint64_t unassignedCount = size() - (consistentCount + inconsistentCount); + cout << "Initial consistency statistics: consistent " << consistentCount << + ", inconsistent " << inconsistentCount << + ", unassigned " << unassignedCount << endl; + vector<uint64_t> consistentBubbles; + vector<uint64_t> inconsistentBubbles; + for(uint64_t iteration=0; iteration<6; iteration++) { + + // Loop over oriented reads. + for(OrientedRead& orientedRead: orientedReads) { + + // Gather the bubbles that have a consistent/inconsistent + // PhasingTableEntry with this oriented read. + // Gather the bubbles where this oriented read appears with phase +1 or -1 + // (with tolerance equal to phaseError). 
+ consistentBubbles.clear(); + inconsistentBubbles.clear(); + for(auto it=indexByOrientedReadId().find(orientedRead.id); + it!=indexByOrientedReadId().end() and it->orientedReadId == orientedRead.id; ++it) { + const PhasingTableEntry& phasingTableEntry = *it; + const int64_t s = consistencyState(phasingTableEntry); + + if(s == +1) { + consistentBubbles.push_back(phasingTableEntry.bubbleIndex); + } else if(s == -1) { + inconsistentBubbles.push_back(phasingTableEntry.bubbleIndex); + } + } + + // If there are more consistentBubbles than inconsistentBubbles, flip the minusBubbles. + // If there are more inconsistentBubbles than consistentBubbles, flip the plusBubbles. + if(consistentBubbles.size() == inconsistentBubbles.size()) { + continue; + } + const vector<uint64_t>& bubblesToFlip = + (consistentBubbles.size() > inconsistentBubbles.size()) ? inconsistentBubbles : consistentBubbles; + for(const uint64_t bubbleIndex: bubblesToFlip) { + Bubble& bubble = bubbles[bubbleIndex]; + bubble.phase = -bubble.phase; + } + if(inconsistentBubbles.size() > consistentBubbles.size()) { + orientedRead.phase = - orientedRead.phase; + } + } + + tie(consistentCount, inconsistentCount) = countConsistentEntries(); + const uint64_t unassignedCount = size() - (consistentCount + inconsistentCount); + cout << "Consistency statistics after phasing iteration " << iteration << + ": consistent " << consistentCount << + ", inconsistent " << inconsistentCount << + ", unassigned " << unassignedCount << endl; + } +} + + + +void PhasingTable::greedyPhasing() +{ + const bool debug = false; + + class OrientedReadInfo { + public: + + // Index of this oriented read in the orientedReads vector. + uint64_t orientedReadIndex; + + // The total number of unambiguous PhasingTableEntries for this oriented read. + uint64_t unambiguousBubbleCount = 0; + + // The number of bubbles that have already been phased and that have an + // unambiguous PhasingTableEntry with this oriented read. 
+ uint64_t phasedUnambiguousBubbleCount = 0; + + OrientedReadInfo(uint64_t orientedReadIndex) : + orientedReadIndex(orientedReadIndex) {} + }; + + // The OrientedReadTable is a container of OrientedReadInfo + // used to keep track of unphased oriented reads by various criteria. + class OrientedReadTable : public boost::multi_index_container<OrientedReadInfo, + boost::multi_index::indexed_by < + + // Index by orientedReadIndex (unique). + boost::multi_index::ordered_unique<boost::multi_index::member< + OrientedReadInfo, + uint64_t, + &OrientedReadInfo::orientedReadIndex> >, + + // Index by unambiguousBubbleCount (non-unique, largest first). + boost::multi_index::ordered_non_unique<boost::multi_index::member< + OrientedReadInfo, + uint64_t, + &OrientedReadInfo::unambiguousBubbleCount>, + std::greater<uint64_t> >, + + // Index by phasedUnambiguousBubbleCount (non-unique, largest first). + boost::multi_index::ordered_non_unique<boost::multi_index::member< + OrientedReadInfo, + uint64_t, + &OrientedReadInfo::phasedUnambiguousBubbleCount>, + std::greater<uint64_t> > + > > { + }; + OrientedReadTable orientedReadTable; + + + + // Initialize the OrientedReadTable. + for(uint64_t orientedReadIndex=0; orientedReadIndex<orientedReadCount(); orientedReadIndex++) { + const OrientedReadId orientedReadId = orientedReads[orientedReadIndex].id; + + OrientedReadInfo orientedReadInfo(orientedReadIndex); + for(auto it=indexByOrientedReadId().find(orientedReadId); + it!=indexByOrientedReadId().end() and it->orientedReadId == orientedReadId; ++it) { + const PhasingTableEntry& phasingTableEntry = *it; + if(phasingTableEntry.discreteRelativePhase != 0) { + ++orientedReadInfo.unambiguousBubbleCount; + } + } + orientedReadTable.insert(orientedReadInfo); + } + + + // Initialize the phases and phasing components of all oriented reads and bubbles. 
+ for(OrientedRead& orientedRead: orientedReads) { + orientedRead.phase = 0; + orientedRead.phasingComponent = invalid<uint64_t>; + } + for(Bubble& bubble: bubbles) { + bubble.phase = 0; + bubble.phasingComponent = invalid<uint64_t>; + } + + + + // Outer loop is over phasing components. + for(uint64_t phasingComponent=0; ; phasingComponent++) { + if(orientedReadTable.empty()) { + break; + } + + // Find the starting oriented read for this phasing component. + const auto it = orientedReadTable.get<1>().begin(); + const OrientedReadInfo& orientedReadInfo = *it; + OrientedRead& orientedRead = orientedReads[orientedReadInfo.orientedReadIndex]; + + const uint64_t minPositionInBubbleChain = orientedRead.minPositionInBubbleChain; + const uint64_t maxPositionInBubbleChain = orientedRead.maxPositionInBubbleChain; + const uint64_t minBubbleIndex = bubblesMap[minPositionInBubbleChain]; + const uint64_t maxBubbleIndex = bubblesMap[maxPositionInBubbleChain]; + + if(debug) { + cout << "Begin phasing component " << phasingComponent << endl; + cout << "Phasing group begins at " << orientedRead.id << + ", index " << orientedReadInfo.orientedReadIndex << + " with " << orientedReadInfo.unambiguousBubbleCount << " unambiguous bubbles." << endl; + cout << "Bubble index range for this oriented read is [" << + minBubbleIndex << "," << maxBubbleIndex << "]." << endl; + } + + if(orientedReadInfo.unambiguousBubbleCount == 0) { + break; + } + + // Assign phase +1 in this phasing group to this starting read for this phasing component. + SHASTA_ASSERT(orientedRead.phase == 0); + SHASTA_ASSERT(orientedRead.phasingComponent == invalid<uint64_t>); + orientedRead.phase = +1; + orientedRead.phasingComponent = phasingComponent; + + // Assign to all unambiguous bubbles of this oriented read a phase consistent with it. 
+ for(auto it=indexByOrientedReadId().find(orientedRead.id); + it!=indexByOrientedReadId().end() and it->orientedReadId == orientedRead.id; ++it) { + const PhasingTableEntry& phasingTableEntry = *it; + Bubble& bubble = bubbles[phasingTableEntry.bubbleIndex]; + SHASTA_ASSERT(bubble.phase == 0); + SHASTA_ASSERT(bubble.phasingComponent == invalid<uint64_t>); + + // Skip it if it is ambiguous. + if(phasingTableEntry.discreteRelativePhase == 0) { + continue; + } + + // Set the phase of this bubble to a phase consistent with the +1 phase + // we assigned to the starting oriented read. + bubble.phase = phasingTableEntry.discreteRelativePhase; + bubble.phasingComponent = phasingComponent; + + // Update the OrientedReadTable to reflect the fact that this bubble was just phased. + for(auto it=indexByPositionInBubbleChain().find(bubble.positionInBubbleChain); + it!=indexByPositionInBubbleChain().end() and it->positionInBubbleChain == bubble.positionInBubbleChain; ++it) { + const PhasingTableEntry& phasingTableEntry = *it; + if(phasingTableEntry.discreteRelativePhase == 0) { + continue; + } + + auto jt = orientedReadTable.get<0>().find(phasingTableEntry.orientedReadIndex); + SHASTA_ASSERT(jt != orientedReadTable.get<0>().end()); + OrientedReadInfo info = *jt; + info.phasedUnambiguousBubbleCount++; + orientedReadTable.get<0>().replace(jt, info); + } + } + + // Remove the starting oriented read from the orientedReadTable. + orientedReadTable.get<1>().erase(it); + + + + // The inner loop phases one oriented read at a time, adding it to the current + // phasing component. + while(not orientedReadTable.empty()) { + + // Find the oriented read with the most phased bubbles. 
+ const auto it = orientedReadTable.get<2>().begin(); + const OrientedReadInfo& orientedReadInfo = *it; + OrientedRead& orientedRead = orientedReads[orientedReadInfo.orientedReadIndex]; + + const uint64_t minPositionInBubbleChain = orientedRead.minPositionInBubbleChain; + const uint64_t maxPositionInBubbleChain = orientedRead.maxPositionInBubbleChain; + const uint64_t minBubbleIndex = bubblesMap[minPositionInBubbleChain]; + const uint64_t maxBubbleIndex = bubblesMap[maxPositionInBubbleChain]; + + if(orientedReadInfo.phasedUnambiguousBubbleCount == 0) { + // Finish this phasing component. + break; + } + + if(debug) { + cout << "Adding to phasing group " << orientedRead.id << + ", index " << orientedReadInfo.orientedReadIndex << + " with " << orientedReadInfo.unambiguousBubbleCount << " unambiguous bubbles," << + " of which " << orientedReadInfo.phasedUnambiguousBubbleCount << " already phased ." << endl; + cout << "Bubble index range for this oriented read is [" << + minBubbleIndex << "," << maxBubbleIndex << "]." << endl; + } + + // Use the bubbles that are already phased to assign a phase to this oriented read. + uint64_t plusCount = 0; + uint64_t minusCount = 0; + for(auto it=indexByOrientedReadId().find(orientedRead.id); + it!=indexByOrientedReadId().end() and it->orientedReadId == orientedRead.id; ++it) { + const PhasingTableEntry& phasingTableEntry = *it; + if(phasingTableEntry.discreteRelativePhase == 0) { + continue; + } + Bubble& bubble = bubbles[phasingTableEntry.bubbleIndex]; + if(bubble.phase == 0) { + continue; + } + int64_t phase; + if(phasingTableEntry.discreteRelativePhase == +1) { + phase = bubble.phase; + } else { + phase = - bubble.phase; + } + if(phase == +1) { + ++plusCount; + } else if(phase == -1) { + ++minusCount; + } + } + + SHASTA_ASSERT(plusCount + minusCount == orientedReadInfo.phasedUnambiguousBubbleCount); + + // Phase this oriented read in this phasing component. 
+ SHASTA_ASSERT(orientedRead.phase == 0); + SHASTA_ASSERT(orientedRead.phasingComponent == invalid<uint64_t>); + orientedRead.phase = (plusCount >= minusCount) ? +1 : -1; + orientedRead.phasingComponent = phasingComponent; + + // Assign to all unambiguous bubbles of this oriented read + // that are not already phased a phase consistent with it. + for(auto it=indexByOrientedReadId().find(orientedRead.id); + it!=indexByOrientedReadId().end() and it->orientedReadId == orientedRead.id; ++it) { + const PhasingTableEntry& phasingTableEntry = *it; + + // Skip it if it is ambiguous. + if(phasingTableEntry.discreteRelativePhase == 0) { + continue; + } + Bubble& bubble = bubbles[phasingTableEntry.bubbleIndex]; + + // If already phased, skip it. + if(bubble.phase != 0) { + continue; + } + + // Phase this bubble to a phase consistent with this oriented read. + bubble.phase = (phasingTableEntry.discreteRelativePhase == +1) ? orientedRead.phase : -orientedRead.phase; + bubble.phasingComponent = phasingComponent; + + // Update the OrientedReadTable to reflect the fact that this bubble was just phased. + for(auto it=indexByPositionInBubbleChain().find(bubble.positionInBubbleChain); + it!=indexByPositionInBubbleChain().end() and it->positionInBubbleChain == bubble.positionInBubbleChain; ++it) { + const PhasingTableEntry& phasingTableEntry = *it; + if(phasingTableEntry.discreteRelativePhase == 0) { + continue; + } + + auto jt = orientedReadTable.get<0>().find(phasingTableEntry.orientedReadIndex); + if(jt == orientedReadTable.get<0>().end()) { + continue; + } + OrientedReadInfo info = *jt; + info.phasedUnambiguousBubbleCount++; + orientedReadTable.get<0>().replace(jt, info); + } + } + + // Remove the oriented read from the orientedReadTable. 
+ orientedReadTable.get<2>().erase(it); + } + } +} + + + +double PhasingTable::bubbleErrorRate(uint64_t positionInBubbleChain) const +{ + // Must be called for a diploid bubble + auto it = bubblesMap.find(positionInBubbleChain); + SHASTA_ASSERT(it != bubblesMap.end()); + const Bubble& bubble = bubbles[it->second]; + + if(bubble.phase == 0) { + return 1.; + } + + // This bubble is diploid and phased. + uint64_t consistent; + uint64_t inconsistent; + tie(consistent, inconsistent) = countConsistentEntriesForBubble(positionInBubbleChain); + return double(inconsistent) / double(consistent+ inconsistent); +} + + + +// Use the phases stored in the Bubbles to consruct the PhasedComponents. +// The PhasedComponents must be non-overlapping and sorted by position. +void PhasingTable::constructPhasedComponents(bool debug) +{ + phasedComponents.clear(); + + // Create an initial version of PhasedComponents without + // worrying about ordering by position and about overlap between PhasedComponents. + for(const Bubble& bubble: bubbles) { + if(bubble.phase == 0) { + continue; + } + const uint64_t phasedComponentId = bubble.phasingComponent; + if(phasedComponentId >= phasedComponents.size()) { + for(uint64_t i=phasedComponents.size(); i<=phasedComponentId; i++) { + phasedComponents.push_back(make_shared<PhasedComponent>()); + } + } + phasedComponents[phasedComponentId]->push_back({bubble.positionInBubbleChain, bubble.phase}); + } + + if(debug) { + uint64_t totalPhasedBubbleCount = 0; + for(const auto& phasedComponent: phasedComponents) { + totalPhasedBubbleCount += phasedComponent->size(); + } + cout << "Created " << phasedComponents.size() << " initial phased components " + "with a total " << totalPhasedBubbleCount << " phased diploid bubbles." << endl; + } + + + + // If there is more than one PhasedComponent, we have to eliminate overlaps. + // We do this by removing bubbles from overlapping PhasedComponents, giving + // priority to larger PhasedComponents. 
+ if(phasedComponents.size() > 1) { + + if(debug) { + cout << "More than one phased components found. Removing overlaps." << endl; + } + + // Sort the phased components by decreasing size. + class SortHelper { + public: + bool operator()( + const shared_ptr<PhasedComponent>& p0, + const shared_ptr<PhasedComponent>& p1 + ) const + { + return p0->size() > p1->size(); + } + }; + sort(phasedComponents.begin(), phasedComponents.end(), SortHelper()); + + for(const auto& phasedComponent: phasedComponents) { + phasedComponent->computePositionRange(); + } + + // Process the PhasedComponents in order of decreasing size. + vector< pair<uint64_t, uint64_t> > forbiddenRanges; // (min, max) + for(auto& phasedComponent: phasedComponents) { + + // See if it overlaps any of the forbidden ranges. + bool overlaps = false; + for(const auto& forbiddenRange: forbiddenRanges) { + const bool disjointLeft = phasedComponent->maxPositionInBubbleChain < forbiddenRange.first; + const bool disjointRight = phasedComponent->minPositionInBubbleChain > forbiddenRange.second; + if(not(disjointLeft or disjointRight)) { + overlaps = true; + break; + } + } + + if(debug) { + cout << "Phased component at " << phasedComponent->minPositionInBubbleChain << " " << + phasedComponent->maxPositionInBubbleChain; + if(overlaps) { + cout << " overlaps a previous phased component." << endl; + } else { + cout << " has no overlaps with previous phased components." << endl; + } + } + + if(not overlaps) { + forbiddenRanges.push_back( + {phasedComponent->minPositionInBubbleChain, phasedComponent->maxPositionInBubbleChain}); + continue; + } + + + + // This PhasedComponent overlaps a forbiddenRange. + // We need to remove the offending bubbles. + shared_ptr<PhasedComponent> newPhasedComponent = make_shared<PhasedComponent>(); + for(const auto& p: *phasedComponent) { + const uint64_t positionInBubbleChain = p.first; + + // See if this bubble overlaps any forbidden ranges. 
+ bool overlaps = false; + for(const auto& forbiddenRange: forbiddenRanges) { + if( positionInBubbleChain >= forbiddenRange.first and + positionInBubbleChain <= forbiddenRange.second) { + overlaps = true; + break; + } + } + + // Only keep it if there is no overlap. + if(not overlaps) { + newPhasedComponent->push_back(p); + } + + } + + // Replace this phased component with the new one. + phasedComponent = newPhasedComponent; + phasedComponent->computePositionRange(); + forbiddenRanges.push_back({phasedComponent->minPositionInBubbleChain, phasedComponent->maxPositionInBubbleChain}); + + if(debug) { + cout << "After removing overlap, this phased component has " << phasedComponent->size() << + " diploid bubbles and position range " << phasedComponent->minPositionInBubbleChain << " " << + phasedComponent->maxPositionInBubbleChain << endl; + } + } + } + + + + // This could have created empty PhasedComponents. + // Remove them if they are present. + { + vector< shared_ptr<PhasedComponent> > nonEmptyPhasedComponents; + for(const shared_ptr<PhasedComponent>& phasedComponent: phasedComponents) { + if(not phasedComponent->empty()) { + nonEmptyPhasedComponents.push_back(phasedComponent); + } else { + if(debug) { + cout << "Removing empty phased component." << endl; + } + } + } + if(nonEmptyPhasedComponents.size() != phasedComponents.size()) { + phasedComponents.swap(nonEmptyPhasedComponents); + } + } + + + + // Compute the position ranges. + for(const auto& phasedComponent: phasedComponents) { + phasedComponent->computePositionRange(); + } + + // Sort the phased components in order of increasing position. 
+ class SortHelper { + public: + bool operator()( + const shared_ptr<PhasedComponent>& p0, + const shared_ptr<PhasedComponent>& p1 + ) const + { + return p0->minPositionInBubbleChain < p1->minPositionInBubbleChain; + } + }; + sort(phasedComponents.begin(), phasedComponents.end(), SortHelper()); + + if(debug) { + cout << phasedComponents.size() << " phased components:" << endl; + for(const auto& phasedComponent: phasedComponents) { + cout << phasedComponent->size() << " diploid bubbles at positions " << + phasedComponent->minPositionInBubbleChain << "..." << + phasedComponent->maxPositionInBubbleChain << " in bubble chain." << endl; + + } + // phasingGraph.writeGraphviz("PhasingGraph.dot"); + } +} diff --git a/src/mode3-PhasingTable.hpp b/src/mode3-PhasingTable.hpp new file mode 100644 index 0000000..8f7d61a --- /dev/null +++ b/src/mode3-PhasingTable.hpp @@ -0,0 +1,250 @@ +#pragma once + +// Shasta. +#include "invalid.hpp" +#include "ReadId.hpp" + +// Boost libraries. +#include <boost/multi_index_container.hpp> +#include <boost/multi_index/ordered_index.hpp> +#include <boost/multi_index/member.hpp> +#include <boost/multi_index/composite_key.hpp> + +// Standard libraries. +#include "array.hpp" +#include <map> +#include <cmath> +#include "utility.hpp" +#include "vector.hpp" + +namespace shasta { + namespace mode3 { + class PhasingComponent; + class PhasingTable; + class PhasingTableEntry; + + class BubbleChain; + } + class MarkerGraph; +} + + + +// A PhasingTableEntry describes the appearances of one oriented read +// on one or both sides of a diploid Bubble of a BubbleChain. +// The frequency array contains the number of times the oriented read +// appears on non-terminal marker graph edges of the two Chains of the diploid Bubble. 
+class shasta::mode3::PhasingTableEntry { +public: + + PhasingTableEntry( + OrientedReadId orientedReadId, + uint64_t positionInBubbleChain) : + orientedReadId(orientedReadId), + positionInBubbleChain(positionInBubbleChain) + {} + + // The OrientedReadId this PhasingTableEntry refers to, + // and its index in the PhasingTable::orientedReads vector. + OrientedReadId orientedReadId; + uint64_t orientedReadIndex = invalid<uint64_t>; + + // The position in the bubble chain of the diploid bubble + // this PhasingTableEntry refers to, + // and its index in the PhasingTable::orientedReads vector. + uint64_t positionInBubbleChain; + uint64_t bubbleIndex = invalid<uint64_t>; + + // The number of times this oriented read + // appears on non-terminal marker graph edges of the two Chains of the diploid Bubble. + // The two entries in the array corresponds to the two chains of the diploid Bubble. + array<uint64_t, 2> frequency = {0, 0}; + + // The phase of this oriented read relative to this bubble + // is computed from the frequency array. + + // The relative phase varies continuously between -1 and 1 and is: + // * +1 if this oriented read always appears in Chain 0 (that is, frequency[1] is 0). + // * -1 if this oriented read always appears in Chain 1 (that is, frequency[0] is 0). + // * 0 if this oriented appears with equal frequency on Chain 0 and Chain 1 + // (that is, frequency[0] = frequency[1]). + double relativePhase = invalid<double>; + + // The discrete relative phase can be: + // +1 if relativePhase > +1. - phaseErrorThreshold. + // -1 if relativePhase < -1. + phaseErrorThreshold. + // 0 otherwise. + int64_t discreteRelativePhase = invalid<int64_t>; + + // Compute and store the relativePhase and discreteRelativePhase. + void storeRelativePhase(double phaseErrorThreshold) + { + relativePhase = 2. * double(frequency[0]) / double(frequency[0] + frequency[1]) - 1.; + if(relativePhase > 1. 
- phaseErrorThreshold) { + discreteRelativePhase = +1; + } else if(relativePhase < -1. + phaseErrorThreshold) { + discreteRelativePhase = -1; + } else { + discreteRelativePhase = 0; + } + } + + void writeCsv(ostream&) const; +}; + + + +// A PhasingTable is a set of PhasingTableEntry objects, +// randomly accessible by orientedReadId and by positionInBubbleChain. +class shasta::mode3::PhasingTable: public boost::multi_index_container<PhasingTableEntry, + boost::multi_index::indexed_by < + + // Index by (orientedReadId, positionInBubbleChain) (unique). + boost::multi_index::ordered_unique< + boost::multi_index::composite_key< + PhasingTableEntry, + boost::multi_index::member<PhasingTableEntry, OrientedReadId ,&PhasingTableEntry::orientedReadId>, + boost::multi_index::member<PhasingTableEntry, uint64_t, &PhasingTableEntry::positionInBubbleChain> + > >, + + // Index by orientedReadId (non-unique). + boost::multi_index::ordered_non_unique<boost::multi_index::member< + PhasingTableEntry, + OrientedReadId, + &PhasingTableEntry::orientedReadId> >, + + // Index by positionInBubbleChain (non-unique). + boost::multi_index::ordered_non_unique<boost::multi_index::member< + PhasingTableEntry, + uint64_t, + &PhasingTableEntry::positionInBubbleChain> > + > > { +public: + + PhasingTable( + const BubbleChain&, + const MarkerGraph&, + double phaseErrorThreshold); + + uint64_t entryCount() const + { + return size(); + } + uint64_t unambiguousEntryCount() const; + uint64_t ambiguousEntryCount() const; + + uint64_t bubbleCount() const + { + return bubbles.size(); + } + + uint64_t orientedReadCount() const + { + return orientedReads.size(); + } + + // Experimental. Do not use. + void simpleIterativePhasing1(); + void simpleIterativePhasing2(); + + // Optimize the phases of the oriented reads and of the bubbles. 
+ void greedyPhasing(); + + void writeCsv(const string& fileNamePrefix) const; + enum class ColoringMethod { + byRelativePhase, + byDiscreteRelativePhase, + byConsistency + }; + void writePng(const string& fileName, ColoringMethod) const; + + double bubbleErrorRate(uint64_t positionInBubbleChain) const; + + vector< shared_ptr<PhasedComponent> > phasedComponents; + void constructPhasedComponents(bool debug); + +private: + const auto& indexByBoth() const {return get<0>();} + const auto& indexByOrientedReadId() const {return get<1>();} + const auto& indexByPositionInBubbleChain() const {return get<2>();} + + void fill( + const BubbleChain&, + const MarkerGraph&, + double phaseErrorThreshold); + + + + // Information about the orientedReads that appears in the PhasingTable. + class OrientedRead { + public: + OrientedReadId id; + uint64_t minPositionInBubbleChain; + uint64_t maxPositionInBubbleChain; + int64_t phase = 0; // -1, 0 or +1 + uint64_t phasingComponent = invalid<uint64_t>; + }; + void gatherOrientedReads(); + vector<OrientedRead> orientedReads; + + // Map OrientedReadId to an index in the orientedReadInfos vector. + std::map<OrientedReadId, uint64_t> orientedReadsMap; + + + + // Information about the diploid bubbles in this PhasingTable. + class Bubble { + public: + uint64_t positionInBubbleChain; + int64_t phase = 0; // -1, 0 or +1 + uint64_t phasingComponent = invalid<uint64_t>; + }; + vector<Bubble> bubbles; + void gatherBubbles(); + + // Map a positionInBubbleChain to an index in the bubbles vector. +public: + std::map<uint64_t, uint64_t> bubblesMap; +private: + + + + // Fill the orientedReadIndex and bubbleIndex in all PhasingTableEntries. + // This can only be done after gatherOrientedReads and gatherBubbles + // have been called. + void fillIndexes(); + + // Compute the consistency state of a PhasingTableEntry relative + // to the current phases of its oriented read and bubble. 
+ // It can be +1 (consistent), -1 (inconsistent), or 0 (unassigned or ambiguous). + // See the implementation for details. + int64_t consistencyState(const PhasingTableEntry&) const; + + // Count the number of (consistent,inconsistent) PhasingTableEntries + // for an oriented read based on the phases currently assigned + // to bubbles and oriented reads. + pair<uint64_t, uint64_t> countConsistentEntriesForOrientedRead(OrientedReadId) const; + + // Count the number of (consistent,inconsistent) PhasingTableEntries + // for the bubble at a given bubble chain position based on the phases currently assigned + // to bubbles and oriented reads. + pair<uint64_t, uint64_t> countConsistentEntriesForBubble(uint64_t positionInBubbleChain) const; + + // Count the number of (unambiguous, ambiguous) PhasingTableEntries + // for the bubble at a given bubble chain position based on the phases currently assigned + // to bubbles and oriented reads. + pair<uint64_t, uint64_t> countEntriesForBubble(uint64_t positionInBubbleChain) const; + +public: + // Count the number of (consistent,inconsistent) PhasingTableEntries + // based on the phases currently assigned + // to bubbles and oriented reads. + pair<uint64_t, uint64_t> countConsistentEntries() const; + +private: + void writeOrientedReadsCsv(const string& fileNamePrefix) const; + void writeBubblesCsv(const string& fileNamePrefix, bool writePhasingInformation) const; + void writeDetailsCsv(const string& fileNamePrefix) const; +}; + + diff --git a/src/mode3-PrimaryGraph.cpp b/src/mode3-PrimaryGraph.cpp new file mode 100644 index 0000000..2988636 --- /dev/null +++ b/src/mode3-PrimaryGraph.cpp @@ -0,0 +1,548 @@ +// Shasta. 
+#include "mode3-PrimaryGraph.hpp" +#include "Assembler.hpp" +#include "deduplicate.hpp" +#include "longestPath.hpp" +#include "MarkerGraph.hpp" +#include "MurmurHash2.hpp" +#include "orderPairs.hpp" +#include "performanceLog.hpp" +#include "timestamp.hpp" +using namespace shasta; +using namespace mode3; + +// Boost libraries. +#include <boost/graph/iteration_macros.hpp> +#include <boost/multi_index_container.hpp> +#include <boost/multi_index/ordered_index.hpp> +#include <boost/multi_index/member.hpp> +#include <boost/pending/disjoint_sets.hpp> + +// Standard library. +#include "fstream.hpp" +#include <queue> + + + +PrimaryGraph::vertex_descriptor PrimaryGraph::addVertex(MarkerGraphEdgeId edgeId) +{ + SHASTA_ASSERT(not vertexMap.contains(edgeId)); + const vertex_descriptor v = add_vertex({edgeId}, *this); + vertexMap.insert({edgeId, v}); + return v; +} + + + +void PrimaryGraph::addEdgeFromVertexDescriptors( + vertex_descriptor v0, + vertex_descriptor v1, + const MarkerGraphEdgePairInfo& info, + uint64_t coverage) +{ + add_edge(v0, v1, {info, coverage}, *this); +} + + + +void PrimaryGraph::addEdge( + MarkerGraphEdgeId edgeId0, + MarkerGraphEdgeId edgeId1, + const MarkerGraphEdgePairInfo& info, + uint64_t coverage) +{ + auto it0 = vertexMap.find(edgeId0); + auto it1 = vertexMap.find(edgeId1); + SHASTA_ASSERT(it0 != vertexMap.end()); + SHASTA_ASSERT(it1 != vertexMap.end()); + const vertex_descriptor v0 = it0->second; + const vertex_descriptor v1 = it1->second; + + addEdgeFromVertexDescriptors(v0, v1, info, coverage); +} + + + +// Write a PrimaryGraph in graphviz format. 
+void PrimaryGraph::writeGraphviz( + const string& name, + const PrimaryGraphDisplayOptions& options, + const MarkerGraph& markerGraph) const +{ + ofstream out(name + ".dot"); + + const PrimaryGraph& graph = *this; + out << "digraph " << name << " {\n"; + + BGL_FORALL_VERTICES(v, graph, PrimaryGraph) { + const PrimaryGraphVertex& vertex = graph[v]; + out << vertex.edgeId; + + if(options.labels or options.tooltips or options.colorVertices) { + out << "["; + } + + if(options.labels) { + out << "label=\""; + out << vertex.edgeId << "\\n" << markerGraph.edgeCoverage(vertex.edgeId); + out << "\" "; + } + + if(options.tooltips) { + out << "tooltip=\""; + out << vertex.edgeId; + out << "\" "; + } + + if(options.labels or options.tooltips or options.colorVertices) { + out << "]"; + } + out << ";\n"; + } + + + + BGL_FORALL_EDGES(e, graph, PrimaryGraph) { + const PrimaryGraphEdge& edge = graph[e]; + if(not options.showNonTransitiveReductionEdges and edge.isNonTransitiveReductionEdge) { + continue; + } + const vertex_descriptor v0 = source(e, graph); + const vertex_descriptor v1 = target(e, graph); + + out << + graph[v0].edgeId << "->" << + graph[v1].edgeId; + + if(edge.isNonTransitiveReductionEdge or options.labels or options.tooltips or options.colorEdges) { + out << " ["; + } + + if(edge.isNonTransitiveReductionEdge) { + out << "style=dashed "; + } + + if(options.tooltips) { + out << + "tooltip=\"" << + graph[v0].edgeId << "->" << + graph[v1].edgeId << " "; + if(edge.coverage != invalid<uint64_t>) { + out << edge.coverage << "/"; + } + out << + edge.info.common << " " << + std::fixed << std::setprecision(2) << edge.info.correctedJaccard() << " " << + edge.info.offsetInBases << "\" "; + } + + if(options.labels) { + out << + "label=\""; + if(edge.coverage != invalid<uint64_t>) { + out << edge.coverage << "/"; + } + out << + edge.info.common << "\\n" << + std::fixed << std::setprecision(2) << edge.info.correctedJaccard() << "\\n" << + edge.info.offsetInBases << "\" "; + + } + 
+ // Color. + if(options.colorEdges) { + const double correctedJaccard = edge.info.correctedJaccard(); + if(correctedJaccard <= options.redJ) { + out << " color=red "; + } else if(correctedJaccard >= options.greenJ) { + out << " color=green "; + } else { + const double hue = (correctedJaccard - options.redJ) / (3. * (options.greenJ - options.redJ)); + out << " color=\"" << hue << ",1,1\" "; + } + } + + if(edge.isNonTransitiveReductionEdge or options.labels or options.tooltips or options.colorEdges) { + out << "]"; + } + out << ";\n"; + } + + out << "}\n"; +} + + + +void PrimaryGraph::writeEdgeCoverageHistogram(const string& fileName) const +{ + const PrimaryGraph& primaryGraph = *this; + + // Create a histogram indexed by histogram[coverage][commonCount]. + vector< vector<uint64_t> > histogram; + + // Loop over all edges. + BGL_FORALL_EDGES(e, primaryGraph, PrimaryGraph) { + const PrimaryGraphEdge& edge = primaryGraph[e]; + const uint64_t coverage = edge.coverage; + const uint64_t commonCount = edge.info.common; + SHASTA_ASSERT(coverage <= commonCount); + + // Increment the histogram, making space as necessary. + if(coverage >= histogram.size()) { + histogram.resize(coverage + 1); + } + vector<uint64_t>& h = histogram[coverage]; + if(commonCount >= h.size()) { + h.resize(commonCount + 1, 0); + } + ++h[commonCount]; + } + + // Write out the histogram. + ofstream csv(fileName); + csv << "Coverage,Common count,Loss,Frequency\n"; + for(uint64_t coverage=0; coverage<histogram.size(); coverage++) { + const vector<uint64_t>& h = histogram[coverage]; + for(uint64_t commonCount=0; commonCount<h.size(); commonCount++) { + const uint64_t frequency = h[commonCount]; + + if(frequency > 0) { + const uint64_t loss = commonCount - coverage; + csv << coverage << ","; + csv << commonCount << ","; + csv << loss << ","; + csv << frequency << "\n"; + } + } + } +} + + + +// Create the connected components of this PrimaryGraph, +// without changing the PrimaryGraph itself. 
+vector< shared_ptr<PrimaryGraph> > PrimaryGraph::createConnectedComponents( + uint64_t minComponentSize) const +{ + const PrimaryGraph& graph = *this; + + // Compute connected components. + // We can't use boost::connected_components because it only works + // for undirected graphs. + const uint64_t n = num_vertices(graph); + vector<uint64_t> rank(n); + vector<uint64_t> parent(n); + boost::disjoint_sets<uint64_t*, uint64_t*> disjointSets(&rank[0], &parent[0]); + for(uint64_t vertexId=0; vertexId<n; vertexId++) { + disjointSets.make_set(vertexId); + } + BGL_FORALL_EDGES(e, graph, PrimaryGraph) { + const PrimaryGraph::vertex_descriptor v0 = source(e, graph); + const PrimaryGraph::vertex_descriptor v1 = target(e, graph); + disjointSets.union_set(v0, v1); + } + + + // Gather the vertices in each connected component. + vector< shared_ptr<PrimaryGraph> > allComponentPointers(num_vertices(graph)); + BGL_FORALL_VERTICES(v, graph, PrimaryGraph) { + const PrimaryGraphVertex& vertex = graph[v]; + const uint64_t componentId = disjointSets.find_set(v); + shared_ptr<PrimaryGraph>& componentPointer = allComponentPointers[componentId]; + if(not componentPointer) { + componentPointer = make_shared<PrimaryGraph>(); + } + PrimaryGraph& component = *componentPointer; + component.addVertex(vertex.edgeId); + } + + + // Gather the edges in each connected component. 
+ BGL_FORALL_EDGES(e, graph, PrimaryGraph) { + const PrimaryGraph::vertex_descriptor v0 = source(e, graph); + const PrimaryGraph::vertex_descriptor v1 = target(e, graph); + const uint64_t edgeId0 = graph[v0].edgeId; + const uint64_t edgeId1 = graph[v1].edgeId; + const uint64_t componentId = disjointSets.find_set(v0); + SHASTA_ASSERT(componentId == disjointSets.find_set(v1)); + shared_ptr<PrimaryGraph>& componentPointer = allComponentPointers[componentId]; + SHASTA_ASSERT(componentPointer); + PrimaryGraph& component = *componentPointer; + component.addEdge( + edgeId0, + edgeId1, + graph[e].info, + graph[e].coverage); + } + + + + // Keep only the components with at least minComponentSize vertices + // and sort them by size. + vector< pair<shared_ptr<PrimaryGraph>, uint64_t> > componentPointersWithSizes; + for(const shared_ptr<PrimaryGraph>& p: allComponentPointers) { + if(p) { + const uint64_t componentSize = num_vertices(*p); + if(componentSize >= minComponentSize) { + componentPointersWithSizes.push_back({p, componentSize}); + } + } + } + sort(componentPointersWithSizes.begin(), componentPointersWithSizes.end(), + OrderPairsBySecondOnlyGreater<shared_ptr<PrimaryGraph>, uint64_t>()); + + + // For now return all components, including the empty ones. + // But we want to remove the small ones and sort them by size. + vector< shared_ptr<PrimaryGraph> > componentPointers; + for(const auto& p: componentPointersWithSizes) { + componentPointers.push_back(p.first); + } + return componentPointers; +} + + + +// Remove cross-edges. +// This removes an edge v0->v1 if the following are all true: +// - It is not marked as removed by transitive reduction. +// - Its coverage is at most lowCoverageThreshold. +// - Its estimated offset is at least minOffset. +// - v0 has at least one out-edge with coverage at least highCoverageThreshold +// (ignoring edges marked as removed by transitive reduction). +// - v1 has at least one in-edge with coverage at least highCoverageThreshold. 
+// (ignoring edges marked as removed by transitive reduction). +void PrimaryGraph::removeCrossEdges( + uint64_t lowCoverageThreshold, + uint64_t highCoverageThreshold, + uint64_t minOffset) +{ + PrimaryGraph& graph = *this; + + // Find the edges we are going to remove. + vector<edge_descriptor> edgesToBeRemoved; + BGL_FORALL_EDGES(e, graph, PrimaryGraph) { + const PrimaryGraphEdge& edge = graph[e]; + + // If it is marked as removed by transitive reduction, skip it. + if(edge.isNonTransitiveReductionEdge) { + continue; + } + + // Check coverage. + if(edge.coverage > lowCoverageThreshold) { + continue; + } + + // Check estimated offset. + if(edge.info.offsetInBases < int64_t(minOffset)) { + continue; + } + + // Check out-edges of v0. + const vertex_descriptor v0 = source(e, graph); + bool v0HasStrongOutEdge = false; + BGL_FORALL_OUTEDGES(v0, e0, graph, PrimaryGraph) { + // If it is marked as removed by transitive reduction, ignore it. + if(graph[e0].isNonTransitiveReductionEdge) { + continue; + } + if(graph[e0].coverage >= highCoverageThreshold) { + v0HasStrongOutEdge = true; + break; + } + } + if(not v0HasStrongOutEdge) { + continue; + } + + // Check in-edges of v1. + const vertex_descriptor v1 = target(e, graph); + bool v1HasStrongOutEdge = false; + BGL_FORALL_INEDGES(v1, e1, graph, PrimaryGraph) { + // If it is marked as removed by transitive reduction, ignore it. + if(graph[e1].isNonTransitiveReductionEdge) { + continue; + } + if(graph[e1].coverage >= highCoverageThreshold) { + v1HasStrongOutEdge = true; + break; + } + } + if(not v1HasStrongOutEdge) { + continue; + } + + // If all above checks passed, this edge will be removed. + edgesToBeRemoved.push_back(e); + } + + // Remove the edges we found. 
+ for(const edge_descriptor e: edgesToBeRemoved) { + boost::remove_edge(e, graph); + } +} + + + +// Remove edges for which loss = (commonCount - coverage) / commonCount > maxLoss +void PrimaryGraph::removeWeakEdges(double maxLoss) +{ + PrimaryGraph& graph = *this; + + // Find the edges we are going to remove. + vector<edge_descriptor> edgesToBeRemoved; + BGL_FORALL_EDGES(e, graph, PrimaryGraph) { + const PrimaryGraphEdge& edge = graph[e]; + const double loss = double(edge.info.common - edge.coverage) / double(edge.info.common); + if(loss > maxLoss) { + edgesToBeRemoved.push_back(e); + } + } + + + + // Remove the edges we found. + for(const edge_descriptor e: edgesToBeRemoved) { + boost::remove_edge(e, graph); + } + +} + + +#if 0 +// Given sets of two primary in-edges and two primary out-edges, +// find primary mid-edges in-between that can be used for detangling. +void GlobalPathGraph::searchForDetangling( + const array<MarkerGraphEdgeId, 2>& in, + const array<MarkerGraphEdgeId, 2>& out, + uint64_t highCommonCountThreshold, + uint64_t lowCommonCountThreshold, + const Assembler& assembler, + array<array<vector<MarkerGraphEdgeId>, 2>, 2>& mid) +{ + // Loop over the primary journeys of oriented reads in the "in" primary edges. + // Only use the journey portion following the "in" primary edges. + array<vector<MarkerGraphEdgeId>, 2> inFollowers; + array<vector<uint64_t>, 2> inFollowersCommonCount; + for(uint64_t i=0; i<2; i++) { + assembler.markerGraph.followPrimaryJourneysForward(in[i], inFollowers[i], inFollowersCommonCount[i]); + } + + + + // Find inFollowers that have high common count with in[0] + // and low common count with in[1], or vice versa. 
+ array<vector<MarkerGraphEdgeId>, 2> inCandidates; + { + uint64_t i0 = 0; + uint64_t i1 = 0; + uint64_t end0 = inFollowers[0].size(); + uint64_t end1 = inFollowers[1].size(); + while(i0<end0 and i1<end1) { + const MarkerGraphEdgeId edgeId0 = inFollowers[0][i0]; + const MarkerGraphEdgeId edgeId1 = inFollowers[1][i1]; + + if(edgeId0 < edgeId1) { + // edgeId0 is in inFollowers[0] but not in inFollowers[1]. + if(inFollowersCommonCount[0][i0] >= highCommonCountThreshold) { + inCandidates[0].push_back(edgeId0); + } + ++i0; + } + + else if(edgeId1 < edgeId0) { + // edgeId1 is in inFollowers[1] but not in inFollowers[0]. + if(inFollowersCommonCount[1][i1] >= highCommonCountThreshold) { + inCandidates[1].push_back(edgeId1); + } + ++i1; + } + + else { + // edgeId0 is in inFollowers[0] and in inFollowers[1]. + const uint64_t common0 = inFollowersCommonCount[0][i0]; + const uint64_t common1 = inFollowersCommonCount[1][i1]; + if(common0 >= highCommonCountThreshold and common1 <= lowCommonCountThreshold) { + inCandidates[0].push_back(edgeId0); + } + else if(common1 >= highCommonCountThreshold and common0 <= lowCommonCountThreshold) { + inCandidates[1].push_back(edgeId1); + } + ++i0; + ++i1; + } + } + } + + + + // Loop over the primary journeys of oriented reads in the "out" primary edges. + // Only use the journey portion preceding the "out" primary edges. + array<vector<MarkerGraphEdgeId>, 2> outPreceders; + array<vector<uint64_t>, 2> outPrecedersCommonCount; + for(uint64_t i=0; i<2; i++) { + assembler.markerGraph.followPrimaryJourneysBackward(out[i], outPreceders[i], outPrecedersCommonCount[i]); + } + + + + // Find outPreceders that have high common count with out[0] + // and low common count with out[1], or vice versa. 
+ array<vector<MarkerGraphEdgeId>, 2> outCandidates; + { + uint64_t i0 = 0; + uint64_t i1 = 0; + uint64_t end0 = outPreceders[0].size(); + uint64_t end1 = outPreceders[1].size(); + while(i0<end0 and i1<end1) { + const MarkerGraphEdgeId edgeId0 = outPreceders[0][i0]; + const MarkerGraphEdgeId edgeId1 = outPreceders[1][i1]; + + if(edgeId0 < edgeId1) { + // edgeId0 is in outPreceders[0] but not in outPreceders[1]. + if(outPrecedersCommonCount[0][i0] >= highCommonCountThreshold) { + outCandidates[0].push_back(edgeId0); + } + ++i0; + } + + else if(edgeId1 < edgeId0) { + // edgeId1 is in outPreceders[1] but not in outPreceders[0]. + if(outPrecedersCommonCount[1][i1] >= highCommonCountThreshold) { + outCandidates[1].push_back(edgeId1); + } + ++i1; + } + + else { + // edgeId0 is in outPreceders[0] and in outPreceders[1]. + const uint64_t common0 = outPrecedersCommonCount[0][i0]; + const uint64_t common1 = outPrecedersCommonCount[1][i1]; + if(common0 >= highCommonCountThreshold and common1 <= lowCommonCountThreshold) { + outCandidates[0].push_back(edgeId0); + } + else if(common1 >= highCommonCountThreshold and common0 <= lowCommonCountThreshold) { + outCandidates[1].push_back(edgeId1); + } + ++i0; + ++i1; + } + } + } + + + + // Find MarkerGraphEdgeIds that are both inCandidates and outCandidates. + for(uint64_t i0=0; i0<2; i0++) { + for(uint64_t i1=0; i1<2; i1++) { + mid[i0][i1].clear(); + std::set_intersection( + inCandidates[i0].begin(), inCandidates[i0].end(), + outCandidates[i1].begin(), outCandidates[i1].end(), + back_inserter(mid[i0][i1])); + } + } +} +#endif + diff --git a/src/mode3-PrimaryGraph.hpp b/src/mode3-PrimaryGraph.hpp new file mode 100644 index 0000000..2a10877 --- /dev/null +++ b/src/mode3-PrimaryGraph.hpp @@ -0,0 +1,148 @@ +#pragma once + +/******************************************************************************* + +In a PrimaryGraph, each vertex represents a primary edge of the marker graph. +Edges are generated by following the reads. 
+ +*******************************************************************************/ + +// Shasta. +#include "Base.hpp" +#include "MarkerGraphEdgePairInfo.hpp" +#include "MultithreadedObject.hpp" +#include "ReadId.hpp" +#include "shastaTypes.hpp" + +// Boost libraries. +#include <boost/graph/adjacency_list.hpp> + +// Standard library. +#include "iosfwd.hpp" +#include "memory.hpp" +#include "string.hpp" +#include "utility.hpp" +#include "vector.hpp" + +namespace shasta { + class Assembler; + class MarkerGraph; + namespace mode3 { + + // A connected component of the primary graph, + // in which each vertex represents a primary edge of the marker graph. + // Edges are created by following the reads on their journeys + // over primary marker graph edges. + class PrimaryGraphVertex; + class PrimaryGraphEdge; + class PrimaryGraph; + using PrimaryGraphBaseClass = boost::adjacency_list< + boost::listS, + boost::vecS, + boost::bidirectionalS, + PrimaryGraphVertex, + PrimaryGraphEdge>; + + class PrimaryGraphDisplayOptions; + + } +} + + + +// Class to control Graphviz output of PrimaryGraph. +class shasta::mode3::PrimaryGraphDisplayOptions { +public: + bool labels = true; + bool tooltips = true; + bool colorVertices = true; + bool colorEdges = true; + bool showNonTransitiveReductionEdges = true; + + // Thresholds for coloring by corrected Jaccard similarity J'. + // If J' <= redJ, the edge is drawn red. + // If J' >= greenJ, the edge is drawn green. + // For values in between, the color is interpolated. + double redJ; + double greenJ; + + PrimaryGraphDisplayOptions(double redJ = 0., double greenJ = 1.) : + redJ(redJ), greenJ(greenJ) {} + + void makeCompact() + { + labels = false; + tooltips = false; + colorVertices = false; + colorEdges = false; + } +}; + + + +class shasta::mode3::PrimaryGraphVertex { +public: + + // The corresponding marker graph edgeId. 
+ MarkerGraphEdgeId edgeId; +}; + + + +class shasta::mode3::PrimaryGraphEdge { +public: + MarkerGraphEdgePairInfo info; + uint64_t coverage; + bool isNonTransitiveReductionEdge = false; +}; + + + +class shasta::mode3::PrimaryGraph : public PrimaryGraphBaseClass { +public: + + std::map<MarkerGraphEdgeId, vertex_descriptor> vertexMap; + vertex_descriptor addVertex(MarkerGraphEdgeId); + + void addEdge( + MarkerGraphEdgeId, + MarkerGraphEdgeId, + const MarkerGraphEdgePairInfo&, + uint64_t coverage); + void addEdgeFromVertexDescriptors( + vertex_descriptor, + vertex_descriptor, + const MarkerGraphEdgePairInfo&, + uint64_t coverage); + + void writeGraphviz( + const string& name, + const PrimaryGraphDisplayOptions&, + const MarkerGraph&) const; + + void writeEdgeCoverageHistogram(const string& fileName) const; + + // Create the connected components of this PrimaryGraph, + // without changing the PrimaryGraph itself. + vector< shared_ptr<PrimaryGraph> > createConnectedComponents(uint64_t minComponentSize) const; + + void localTransitiveReduction( + uint64_t distance, + uint64_t maxCoverage); + + // Remove cross-edges. + // This removes an edge v0->v1 if the following are all true: + // - Its coverage is at most lowCoverageThreshold. + // - Its estimated offset is at least minOffset. + // - v0 has at least one out-edge with coverage at least highCoverageThreshold. + // - v1 has at least one in-edge with coverage at least highCoverageThreshold. 
+ void removeCrossEdges( + uint64_t lowCoverageThreshold, + uint64_t highCoverageThreshold, + uint64_t minOffset); + + // Remove edges for which loss = (commonCount - coverage) / commonCount > maxLoss + void removeWeakEdges(double maxLoss); + +}; + diff --git a/src/mode3-SegmentPairInformation.hpp b/src/mode3-SegmentPairInformation.hpp deleted file mode 100644 index 80db6e3..0000000 --- a/src/mode3-SegmentPairInformation.hpp +++ /dev/null @@ -1,80 +0,0 @@ -#ifndef SHASTA_MODE3_SEGMENT_PAIR_INFORMATION_HPP -#define SHASTA_MODE3_SEGMENT_PAIR_INFORMATION_HPP - -// Shasta. -#include "invalid.hpp" -#include "SHASTA_ASSERT.hpp" - -// Standard library. -#include "algorithm.hpp" -#include "array.hpp" -#include "cstdint.hpp" - -namespace shasta { - namespace mode3 { - class SegmentPairInformation; - } -} - - - -// Information for a pair of segments, as computed by -// mode3::AssemblyGraph::analyzeSegmentPair. -class shasta::mode3::SegmentPairInformation { -public: - - // The total number of oriented reads present in each segment. - array<uint64_t, 2> totalCount = {0, 0}; - - // The number of oriented reads present in both segments. - // If this is zero, the rest of the information is not valid. - uint64_t commonCount = 0; - - // The offset of segment 1 relative to segment 0, in markers. - int64_t offset = invalid<int64_t>; - - // The number of oriented reads present in each segment - // but missing from the other segment, - // and which should have been present based on the above estimated offset. - array<uint64_t, 2> unexplainedCount = {0, 0}; - - // The number of oriented reads that appear in only one - // of the two segments, but based on the estimated offset - // are too short to appear in the other segment. - array<uint64_t, 2> shortCount = {0, 0}; - - // Check that the above counts are consistent. 
- void check() const - { - for(uint64_t i=0; i<2; i++) { - SHASTA_ASSERT(commonCount + unexplainedCount[i] + shortCount[i] == - totalCount[i]); - } - } - - // This computes the fraction of unexplained oriented reads, - // without counting the short ones. - double unexplainedFraction(uint64_t i) const - { - // return double(unexplainedCount[i]) / double(totalCount[i]); - return double(unexplainedCount[i]) / double(commonCount + unexplainedCount[i]); - } - double maximumUnexplainedFraction() const - { - return max(unexplainedFraction(0), unexplainedFraction(1)); - } - - // Jaccard similarity, without counting the short reads. - double jaccard() const - { - return double(commonCount) / double(commonCount + unexplainedCount[0] + unexplainedCount[1]); - } - - // Raw Jaccard similarity (no special treatment of short reads) - double rawJaccard() const - { - return double(commonCount) / double(totalCount[0] + totalCount[1] - commonCount); - } -}; - -#endif diff --git a/src/mode3.cpp b/src/mode3.cpp deleted file mode 100644 index a800cdb..0000000 --- a/src/mode3.cpp +++ /dev/null @@ -1,3001 +0,0 @@ - -// Shasta -#include "mode3.hpp" -#include "assembleMarkerGraphPath.hpp" -#include "deduplicate.hpp" -#include "findMarkerId.hpp" -#include "html.hpp" -#include "MarkerGraph.hpp" -#include "mode3-AssemblyPath.hpp" -#include "mode3-JaccardGraph.hpp" -#include "orderPairs.hpp" -#include "Reads.hpp" -#include "ReadFlags.hpp" -#include "mode3-SegmentPairInformation.hpp" -#include "SubsetGraph.hpp" -using namespace shasta; -using namespace mode3; - -// Boost libraries. -// Include disjoint_sets.hpp first to avoid Boost problems. -#include <boost/pending/disjoint_sets.hpp> -#include <boost/icl/discrete_interval.hpp> -#include <boost/icl/right_open_interval.hpp> -#include <boost/graph/iteration_macros.hpp> -#include <boost/graph/strong_components.hpp> - -// Standard library. 
-#include <bitset> -#include "fstream.hpp" -#include <map> -#include <queue> -#include <set> -#include <unordered_set> - -#include "MultithreadedObject.tpp" -template class MultithreadedObject<mode3::AssemblyGraph>; - - -// Each linear chain of marker graph edges generates a segment. -void AssemblyGraph::createSegmentPaths() -{ - const bool debug = false; - - createNew(markerGraphPaths, "Mode3-MarkerGraphPaths"); - const MarkerGraph::EdgeId edgeCount = markerGraph.edges.size(); - vector<bool> wasFound(edgeCount, false); - - using MarkerGraphPath = vector<MarkerGraph::EdgeId>; - MarkerGraphPath nextEdges; - MarkerGraphPath previousEdges; - MarkerGraphPath path; - MarkerGraphPath reverseComplementedPath; - - // Main loop over all edges of the marker graph. - // At each iteration we find a new linear path of edges. - for(MarkerGraph::EdgeId startEdgeId=0; startEdgeId<edgeCount; startEdgeId++) { - - // If we already found this edge, skip it. - // It is part of a path we already found. - if(wasFound[startEdgeId]) { - continue; - } - - if(debug) { - cout << "Starting a new path at edge " << startEdgeId << endl; - } - - // Follow the path forward. - nextEdges.clear(); - MarkerGraph::EdgeId edgeId = startEdgeId; - bool isCircular = false; - while(true) { - const MarkerGraph::Edge edge = markerGraph.edges[edgeId]; - const MarkerGraph::VertexId v1 = edge.target; - const auto outEdges = markerGraph.edgesBySource[v1]; - if(outEdges.size() != 1) { - break; - } - const auto inEdges = markerGraph.edgesByTarget[v1]; - if(inEdges.size() != 1) { - break; - } - edgeId = outEdges[0]; - if(edgeId == startEdgeId) { - isCircular = true; - break; - } - nextEdges.push_back(edgeId); - SHASTA_ASSERT(not wasFound[edgeId]); - if(debug) { - cout << "Moving forward: added " << edgeId << endl; - } - } - - // Follow the path backward. 
- previousEdges.clear(); - if(!isCircular) { - edgeId = startEdgeId; - while(true) { - const MarkerGraph::Edge edge = markerGraph.edges[edgeId]; - const MarkerGraph::VertexId v0 = edge.source; - const auto outEdges = markerGraph.edgesBySource[v0]; - if(outEdges.size() != 1) { - break; - } - const auto inEdges = markerGraph.edgesByTarget[v0]; - if(inEdges.size() != 1) { - break; - } - edgeId = inEdges[0]; - previousEdges.push_back(edgeId); - SHASTA_ASSERT(not wasFound[edgeId]); - if(debug) { - cout << "Moving backward: added " << edgeId << endl; - } - } - } - - // Gather the path. - path.clear(); - copy(previousEdges.rbegin(), previousEdges.rend(), back_inserter(path)); - path.push_back(startEdgeId); - copy(nextEdges.begin(), nextEdges.end(), back_inserter(path)); - - // Mark all the edges in the path as found. - for(const MarkerGraph::EdgeId edgeId: path) { - if(wasFound[edgeId]) { - cout << "Assertion failed at " << edgeId << endl; - SHASTA_ASSERT(0); - } - wasFound[edgeId] = true; - } - - // Store this path as a new segment. - markerGraphPaths.appendVector(); - for(const MarkerGraphEdgeId edgeId: path) { - markerGraphPaths.append(edgeId); - } - } - - - - // Check that all edges of the marker graph were found. - SHASTA_ASSERT(find(wasFound.begin(), wasFound.end(), false) == wasFound.end()); - - - // Debug output: write the paths. - if(debug) { - ofstream csv("Paths.csv"); - for(uint64_t segmentId=0; segmentId<markerGraphPaths.size(); segmentId++) { - const auto path = markerGraphPaths[segmentId]; - for(const MarkerGraphEdgeId edgeId: path) { - csv << segmentId << ","; - csv << edgeId << "\n"; - } - } - } - -} - - - -// Compute coverage for all segments. -// It is computed as average marker graph edge coverage -// over the marker graph edges in the path of each segment. -void AssemblyGraph::computeSegmentCoverage() -{ - // Initialize segmentCoverage. 
- createNew(segmentCoverage, "Mode3-SegmentCoverage"); - const uint64_t segmentCount = markerGraphPaths.size(); - segmentCoverage.resize(segmentCount); - - // Loop over all segments. - for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) { - - // Access the marker graph path for this segment. - const span<MarkerGraphEdgeId> path = markerGraphPaths[segmentId]; - - - // Loop over this path. - uint64_t coverageSum = 0.; - for(uint64_t position=0; position<path.size(); position++) { - MarkerGraphEdgeId& edgeId = path[position]; - - // Add the marker intervals on this marker graph edge. - const span<const MarkerInterval> markerIntervals = markerGraph.edgeMarkerIntervals[edgeId]; - coverageSum += markerIntervals.size(); - } - - segmentCoverage[segmentId] = float(coverageSum) / float(path.size()); - - } - - - // Write a histogram of segment coverage. - vector<uint64_t> histogram; - for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) { - const uint64_t coverage = uint64_t(std::round(segmentCoverage[segmentId])); - if(coverage >= histogram.size()) { - histogram.resize(coverage + 1, 0); - } - ++histogram[coverage]; - } - ofstream csv("SegmentCoverageHistogram.csv"); - csv << "Coverage,Frequency\n"; - for(uint64_t coverage=0; coverage<histogram.size(); coverage++) { - csv << coverage << "," << histogram[coverage] << "\n"; - } -} - - - -void AssemblyGraph::computeMarkerGraphEdgeTable(size_t threadCount) -{ - - // Initialize the marker graph edge table. - createNew(markerGraphEdgeTable, "Mode3-MarkerGraphEdgeTable"); - markerGraphEdgeTable.resize(markerGraph.edges.size()); - fill(markerGraphEdgeTable.begin(), markerGraphEdgeTable.end(), make_pair( - std::numeric_limits<uint64_t>::max(), - std::numeric_limits<uint32_t>::max() - )); - - // Fill in the marker graph edge table. 
- const uint64_t batchSize = 100; - setupLoadBalancing(markerGraphPaths.size(), batchSize); - runThreads(&AssemblyGraph::computeMarkerGraphEdgeTableThreadFunction, threadCount); -} - - - -void AssemblyGraph::computeMarkerGraphEdgeTableThreadFunction(size_t threadId) -{ - - // Loop over all batches assigned to this thread. - uint64_t begin, end; - while(getNextBatch(begin, end)) { - - // Loop over all vertices assigned to this batch. - for(uint64_t segmentId=begin; segmentId!=end; ++segmentId) { - const span<MarkerGraphEdgeId> path = markerGraphPaths[segmentId]; - - // Loop over the path of this segment. - for(uint64_t position=0; position<path.size(); position++) { - const MarkerGraphEdgeId edgeId = path[position]; - - // Store the marker graph edge table entry for this edge. - SHASTA_ASSERT(edgeId < markerGraphEdgeTable.size()); - markerGraphEdgeTable[edgeId] = make_pair(segmentId, position); - } - } - - } -} - - - -void AssemblyGraph::computeMarkerGraphJourneys(size_t threadCount) -{ - const bool debug = true; - - createNew(markerGraphJourneys, "tmp-mode3-MarkerGraphJourneys"); - - uint64_t batchSize = 1000; - markerGraphJourneys.beginPass1(markers.size()); - setupLoadBalancing(markerGraphEdgeTable.size(), batchSize); - runThreads(&AssemblyGraph::computeMarkerGraphJourneysPass1, threadCount); - markerGraphJourneys.beginPass2(); - setupLoadBalancing(markerGraphEdgeTable.size(), batchSize); - runThreads(&AssemblyGraph::computeMarkerGraphJourneysPass2, threadCount); - markerGraphJourneys.endPass2(); - - batchSize = 100; - setupLoadBalancing(markerGraphJourneys.size(), batchSize); - runThreads(&AssemblyGraph::sortMarkerGraphJourneys, threadCount); - - if(debug) { - ofstream csv("MarkerGraphJourneys.csv"); - csv << "OrientedReadId,SegmentId,Position,ordinal0,Ordinal1\n"; - for(uint64_t i=0; i<markers.size(); i++) { - const OrientedReadId orientedReadId = OrientedReadId::fromValue(ReadId(i)); - const auto markerGraphJourney = markerGraphJourneys[i]; - for(uint64_t 
position=0; position<markerGraphJourney.size(); position++) { - const MarkerGraphJourneyEntry& entry = markerGraphJourney[position]; - csv << orientedReadId << ","; - csv << entry.segmentId << ","; - csv << entry.position << ","; - csv << entry.ordinals[0] << ","; - csv << entry.ordinals[1] << "\n"; - } - } - - } -} - - - -void AssemblyGraph::computeMarkerGraphJourneysPass1(size_t threadId) -{ - computeMarkerGraphJourneysPass12(1); -} - - - -void AssemblyGraph::computeMarkerGraphJourneysPass2(size_t threadId) -{ - computeMarkerGraphJourneysPass12(2); -} - - - -void AssemblyGraph::computeMarkerGraphJourneysPass12(uint64_t pass) -{ - // Loop over all batches assigned to this thread. - uint64_t begin, end; - while(getNextBatch(begin, end)) { - - // Loop over marker graph edges assigned to this batch. - for(MarkerGraph::EdgeId edgeId=begin; edgeId!=end; ++edgeId) { - const auto& p = markerGraphEdgeTable[edgeId]; - const uint64_t segmentId = p.first; - const uint32_t position = p.second; - SHASTA_ASSERT(segmentId != std::numeric_limits<uint64_t>::max()); - SHASTA_ASSERT(position != std::numeric_limits<uint32_t>::max()); - - // Loop over the marker intervals of this marker graph edge.. 
- const auto markerIntervals = markerGraph.edgeMarkerIntervals[edgeId]; - for(const MarkerInterval& markerInterval: markerIntervals) { - const OrientedReadId orientedReadId = markerInterval.orientedReadId; - - if(pass == 1) { - markerGraphJourneys.incrementCountMultithreaded(orientedReadId.getValue()); - } else { - MarkerGraphJourneyEntry markerGraphJourneyEntry; - markerGraphJourneyEntry.segmentId = segmentId; - markerGraphJourneyEntry.position = position; - markerGraphJourneyEntry.ordinals = markerInterval.ordinals; - markerGraphJourneys.storeMultithreaded(orientedReadId.getValue(), markerGraphJourneyEntry); - } - } - } - } -} - - - -void AssemblyGraph::sortMarkerGraphJourneys(size_t threadId) -{ - uint64_t begin, end; - while(getNextBatch(begin, end)) { - - // Loop over marker graph edges assigned to this batch. - for(uint64_t i=begin; i!=end; ++i) { - auto markerGraphJourney = markerGraphJourneys[i]; - sort(markerGraphJourney.begin(), markerGraphJourney.end()); - } - } -} - - -// The assembly graph journey of an oriented read -// is the sequence of segmentIds it encounters. -void AssemblyGraph::computeAssemblyGraphJourneys() -{ - const bool debug = true; - - // Initialize the assembly graph journeys. - createNew(assemblyGraphJourneys, "Mode3-AssemblyGraphJourneys"); - - // Work vector defined outside the loop to reduce memory allocation overhead. - vector<AssemblyGraphJourneyEntry> assemblyGraphJourney; - - // Loop over all oriented reads. - for(uint64_t i=0; i<markerGraphJourneys.size(); i++) { - - // Access the marker graph journey for this oriented read. - const span<MarkerGraphJourneyEntry> markerGraphJourney = markerGraphJourneys[i]; - - // Compute the assembly graph journey. - computeAssemblyGraphJourney(markerGraphJourney, assemblyGraphJourney); - - // Store it. - assemblyGraphJourneys.appendVector(assemblyGraphJourney); - } - - - - // Write them out. 
- if(debug) { - ofstream csv("AssemblyGraphJourneys.csv"); - for(uint64_t i=0; i<assemblyGraphJourneys.size(); i++) { - const ReadId readId = ReadId(i >> 1); - const Strand strand = i & 1; - const OrientedReadId orientedReadId(readId, strand); - const span<AssemblyGraphJourneyEntry> assemblyGraphJourney = assemblyGraphJourneys[i]; - - csv << orientedReadId << ","; - for(const AssemblyGraphJourneyEntry entry: assemblyGraphJourney) { - csv << entry.segmentId << ","; - } - csv << endl; - } - } - - - - // Write them out again, with more details. - if(debug) { - ofstream csv("AssemblyGraphJourneysDetails.csv"); - csv << "OrientedReadId,Position,SegmentId," - "First position,First ordinal0,First ordinal1," - "Last position,Last ordinal0,Last ordinal1\n"; - for(uint64_t i=0; i<assemblyGraphJourneys.size(); i++) { - const ReadId readId = ReadId(i >> 1); - const Strand strand = i & 1; - const OrientedReadId orientedReadId(readId, strand); - const span<AssemblyGraphJourneyEntry> assemblyGraphJourney = assemblyGraphJourneys[i]; - - for(uint64_t position=0; position<assemblyGraphJourney.size(); position++) { - const AssemblyGraphJourneyEntry& entry = assemblyGraphJourney[position]; - const MarkerGraphJourneyEntry& first = entry.markerGraphJourneyEntries[0]; - const MarkerGraphJourneyEntry& last = entry.markerGraphJourneyEntries[1]; - csv << orientedReadId << ","; - csv << position << ","; - csv << entry.segmentId << ","; - csv << first.position << ","; - csv << first.ordinals[0] << ","; - csv << first.ordinals[1] << ","; - csv << last.position << ","; - csv << last.ordinals[0] << ","; - csv << last.ordinals[1] << "\n"; - } - } - } - - -} - - - -// Given the marker graph journey of an oriented read, -// find the corresponding assembly graph journey. -void AssemblyGraph::computeAssemblyGraphJourney( - const span<MarkerGraphJourneyEntry> markerGraphJourney, - vector<AssemblyGraphJourneyEntry>& assemblyGraphJourney) -{ - // Start with an empty journey. 
- assemblyGraphJourney.clear(); - - // Loop over the marker graph journey, looking for places - // where the segmentId changes. - for(uint32_t i=0; i<markerGraphJourney.size(); /* Increment later */) { - const MarkerGraphJourneyEntry& markerGraphJourneyEntry = markerGraphJourney[i]; - const uint64_t segmentId = markerGraphJourneyEntry.segmentId; - - // Move to the end of the streak with the same segmentId. - const uint32_t streakBegin = i; - uint32_t streakEnd = streakBegin + 1; - for(; - streakEnd<markerGraphJourney.size() and - (markerGraphJourney[streakEnd].segmentId == segmentId); - streakEnd++) { - } - - // Store this segmentId in the assembly graph journey. - AssemblyGraphJourneyEntry assemblyGraphJourneyEntry; - assemblyGraphJourneyEntry.segmentId = segmentId; - assemblyGraphJourneyEntry.markerGraphJourneyEntries[0] = markerGraphJourney[streakBegin]; - assemblyGraphJourneyEntry.markerGraphJourneyEntries[1] = markerGraphJourney[streakEnd - 1]; - assemblyGraphJourney.push_back(assemblyGraphJourneyEntry); - - // Prepare to handle the next segment. - i = streakEnd; - } -} - - - -void AssemblyGraph::computeAssemblyGraphJourneyInfos() -{ - const bool debug = true; - - const uint64_t segmentCount = markerGraphPaths.size(); - const uint64_t readCount = assemblyGraphJourneys.size()/2; - - createNew(assemblyGraphJourneyInfos, "Mode3-AssemblyGraphJourneyInfos"); - - // Pass 1. - assemblyGraphJourneyInfos.beginPass1(segmentCount); - for(ReadId readId=0; readId<readCount; readId++) { - for(Strand strand=0; strand<2; strand++) { - const OrientedReadId orientedReadId(readId, strand); - const auto assemblyGraphJourney = assemblyGraphJourneys[orientedReadId.getValue()]; - - for(uint64_t position=0; position<assemblyGraphJourney.size(); position++) { - const AssemblyGraphJourneyEntry& entry = assemblyGraphJourney[position]; - assemblyGraphJourneyInfos.incrementCount(entry.segmentId); - } - } - } - - // Pass 2. 
- assemblyGraphJourneyInfos.beginPass2(); - for(ReadId readId=0; readId<readCount; readId++) { - for(Strand strand=0; strand<2; strand++) { - const OrientedReadId orientedReadId(readId, strand); - const auto assemblyGraphJourney = assemblyGraphJourneys[orientedReadId.getValue()]; - - for(uint64_t position=0; position<assemblyGraphJourney.size(); position++) { - const AssemblyGraphJourneyEntry& entry = assemblyGraphJourney[position]; - assemblyGraphJourneyInfos.store(entry.segmentId, make_pair(orientedReadId, position)); - } - } - } - assemblyGraphJourneyInfos.endPass2(); - - // Sort. - for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) { - const auto v = assemblyGraphJourneyInfos[segmentId]; - sort(v.begin(), v.end()); - } - - - if(debug) { - ofstream csv("SegmentJourneyInfo.csv"); - csv << "SegmentId,OrientedReadId,Position in assembly graph journey\n"; - for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) { - const auto v = assemblyGraphJourneyInfos[segmentId]; - for(const auto& p: v) { - csv << segmentId << ","; - csv << p.first << ","; - csv << p.second << "\n"; - } - } - } -} - - - -// Find out if a segment contains a given OrientedReadId. -// This returns true if assemblyGraphJourneyInfos[segmentId] -// contains an entry with the given OrientedReadId. 
-bool AssemblyGraph::segmentContainsOrientedRead(
-    uint64_t segmentId,
-    OrientedReadId orientedReadId) const
-{
-    // Linear scan over the entries stored for this segment.
-    // Only the first member of each entry (the OrientedReadId) is compared.
-    for(const auto& p: assemblyGraphJourneyInfos[segmentId]) {
-        if(p.first == orientedReadId) {
-            return true;
-        }
-    }
-    return false;
-}
-
-
-
-// Find the transitions between consecutive entries of the assembly graph
-// journeys of all oriented reads.
-// The transitions are stored in transitionMap, keyed by the
-// (previous segmentId, current segmentId) pair.
-// Any previous content of transitionMap is discarded.
-void AssemblyGraph::findTransitions(std::map<SegmentPair, Transitions>& transitionMap)
-{
-    transitionMap.clear();
-
-    // assemblyGraphJourneys is indexed by OrientedReadId::getValue(),
-    // so the number of reads is half its size.
-    for(ReadId readId=0; readId<assemblyGraphJourneys.size()/2; readId++) {
-        for(Strand strand=0; strand<2; strand++) {
-            const OrientedReadId orientedReadId(readId, strand);
-            const auto journey = assemblyGraphJourneys[orientedReadId.getValue()];
-
-            // Loop over pairs of consecutive journey entries.
-            for(uint64_t i=1; i<journey.size(); i++) {
-                const auto& previous = journey[i-1];
-                const auto& current = journey[i];
-                SHASTA_ASSERT(previous.segmentId != current.segmentId);
-
-                // Store markerGraphJourneyEntries[1] of the previous entry
-                // and markerGraphJourneyEntries[0] of the current entry.
-                // operator[] creates the map entry for this segment pair
-                // the first time the pair is encountered.
-                const SegmentPair segmentPair = make_pair(previous.segmentId, current.segmentId);
-                transitionMap[segmentPair].push_back(
-                    make_pair(orientedReadId, Transition({
-                    previous.markerGraphJourneyEntries[1],
-                    current.markerGraphJourneyEntries[0]})));
-
-            }
-        }
-    }
-}
-
-
-
-// Create the links and their transitions from a transition map.
-// A link is created for each segment pair in transitionMap that has
-// at least minCoverage transitions.
-// For each link, also store whether the two segments are adjacent
-// in the marker graph, and the link separation.
-// This also writes Links.csv.
-void AssemblyGraph::createLinks(
-    const std::map<SegmentPair, Transitions>& transitionMap,
-    uint64_t minCoverage)
-{
-    createNew(links, "Mode3-Links");
-    createNew(transitions, "Mode3-Transitions");
-    for(const auto& p: transitionMap) {
-        const auto& transitionVector = p.second;
-        const uint64_t coverage = transitionVector.size();
-        if(coverage >= minCoverage) {
-            const uint64_t segmentId0 = p.first.first;
-            const uint64_t segmentId1 = p.first.second;
-            // links and transitions are appended together,
-            // so transitions[linkId] belongs to links[linkId].
-            links.push_back(Link(segmentId0, segmentId1));
-            transitions.appendVector(transitionVector);
-        }
-    }
-
-    // Store link separation.
-    for(uint64_t linkId=0; linkId<links.size(); linkId++) {
-        Link& link = links[linkId];
-
-        // Check if these two segments are adjacent in the marker graph.
-        // They are adjacent if the target vertex of the last edge of the
-        // first path is the source vertex of the first edge of the second path.
-        const uint64_t segmentId0 = link.segmentId0;
-        const uint64_t segmentId1 = link.segmentId1;
-        const auto path0 = markerGraphPaths[segmentId0];
-        const auto path1 = markerGraphPaths[segmentId1];
-        const MarkerGraph::Edge lastEdge0 = markerGraph.edges[path0.back()];
-        const MarkerGraph::Edge firstEdge1 = markerGraph.edges[path1.front()];
-        if(lastEdge0.target == firstEdge1.source) {
-            // The segments are adjacent. Set the link separation to 0.
-            link.segmentsAreAdjacent = true;
-            link.separation = 0;
-        } else {
-            // The segments are not adjacent.
-            // Use the transitions to estimate the separation.
-            // NOTE(review): linkSeparation is defined elsewhere; presumably
-            // the result is in markers - confirm.
-            const auto linkTransitions = transitions[linkId];
-            const double separation = linkSeparation(linkTransitions, path0.size());
-
-            link.segmentsAreAdjacent = false;
-            link.separation = int32_t(std::round(separation));
-        }
-    }
-
-
-
-    // Write one csv line for each link.
-    // The Coverage column is the number of transitions stored for the link.
-    ofstream csv("Links.csv");
-    csv << "LinkId,SegmentId0,SegmentId1,Coverage,Adjacent,Separation\n";
-    for(uint64_t linkId=0; linkId<links.size(); linkId++) {
-        Link& link = links[linkId];
-
-        csv << linkId << ",";
-        csv << link.segmentId0 << ",";
-        csv << link.segmentId1 << ",";
-        csv << transitions[linkId].size() << ",";
-        csv << (link.segmentsAreAdjacent ? "Yes" : "No") << ",";
-        csv << link.separation << "\n";
-    }
-
-}
-
-
-
-// Initial construction of the AssemblyGraph.
-// The construction steps below are called in order;
-// each step uses data created by the preceding ones.
-AssemblyGraph::AssemblyGraph(
-    const string& largeDataFileNamePrefix,
-    size_t largeDataPageSize,
-    size_t threadCount,
-    uint64_t readRepresentation,
-    uint64_t k, // Marker length
-    const Reads& reads,
-    const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers,
-    const MarkerGraph& markerGraph,
-    const ConsensusCaller& consensusCaller) :
-    MultithreadedObject<AssemblyGraph>(*this),
-    largeDataFileNamePrefix(largeDataFileNamePrefix),
-    largeDataPageSize(largeDataPageSize),
-    readRepresentation(readRepresentation),
-    k(k),
-    reads(reads),
-    markers(markers),
-    markerGraph(markerGraph),
-    consensusCaller(consensusCaller)
-{
-    // K must be even.
-    SHASTA_ASSERT((k % 2) == 0);
-
-    // This assumes RLE is not used.
-    SHASTA_ASSERT(reads.representation == 0);
-
-    // Minimum number of transitions (oriented reads) to create a link.
-    // If this equals 1, then the sequence of segments visited by every
-    // oriented read is a path in the graph.
-    // But that is not desirable because of the extra edges it causes.
-    const uint64_t minCoverage = 3; // EXPOSE WHEN CODE STABILIZES
-
-    // Create a segment for each linear chain of marker graph edges.
-    createSegmentPaths();
-    computeSegmentCoverage();
-
-    // Assembled sequence for each segment.
-    assembleSegments();
-
-    // Keep track of the segment and position each marker graph edge corresponds to.
-    computeMarkerGraphEdgeTable(threadCount);
-
-    // Compute marker graph and assembly graph journeys of all oriented reads.
-    // We permanently store only the assembly graph journeys.
-    // The marker graph journeys are removed as soon as the
-    // assembly graph journeys have been computed from them.
-    computeMarkerGraphJourneys(threadCount);
-    computeAssemblyGraphJourneys();
-    markerGraphJourneys.remove();
-    computeAssemblyGraphJourneyInfos();
-
-    // Find transitions from segment to segment in the marker graph
-    // journeys of all oriented reads, and store them keyed by the pair of segments.
-    std::map<SegmentPair, Transitions> transitionMap;
-    findTransitions(transitionMap);
-
-    // Create links between pairs of segments with a sufficient number of transitions.
-    createLinks(transitionMap, minCoverage);
-    createConnectivity();
-    flagBackSegments();
-
-    cout << "The mode 3 assembly graph has " << markerGraphPaths.size() << " segments and " <<
-        links.size() << " links." << endl;
-}
-
-
-
-// Return the name to be used for a large data object.
-// If largeDataFileNamePrefix is empty, this returns an empty string.
-// Otherwise it returns largeDataFileNamePrefix followed by the given name.
-string AssemblyGraph::largeDataName(const string& name) const
-{
-    if(largeDataFileNamePrefix.empty()) {
-        return "";  // Anonymous.
-    } else {
-        return largeDataFileNamePrefix + name;
-    }
-}
-
-
-
-// Constructor from binary data.
-AssemblyGraph::AssemblyGraph( - const string& largeDataFileNamePrefix, - uint64_t readRepresentation, - uint64_t k, // Marker length - const Reads& reads, - const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, - const MarkerGraph& markerGraph, - const ConsensusCaller& consensusCaller) : - MultithreadedObject<AssemblyGraph>(*this), - largeDataFileNamePrefix(largeDataFileNamePrefix), - readRepresentation(readRepresentation), - k(k), - reads(reads), - markers(markers), - markerGraph(markerGraph), - consensusCaller(consensusCaller) -{ - accessExistingReadOnly(markerGraphPaths, "Mode3-MarkerGraphPaths"); - accessExistingReadOnly(segmentCoverage, "Mode3-SegmentCoverage"); - accessExistingReadOnly(segmentSequences, "Mode3-SegmentSequences"); - accessExistingReadOnly(segmentVertexOffsets, "Mode3-SegmentVertexOffsets"); - accessExistingReadOnly(markerGraphEdgeTable, "Mode3-MarkerGraphEdgeTable"); - accessExistingReadOnly(assemblyGraphJourneys, "Mode3-AssemblyGraphJourneys"); - accessExistingReadOnly(assemblyGraphJourneyInfos, "Mode3-AssemblyGraphJourneyInfos"); - accessExistingReadOnly(links, "Mode3-Links"); - accessExistingReadOnly(transitions, "Mode3-Transitions"); - accessExistingReadOnly(linksBySource, "Mode3-LinksBySource"); - accessExistingReadOnly(linksByTarget, "Mode3-LinksByTarget"); - accessExistingReadOnly(isBackSegment, "Mode3-IsBackSegment"); - accessExistingReadOnly(clusterIds, "Mode3-ClusterIds"); -} - - - -void AssemblyGraph::createConnectivity() -{ - createNew(linksBySource, "Mode3-LinksBySource"); - createNew(linksByTarget, "Mode3-LinksByTarget"); - - linksBySource.beginPass1(links.size()); - linksByTarget.beginPass1(links.size()); - for(uint64_t linkId=0; linkId<links.size(); linkId++) { - const Link& link = links[linkId]; - linksBySource.incrementCount(link.segmentId0); - linksByTarget.incrementCount(link.segmentId1); - } - linksBySource.beginPass2(); - linksByTarget.beginPass2(); - for(uint64_t linkId=0; linkId<links.size(); 
linkId++) { - const Link& link = links[linkId]; - linksBySource.store(link.segmentId0, linkId); - linksByTarget.store(link.segmentId1, linkId); - } - linksBySource.endPass2(); - linksByTarget.endPass2(); -} - - - -uint64_t AssemblyGraph::findLink(uint64_t segmentId0, uint64_t segmentId1) const -{ - for(const uint64_t linkId: linksBySource[segmentId0]) { - if(links[linkId].segmentId1 == segmentId1) { - return linkId; - } - } - SHASTA_ASSERT(0); -} - - - -// Flag back-segments. -// This does not do a full blown search for locally strongly connected components. -// A segment is marked as a back-segment if: -// - It has only a single incoming link. -// - It has a single outgoing link. -// - The incoming and outgoing links both connect to/from the same segment. -void AssemblyGraph::flagBackSegments() -{ - const uint64_t segmentCount = markerGraphPaths.size(); - createNew(isBackSegment, "Mode3-IsBackSegment"); - isBackSegment.resize(segmentCount); - - uint64_t backSegmentCount = 0; - for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) { - - // Initially flag it as not a back-segment. - isBackSegment[segmentId] = false; - - // For a back-segment, there must be a single incoming link. - const auto incomingLinks = linksByTarget[segmentId]; - if(incomingLinks.size() != 1) { - continue; - } - - // For a back-segment, there must be a single outgoing link. - const auto outgoingLinks = linksBySource[segmentId]; - if(outgoingLinks.size() != 1) { - continue; - } - - // For a back-segment, the incoming and outgoing links - // both connect to/from the same segment. - const uint64_t incomingLinkId = incomingLinks[0]; - const uint64_t outgoingLinkId = outgoingLinks[0]; - const Link& incomingLink = links[incomingLinkId]; - const Link& outgoingLink = links[outgoingLinkId]; - if(incomingLink.segmentId0 != outgoingLink.segmentId1) { - continue; - } - - // Flag it as a back-segment. 
- isBackSegment[segmentId] = true; - ++backSegmentCount; - } - - cout << "Found " << backSegmentCount << " back-segments." << endl; -} - - - -// Get the children or parents of a given segment. -// Only use links with at least a specified coverage. -void AssemblyGraph::getChildrenOrParents( - uint64_t segmentId, - uint64_t direction, // 0=forward (children), 1=backward (parents). - uint64_t minimumLinkCoverage, - vector<uint64_t>& childrenOrParents) const -{ - switch(direction) { - case 0: - getChildren(segmentId, minimumLinkCoverage, childrenOrParents); - break; - case 1: - getParents(segmentId, minimumLinkCoverage, childrenOrParents); - break; - default: - SHASTA_ASSERT(0); - } -} - - - -void AssemblyGraph::getChildren( - uint64_t segmentId, - uint64_t minimumLinkCoverage, - vector<uint64_t>& children) const -{ - children.clear(); - for(const auto linkId: linksBySource[segmentId]) { - if(transitions.size(linkId) >= minimumLinkCoverage) { - const Link& link = links[linkId]; - children.push_back(link.segmentId1); - } - } -} - - - -void AssemblyGraph::getParents( - uint64_t segmentId, - uint64_t minimumLinkCoverage, - vector<uint64_t>& parents) const -{ - parents.clear(); - for(const auto linkId: linksByTarget[segmentId]) { - if(transitions.size(linkId) >= minimumLinkCoverage) { - const Link& link = links[linkId]; - parents.push_back(link.segmentId0); - } - } -} - - - -void AssemblyGraph::writeGfa(const string& baseName) const -{ - ofstream gfa(baseName + ".gfa"); - ofstream csv(baseName + ".csv"); - - // Write the headers. - gfa << "H\tVN:Z:1.0\n"; - csv << "Segment,Path Length,Sequence Length,Average coverage,Read count\n"; - - // Write the segments. 
- for(uint64_t segmentId=0; segmentId<markerGraphPaths.size(); segmentId++) { - - const auto sequence = segmentSequences[segmentId]; - gfa <<"S\t" << segmentId << "\t"; - copy(sequence.begin()+k/2, sequence.end()-k/2, ostream_iterator<Base>(gfa)); - gfa << "\n"; - - const auto path = markerGraphPaths[segmentId]; - csv << segmentId << ","; - csv << path.size() << ","; - csv << sequence.size() << ","; - csv << segmentCoverage[segmentId] << ","; - csv << assemblyGraphJourneyInfos[segmentId].size() << "\n"; - } - - // Write the links. - for(const Link& link: links) { - if(true /*link.segmentsAreAdjacent*/) { - gfa << "L\t" << - link.segmentId0 << "\t+\t" << - link.segmentId1 << "\t+\t0M\n"; - } else { - // This writes non-adjacent links as Jumps (GFA 1.2). - // The original Bandage does not display them. - // BandageNG does, but they are not taken into account during graph - // creation, so it i not useful to write them like this. - // For this reason, the if condition above was set to true, - // so this branch is never reached. - // Leaveing the code in place for possible future use. - gfa << "J\t" << - link.segmentId0 << "\t+\t" << - link.segmentId1 << "\t+\t" << k * link.separation << "\n"; - } - } - -} - - - -// Find the distinct oriented reads that appear on the path -// of a segment. Also return the average edge coverage for the path. -double AssemblyGraph::findOrientedReadsOnSegment( - uint64_t segmentId, - vector<OrientedReadId>& orientedReadIdsArgument) const -{ - // Loop over the marker graph path corresponding to this segment. - const span<const MarkerGraphEdgeId> path = markerGraphPaths[segmentId]; - double coverage = 0.; - std::set<OrientedReadId> orientedReadIds; - for(const MarkerGraphEdgeId& edgeId: path) { - - // Loop over the marker intervals for this marker graph edge. 
- const span<const MarkerInterval> markerIntervals = markerGraph.edgeMarkerIntervals[edgeId]; - coverage += double(markerIntervals.size()); - for(const MarkerInterval& markerInterval: markerIntervals) { - orientedReadIds.insert(markerInterval.orientedReadId); - } - } - - // Copy the oriented reads to the vector passed as an argument. - orientedReadIdsArgument.clear(); - orientedReadIdsArgument.insert(orientedReadIdsArgument.end(), - orientedReadIds.begin(), orientedReadIds.end()); - - return coverage / double(path.size()); -} - - - -// Get information about the oriented reads that appear on the -// marker graph path of a segment. -void AssemblyGraph::getOrientedReadsOnSegment( - uint64_t segmentId, - SegmentOrientedReadInformation& information) const -{ - // A data structure that, for each oriented read we find, - // contains a sum of offsets and the number of marker graph vertices - // that contributed to the sum. - std::map<OrientedReadId, pair<uint64_t, int64_t> > table; - - // Loop over the marker graph path corresponding to this segment. - const span<const MarkerGraphEdgeId> path = markerGraphPaths[segmentId]; - std::set<OrientedReadId> orientedReadIds; - for(uint64_t position=0; position<path.size(); position++) { - const MarkerGraphEdgeId& edgeId = path[position]; - - // Loop over the marker intervals for this marker graph edge. - const span<const MarkerInterval> markerIntervals = markerGraph.edgeMarkerIntervals[edgeId]; - for(const MarkerInterval& markerInterval: markerIntervals) { - const OrientedReadId orientedReadId = markerInterval.orientedReadId; - - // Update our table for this oriented read. 
- auto it = table.find(orientedReadId); - if(it == table.end()) { - tie(it, ignore) = table.insert(make_pair(orientedReadId, make_pair(0ULL, 0LL))); - } - auto& p = it->second; - p.first += 2; - p.second += int32_t(position) - int32_t(markerInterval.ordinals[0]); - p.second += int32_t(position + 1) -int32_t(markerInterval.ordinals[1]); - } - } - - - - // Store what we found. - information.infos.clear(); - for(const auto& p: table) { - SegmentOrientedReadInformation::Info info; - info.orientedReadId = p.first; - const uint64_t n = p.second.first; - const int64_t sum = p.second.second; - info.averageOffset = int32_t(std::round(double(sum) / double(n))); - information.infos.push_back(info); - } - } - - - -// Estimate the offset between two segments. -// Takes as input SegmentOrientedReadInformation objects -// for the two segments. -// Common oriented reads between the two segments are used -// to estimate the average offset, in markers, -// between the beginning of the segments. -// The number of common oriented reads -// is computed and stored in the last argument. -// If that is zero, the computed offset is not valid. -void AssemblyGraph::estimateOffset( - const SegmentOrientedReadInformation& info0, - const SegmentOrientedReadInformation& info1, - int64_t& offset, - uint64_t& commonOrientedReadCount - ) const -{ - offset = 0; - commonOrientedReadCount = 0; - - // Joint loop over common oriented reads in the two segments. 
- const auto begin0 = info0.infos.begin(); - const auto begin1 = info1.infos.begin(); - const auto end0 = info0.infos.end(); - const auto end1 = info1.infos.end(); - auto it0 = begin0; - auto it1 = begin1; - while((it0 != end0) and (it1 != end1)) { - - if(it0->orientedReadId < it1->orientedReadId) { - ++it0; - } else if(it1->orientedReadId < it0->orientedReadId) { - ++it1; - } else { - SHASTA_ASSERT(it0->orientedReadId == it1->orientedReadId); - - commonOrientedReadCount++; - offset += (int64_t(it0->averageOffset) - int64_t(it1->averageOffset)); - - ++it0; - ++it1; - } - } - - if(commonOrientedReadCount) { - offset = int64_t(std::round(double(offset) / double(commonOrientedReadCount))); - } else { - offset = std::numeric_limits<uint64_t>::max(); - } - -} - - - -// Analyze a pair of segments for common oriented reads, -// offsets, missing reads, etc. -void AssemblyGraph::analyzeSegmentPair( - uint64_t segmentId0, - uint64_t segmentId1, - const SegmentOrientedReadInformation& info0, - const SegmentOrientedReadInformation& info1, - const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, - SegmentPairInformation& info01 - ) const -{ - using boost::icl::discrete_interval; - using boost::icl::intersects; - - // Store the number of oriented reads in each segment. - info01.totalCount[0] = info0.infos.size(); - info01.totalCount[1] = info1.infos.size(); - - // Use common oriented reads to estimate the offset between the two segments. - // If there are no common oriented reads, stop here. - estimateOffset(info0, info1, info01.offset, info01.commonCount); - if(info01.commonCount == 0) { - return; - } - - - // Count the oriented reads missing from each segment, - // and which should have been present based on - // the known relative offsets. - info01.unexplainedCount = {0, 0}; - info01.shortCount = {0, 0}; - - // Set up a joint loop over oriented reads in the two segments. 
- const auto begin0 = info0.infos.begin(); - const auto begin1 = info1.infos.begin(); - const auto end0 = info0.infos.end(); - const auto end1 = info1.infos.end(); - auto it0 = begin0; - auto it1 = begin1; - - const uint64_t length0 = markerGraphPaths.size(segmentId0); - const uint64_t length1 = markerGraphPaths.size(segmentId1); - while(true) { - - // At end of both segments. - if((it0 == end0) and (it1 == end1)) { - break; - } - - - - // This read only appears on segment 0. - if((it1 == end1) or ((it0 != end0) and (it0->orientedReadId < it1->orientedReadId))) { - const int64_t orientedReadLength = markers.size(it0->orientedReadId.getValue()); - - // Compute the hypothetical range of the oriented read relative - // to the beginning of segment 1. - const discrete_interval<int64_t> orientedReadRange1( - it0->averageOffset - info01.offset, - it0->averageOffset - info01.offset + orientedReadLength); - const discrete_interval<int64_t> segment1Range(0, length1); - - // Figure out if it the oriented read would overlap segment 1. - const bool wouldOverlap = intersects(orientedReadRange1, segment1Range); - - if(wouldOverlap) { - ++info01.unexplainedCount[0]; - } else { - ++info01.shortCount[0]; - } - - SHASTA_ASSERT(it0 != end0); - ++it0; - } - - - - // Only on segment 1 - else if((it0 == end0) or ((it1 != end1) and (it1->orientedReadId < it0->orientedReadId))) { - const int64_t orientedReadLength = markers.size(it1->orientedReadId.getValue()); - - // Compute the hypothetical range of the oriented read relative - // to the beginning of segment 0. - const discrete_interval<int64_t> orientedReadRange0( - it1->averageOffset + info01.offset, - it1->averageOffset + info01.offset + orientedReadLength); - const discrete_interval<int64_t> segment0Range(0, length0); - - // Figure out if it the oriented read would overlap segment 0. 
- const bool wouldOverlap = intersects(orientedReadRange0, segment0Range); - - if(wouldOverlap) { - ++info01.unexplainedCount[1]; - } else { - ++info01.shortCount[1]; - } - - SHASTA_ASSERT(it1 != end1); - ++it1; - } - - // On both segments. - else { - SHASTA_ASSERT(it0 != end0); - SHASTA_ASSERT(it1 != end1); - ++it0; - ++it1; - } - } - - info01.check(); - -} - - - -// Gather oriented read information for each segment. -void AssemblyGraph::storeSegmentOrientedReadInformation(size_t threadCount) -{ - const uint64_t segmentCount = markerGraphPaths.size(); - segmentOrientedReadInformation.resize(segmentCount); - const uint64_t batchSize = 10; - setupLoadBalancing(segmentCount, batchSize); - runThreads(&AssemblyGraph::storeSegmentOrientedReadInformationThreadFunction, threadCount); -} - - - - -// Gather oriented read information for each segment. -void AssemblyGraph::storeSegmentOrientedReadInformationThreadFunction(size_t threadId) -{ - - // Loop over batches assigned to this thread. - uint64_t begin, end; - while(getNextBatch(begin, end)) { - - // Loop over segments assigned to this batch. - for(uint64_t segmentId=begin; segmentId!=end; ++segmentId) { - - // Get oriented read information for this segment. - getOrientedReadsOnSegment(segmentId, segmentOrientedReadInformation[segmentId]); - } - } -} - - - -#if 0 -void AssemblyGraph::clusterSegments(size_t threadCount, uint64_t minClusterSize) -{ - // Gather oriented read information for all segments. - storeSegmentOrientedReadInformation(threadCount); - - // Find the segment pairs. - const uint64_t segmentCount = markerGraphPaths.size(); - const uint64_t batchSize = 10; - setupLoadBalancing(segmentCount, batchSize); - clusterSegmentsData.threadPairs.resize(threadCount); - runThreads(&AssemblyGraph::clusterSegmentsThreadFunction1, threadCount); - - // For now, write a dot file with the pairs. 
- ofstream dot("SegmentGraph.dot"); - dot << "graph segmentGraph {\n"; - for(const auto& threadPairs: clusterSegmentsData.threadPairs) { - for(const auto& p: threadPairs) { - dot << p.first << "--" << p.second << ";\n"; - } - } - dot << "}\n"; - - - - // The segment pairs we found define a subgraph of the assembly graph. - // Compute connected components of this subgraph. - // The connected components of sufficient size become clusters. - vector<uint64_t> rank(segmentCount); - vector<uint64_t> parent(segmentCount); - boost::disjoint_sets<uint64_t*, uint64_t*> disjointSets(&rank[0], &parent[0]); - for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) { - disjointSets.make_set(segmentId); - } - for(const auto& threadPairs: clusterSegmentsData.threadPairs) { - for(const auto& p: threadPairs) { - disjointSets.union_set(p.first, p.second); - } - } - - // Gather the segments in each connected component. - vector< vector<uint64_t> > components(segmentCount); - for(uint64_t segmentId=0; segmentId<segmentCount; segmentId++) { - const uint64_t componentId = disjointSets.find_set(segmentId); - components[componentId].push_back(segmentId); - } - - // Each connected components of size at least minClusterSize - // becomes a cluster. - vector< pair<uint64_t, uint64_t> > clusterTable; - for(uint64_t componentId=0; componentId<segmentCount; componentId++) { - const vector<uint64_t>& component = components[componentId]; - const uint64_t componentSize = component.size(); - if(component.size() >= minClusterSize) { - clusterTable.push_back(make_pair(componentId, componentSize)); - } - } - - // Sort the clusters by decreasing size. 
- sort(clusterTable.begin(), clusterTable.end(), - OrderPairsBySecondOnlyGreater<uint64_t, uint64_t>()); - - cout << "Found " << clusterTable.size() << " segment clusters with the following sizes:" << endl; - uint64_t clusteredSegmentCount = 0; - for(uint64_t clusterId=0; clusterId<clusterTable.size(); clusterId++) { - const auto& p = clusterTable[clusterId]; - const uint64_t componentSize = p.second; - cout << " " << componentSize; - clusteredSegmentCount += componentSize; - } - cout << endl; - cout << "Out of " << segmentCount << " segments, " << - clusteredSegmentCount << " were assigned to a cluster." << endl; - - - - // Store the cluster id of each segment. - createNew(clusterIds, "Mode3-ClusterIds"); - clusterIds.resize(segmentCount); - fill(clusterIds.begin(), clusterIds.end(), std::numeric_limits<uint64_t>::max()); - for(uint64_t clusterId=0; clusterId<clusterTable.size(); clusterId++) { - const auto& p = clusterTable[clusterId]; - const uint64_t componentId = p.first; - const vector<uint64_t>& cluster = components[componentId]; - for(const uint64_t segmentId: cluster) { - clusterIds[segmentId] = clusterId; - } - } - - - - // Clean up. - clusterSegmentsData.threadPairs.clear(); - clusterSegmentsData.threadPairs.shrink_to_fit(); - segmentOrientedReadInformation.clear(); - segmentOrientedReadInformation.shrink_to_fit(); -} - - - -void AssemblyGraph::clusterSegmentsThreadFunction1(size_t threadId) -{ - - auto& threadPairs = clusterSegmentsData.threadPairs[threadId]; - threadPairs.clear(); - vector<uint64_t> descendants; - - // Loop over batches assigned to this thread. - uint64_t begin, end; - while(getNextBatch(begin, end)) { - - // Loop over segments assigned to this batch. - for(uint64_t segmentId0=begin; segmentId0!=end; ++segmentId0) { - - // Add pairs for which the lowest numbered segment is segmentId0. 
- addClusterPairs(threadId, segmentId0); - } - } -} - - - -void AssemblyGraph::addClusterPairs(size_t threadId, uint64_t startSegmentId) -{ - // EXPOSE THESE CONSTANTS WHEN CODE STABILIZES. - const uint64_t minCommonReadCount = 10; - const double maxUnexplainedFraction = 0.25; - const double minJaccard = 0.7; - const uint64_t pairCountPerSegment = 1; - const uint64_t maxDistance = 200; - - // std::lock_guard<std::mutex> lock(mutex); // *********** TAKE OUT - - // Do a BFS and check each pair as we encounter it. - // The BFS terminates when we found enough pairs. - - // Do the BFS in both directions. - for(uint64_t direction=0; direction<1; direction++) { // ********* ONE DIRECTION ONLY - // cout << startSegmentId << " direction " << direction << endl; - - // Initialize the BFS. - std::queue<uint64_t> q; - q.push(startSegmentId); - std::map<uint64_t, uint64_t> distanceMap; - distanceMap.insert(make_pair(startSegmentId, 0)); - uint64_t foundCount = 0; - - // BFS loop. - while(not q.empty()) { - const uint64_t segmentId0 = q.front(); - // cout << "Dequeued " << segmentId0 << endl; - q.pop(); - - const uint64_t distance0 = distanceMap[segmentId0]; - const uint64_t distance1 = distance0 + 1; - - // Loop over children or parents of segmentId0. - const auto neighbors = (direction == 0) ? linksBySource[segmentId0] : linksByTarget[segmentId0]; - for(const uint64_t linkId01: neighbors) { - const Link& link01 = links[linkId01]; - const uint64_t segmentId1 = (direction==0) ? link01.segmentId1 : link01.segmentId0; - - // If we already encountered segmentId1, skip it. - if(distanceMap.find(segmentId1) != distanceMap.end()) { - continue; - } - - // Enqueue it. - if(distance1 < maxDistance) { - q.push(segmentId1); - } - distanceMap.insert(make_pair(segmentId1, distance1)); - - // cout << "Found " << segmentId1 << endl; - - // Check the pair (startSegmentId, segmentId1). 
- SegmentPairInformation info; - analyzeSegmentPair(startSegmentId, segmentId1, - segmentOrientedReadInformation[startSegmentId], - segmentOrientedReadInformation[segmentId1], - markers, info); - if(info.commonCount < minCommonReadCount) { - continue; - } - if(info.maximumUnexplainedFraction() > maxUnexplainedFraction) { - continue; - } - if(info.jaccard() < minJaccard) { - continue; - } - - // Store it. - // cout << "Stored " << segmentId1 << endl; - clusterSegmentsData.threadPairs[threadId].push_back(make_pair(startSegmentId, segmentId1)); - ++foundCount; - if(foundCount >= pairCountPerSegment) { - break; - } - } - - if(foundCount >= pairCountPerSegment) { - break; - } - } - } -} -#endif - - - -// Find descendants of a given segment, up to a given distance in the graph. -void AssemblyGraph::findDescendants( - uint64_t startSegmentId, - uint64_t maxDistance, - vector<uint64_t>& descendants - ) const -{ - // Initialize the BFS. - descendants.clear(); - std::queue<uint64_t> q; - q.push(startSegmentId); - std::map<uint64_t, uint64_t> distanceMap; - distanceMap.insert(make_pair(startSegmentId, 0)); - - // BFS loop. - while(not q.empty()) { - const uint64_t segmentId0 = q.front(); - q.pop(); - - const uint64_t distance0 = distanceMap[segmentId0]; - const uint64_t distance1 = distance0 + 1; - - // Loop over children of segmentId0. - for(const uint64_t linkId01: linksBySource[segmentId0]) { - const Link& link01 = links[linkId01]; - const uint64_t segmentId1 = link01.segmentId1; - - // If we already encountered segmentId1, skip it. 
- if(distanceMap.find(segmentId1) != distanceMap.end()) { - continue; - } - - descendants.push_back(segmentId1); - distanceMap.insert(make_pair(segmentId1, distance1)); - if(distance1 < maxDistance) { - q.push(segmentId1); - } - } - } -} - - - -void AssemblyGraph::analyzeSubgraph( - const vector<uint64_t>& segmentIds, - vector<AnalyzeSubgraphClasses::Cluster>& clusters, - bool debug) const -{ - if(segmentIds.size() <= 64) { - analyzeSubgraphTemplate<64>(segmentIds, clusters, debug); - } else if(segmentIds.size() <= 128) { - analyzeSubgraphTemplate<128>(segmentIds, clusters, debug); - } else if(segmentIds.size() <= 192) { - analyzeSubgraphTemplate<192>(segmentIds, clusters, debug); - } else if(segmentIds.size() <= 256) { - analyzeSubgraphTemplate<256>(segmentIds, clusters, debug); - } else if(segmentIds.size() <= 320) { - analyzeSubgraphTemplate<320>(segmentIds, clusters, debug); - } else if(segmentIds.size() <= 384) { - analyzeSubgraphTemplate<384>(segmentIds, clusters, debug); - } else if(segmentIds.size() <= 448) { - analyzeSubgraphTemplate<448>(segmentIds, clusters, debug); - } else if(segmentIds.size() <= 512) { - analyzeSubgraphTemplate<512>(segmentIds, clusters, debug); - } else { - SHASTA_ASSERT(0); - } -} - - - -template<uint64_t N> void AssemblyGraph::analyzeSubgraphTemplate( - const vector<uint64_t>& unsortedSegmentIds, - vector<AnalyzeSubgraphClasses::Cluster>& clusters, - bool debug) const -{ - // EXPOSE WHEN CODE STABILIZES. - const double fractionThreshold = 0.05; - const uint64_t minClusterCoverage = 6; - const uint64_t minSegmentCoverage = 6; - - using BitVector = std::bitset<N>; - using JourneySnippet = AnalyzeSubgraphClasses::JourneySnippet; - using Cluster = AnalyzeSubgraphClasses::Cluster; - using SnippetGraphVertex = AnalyzeSubgraphClasses::SnippetGraphVertex; - using SnippetGraph = AnalyzeSubgraphClasses::SnippetGraph; - using vertex_descriptor = SnippetGraph::vertex_descriptor; - - // Create a sorted version of the segmentIds. 
We will need it later. - vector<uint64_t> segmentIds = unsortedSegmentIds; - sort(segmentIds.begin(), segmentIds.end()); - - // Gather triplets (orientedReadId, position in assembly graph journey, segmentId). - using Triplet = tuple<OrientedReadId, uint64_t, uint64_t>; - vector<Triplet> triplets; - for(const uint64_t segmentId: segmentIds) { - const auto v = assemblyGraphJourneyInfos[segmentId]; - for(const auto& p: v) { - const OrientedReadId orientedReadId = p.first; - const uint64_t position = p.second; - triplets.push_back(Triplet(orientedReadId, position, segmentId)); - } - } - sort(triplets.begin(), triplets.end()); - - // Write the triplets. - if(debug) { - ofstream csv("Triplets.csv"); - for(const Triplet& triplet: triplets) { - csv << get<0>(triplet) << ","; - csv << get<1>(triplet) << ","; - csv << get<2>(triplet) << "\n"; - } - } - - - - // Find streaks for the same OrientedReadId where the position - // increases by 1 each time. - // Each streak generates a JourneySnippet. - vector<JourneySnippet> snippets; - for(uint64_t i=0; i<triplets.size(); /* Increment later */) { - const OrientedReadId orientedReadId = get<0>(triplets[i]); - - // Find this streak. - uint64_t streakBegin = i; - uint64_t streakEnd = streakBegin + 1; - for(; streakEnd<triplets.size(); streakEnd++) { - if(get<0>(triplets[streakEnd]) != orientedReadId) { - break; - } - if(get<1>(triplets[streakEnd]) != get<1>(triplets[streakEnd-1]) + 1) { - break; - } - } - - // Add a snippet. - JourneySnippet snippet; - snippet.orientedReadId = orientedReadId; - snippet.firstPosition = get<1>(triplets[streakBegin]); - for(uint64_t j=streakBegin; j!=streakEnd; ++j) { - snippet.segmentIds.push_back(get<2>(triplets[j])); - } - snippets.push_back(snippet); - - // Prepare to process the next streak. - i = streakEnd; - } - - - - // Write the snippets. 
- if(debug) { - ofstream csv("JourneySnippets.csv"); - csv << "SnippetIndex,OrientedReadId,First position,LastPosition,SegmentIds\n"; - for(uint64_t snippetIndex=0; snippetIndex<snippets.size(); snippetIndex++) { - const JourneySnippet& snippet = snippets[snippetIndex]; - csv << snippetIndex << ","; - csv << snippet.orientedReadId << ","; - csv << snippet.firstPosition << ","; - csv << snippet.lastPosition() << ","; - for(const uint64_t segmentId: snippet.segmentIds) { - csv << segmentId << ","; - } - csv << "\n"; - } - } - - - - // For each snippet, create a BitVector that describes the segments - // the snippet visits. - const uint64_t snippetCount = snippets.size(); - vector<BitVector> bitVectors(snippetCount); - vector<uint64_t> bitVectorsPopCount(snippetCount); - for(uint64_t snippetIndex=0; snippetIndex<snippetCount; snippetIndex++) { - const JourneySnippet& snippet = snippets[snippetIndex]; - BitVector& bitVector = bitVectors[snippetIndex]; - - for(const uint64_t segmentId: snippet.segmentIds) { - auto it = lower_bound(segmentIds.begin(), segmentIds.end(), segmentId); - SHASTA_ASSERT(it != segmentIds.end()); - SHASTA_ASSERT(*it == segmentId); - const uint64_t bitIndex = it - segmentIds.begin(); - bitVector.set(bitIndex); - } - bitVectorsPopCount[snippetIndex] = bitVector.count(); - } - - - - // Create the SnippetGraph. - // A vertex represents a set of snippets and stores - // the corresponding snippet indexes. - // An edge x->y is created if there is at least one snippet in y - // that is an approximate subset of a snippet in x. - // We express this condition as |y-x| < fractionThreshold * |y| - // We start with one snippet per vertex. 
- SnippetGraph graph; - vector<vertex_descriptor> vertexTable; - std::map<vertex_descriptor, uint64_t> vertexMap; - for(uint64_t snippetIndex=0; snippetIndex<snippetCount; snippetIndex++) { - const auto v = add_vertex(SnippetGraphVertex(snippetIndex), graph); - vertexTable.push_back(v); - vertexMap.insert(make_pair(v, snippetIndex)); - } - for(uint64_t iy=0; iy<snippetCount; iy++) { - const BitVector& y = bitVectors[iy]; - const uint64_t threshold = uint64_t(std::round(fractionThreshold * double(bitVectorsPopCount[iy]))); - const vertex_descriptor vy = vertexTable[iy]; - for(uint64_t ix=0; ix<snippetCount; ix++) { - if(ix == iy) { - continue; - } - const BitVector& x = bitVectors[ix]; - - // Compute z = y-x. - BitVector z = y; - z &= (~x); - - if(z.count() <= threshold) { - const vertex_descriptor vx = vertexTable[ix]; - add_edge(vx, vy, graph); - } - } - } - - - - // Compute strongly connected components of the SnippetGraph. - std::map<vertex_descriptor, uint64_t> componentMap; - const uint64_t componentCount = boost::strong_components( - graph, - boost::make_assoc_property_map(componentMap), - boost::vertex_index_map(boost::make_assoc_property_map(vertexMap))); - // cout << "Found " << componentCount << " strongly connected components." << endl; - - // Gather the vertices of each strongly connected component. - vector< vector<vertex_descriptor> > components(componentCount); - BGL_FORALL_VERTICES_T(v, graph, SnippetGraph) { - const uint64_t componentId = componentMap[v]; - SHASTA_ASSERT(componentId < componentCount); - components[componentId].push_back(v); - } - if(false) { - cout << "Strongly connected components:\n"; - for(uint64_t componentId=0; componentId<componentCount; componentId++) { - cout << componentId << ": "; - for(const vertex_descriptor v: components[componentId]) { - cout << vertexMap[v] << " "; - } - cout << "\n"; - } - } - - - - // Condense the strongly connected components. - // After this, the SnippetGraph is guaranteed to be acyclic. 
- for(const vector<vertex_descriptor>& component: components) { - if(component.size() == 1) { - continue; - } - - // Create a new vertex to represent this component. - const auto vNew = add_vertex(graph); - vector<uint64_t>& snippetsNew = graph[vNew].snippetIndexes; - for(const vertex_descriptor v: component) { - const vector<uint64_t>& snippets = graph[v].snippetIndexes; - SHASTA_ASSERT(snippets.size() == 1); - snippetsNew.push_back(snippets.front()); - } - - // Create the new edges. - for(const vertex_descriptor v0: component) { - - // Out-edges. - BGL_FORALL_OUTEDGES_T(v0, e01, graph, SnippetGraph) { - const vertex_descriptor v1 = target(e01, graph); - if(v1 != vNew) { - add_edge(vNew, v1,graph); - } - } - - // In-edges. - BGL_FORALL_INEDGES_T(v0, e10, graph, SnippetGraph) { - const vertex_descriptor v1 = source(e10, graph); - if(v1 != vNew) { - add_edge(v1, vNew, graph); - } - } - } - - // Remove the old vertices and their edges. - for(const vertex_descriptor v: component) { - clear_vertex(v, graph); - remove_vertex(v, graph); - } - } - - - // Compute which maximal vertices each vertex is a descendant of. - std::map<vertex_descriptor, vector<vertex_descriptor> > ancestorMap; - BGL_FORALL_VERTICES_T(v0, graph, SnippetGraph) { - if(in_degree(v0, graph) != 0) { - continue; // Not a maximal vertex. - } - - // Find the descendants of this maximal vertex. - vector<vertex_descriptor> descendants; - graph.findDescendants(v0, descendants); - - // Update the ancestor map. - for(const vertex_descriptor v1: descendants) { - ancestorMap[v1].push_back(v0); - } - } - - - - // Each maximal vertex generates a cluster consisting of the vertices - // that descend from it and from no other maximal vertex. - // Gather the vertices in each cluster. 
- std::map<vertex_descriptor, vector<vertex_descriptor> > clusterMap; - uint64_t unclusterVertexCount = 0; - BGL_FORALL_VERTICES_T(v1, graph, SnippetGraph) { - const vector<vertex_descriptor>& ancestors = ancestorMap[v1]; - if(ancestors.size() == 1) { - const vertex_descriptor v0 = ancestors.front(); - clusterMap[v0].push_back(v1); - } else { - ++unclusterVertexCount; - } - } - cout << "Found " << unclusterVertexCount << " unclustered vertices." << endl; - - - - // Gather the snippets in each cluster. - clusters.clear(); - for(const auto& p: clusterMap) { - const vector<vertex_descriptor>& clusterVertices = p.second; - clusters.resize(clusters.size() + 1); - Cluster& cluster = clusters.back(); - - vector<uint64_t> clusterSnippetIndexes; // Only used for debug output. - for(const vertex_descriptor v: clusterVertices) { - const vector<uint64_t>& snippetIndexes = graph[v].snippetIndexes; - for(const uint64_t snippetIndex: snippetIndexes) { - cluster.snippets.push_back(snippets[snippetIndex]); - clusterSnippetIndexes.push_back(snippetIndex); - } - } - cluster.constructSegments(); - cluster.cleanupSegments(minSegmentCoverage); - cout << "Found a cluster candidate with " << - clusterVertices.size() << " vertices and " << - cluster.snippets.size() << " snippets:" << endl; - for(const uint64_t snippetIndex: clusterSnippetIndexes) { - cout << snippetIndex << " "; - } - cout << endl; - - // If coverage on this cluster is too low, discard it. - if(cluster.coverage() < minClusterCoverage) { - clusters.resize(clusters.size() - 1); - cout << "This cluster candidate was discarded because of low coverage." 
<< endl; - continue; - } - - // This cluster will be stored and is assigned this clusterId; - const uint64_t clusterId = clusters.size() - 1; - - if(debug) { - - cout << "This cluster was stored as cluster " << clusterId << endl; - cout << "Segment(coverage) for this cluster:\n"; - for(const auto& p: cluster.segments) { - cout << p.first << "(" << p.second << ") "; - } - cout << endl; - } - - // Mark the vertices of this cluster. - for(const vertex_descriptor v: clusterVertices) { - graph[v].clusterId = clusterId; - } - } - graph.clusterCount = clusters.size(); - - - - // Write out the SnippetGraph. - if(debug) { - graph.writeGraphviz("SnippetGraph.dot"); - } -} - - - -void AssemblyGraph::AnalyzeSubgraphClasses::SnippetGraph::findDescendants( - const vertex_descriptor vStart, - vector<vertex_descriptor>& descendants) const -{ - const SnippetGraph& graph = *this; - - // Initialize the BFS. - std::queue<vertex_descriptor> q; - q.push(vStart); - std::set<vertex_descriptor> descendantsSet; - descendantsSet.insert(vStart); - - // BFS loop. 
- while(not q.empty()) { - const vertex_descriptor v0 = q.front(); - q.pop(); - - BGL_FORALL_OUTEDGES(v0, e01, graph, SnippetGraph) { - const vertex_descriptor v1 = target(e01, graph); - if(descendantsSet.find(v1) == descendantsSet.end()) { - q.push(v1); - descendantsSet.insert(v1); - } - } - } - - descendants.clear(); - copy(descendantsSet.begin(), descendantsSet.end(), back_inserter(descendants)); -} - - - -void AssemblyGraph::AnalyzeSubgraphClasses::SnippetGraph::writeGraphviz( - const string& fileName) const -{ - const SnippetGraph& graph = *this; - - ofstream dot(fileName); - dot << "digraph SnippetGraph{\n" - "node [shape=rectangle];\n"; - BGL_FORALL_VERTICES(v, graph, SnippetGraph) { - dot << "\"" << v << "\" [label=\""; - const vector<uint64_t>& snippetIndexes = graph[v].snippetIndexes; - for(const uint64_t snippetIndex: snippetIndexes) { - dot << snippetIndex; - dot << "\\n"; - } - dot << "\""; - const uint64_t clusterId = graph[v].clusterId; - if(clusterId != std::numeric_limits<uint64_t>::max()) { - dot << " style=filled fillcolor=\"" << - float(clusterId)/float(clusterCount) << - ",0.3,1\""; - } - dot << "];\n"; - } - BGL_FORALL_EDGES(e, graph, SnippetGraph) { - const vertex_descriptor vx = source(e, graph); - const vertex_descriptor vy = target(e, graph); - dot << "\"" << vx << "\"->\"" << vy << "\";\n"; - } - dot << "}\n"; - -} - - - -void AssemblyGraph::AnalyzeSubgraphClasses::Cluster::constructSegments() -{ - // A map with Key=segmentId, value = coverage. 
- std::map<uint64_t, uint64_t> segmentMap; - - for(const JourneySnippet& snippet: snippets) { - for(const uint64_t segmentId: snippet.segmentIds) { - auto it = segmentMap.find(segmentId); - if(it == segmentMap.end()) { - segmentMap.insert(make_pair(segmentId, 1)); - } else { - ++(it->second); - } - } - } - - segments.clear(); - copy(segmentMap.begin(), segmentMap.end(), back_inserter(segments)); -} - - - -void AssemblyGraph::AnalyzeSubgraphClasses::Cluster::cleanupSegments(uint64_t minSegmentCoverage) -{ - vector< pair<uint64_t, uint64_t > > newSegments; - for(const auto& p: segments) { - if(p.second >= minSegmentCoverage) { - newSegments.push_back(p); - } - } - segments.swap(newSegments); -} - - - -vector<uint64_t> AssemblyGraph::AnalyzeSubgraphClasses::Cluster::getSegments() const -{ - vector<uint64_t> v; - for(const auto& p: segments) { - v.push_back(p.first); - } - return v; -} - - - -// Create an assembly path starting at a given segment. -void AssemblyGraph::createAssemblyPath( - uint64_t startSegmentId, - uint64_t direction, // 0 = forward, 1 = backward - AssemblyPath& path - ) const -{ - // EXPOSE WHEN CODE STABILIZES. - const uint64_t minCommonForLink = 3; - const uint64_t minCommonForReference = 3; - const double minJaccard = 0.75; - const int32_t minLinkSeparation = -20; - - const bool debug = false; - if(true) { - cout << timestamp << "createAssemblyPath begins at segment " << startSegmentId << - ", direction " << direction << endl; - } - - - - // At each iteration, we start from segmentIdA (the current "primary segment") - // and move in the specified direction until we find segmentIdB with - // sufficiently high Jaccard similarity and number of - // common oriented reads with segmentIdA. - // At each step, we choose the links that has the most common oriented - // reads with the current primary segment. 
- uint64_t referenceSegmentId = startSegmentId; - SegmentOrientedReadInformation infoReference; - getOrientedReadsOnSegment(referenceSegmentId, infoReference); - uint64_t segmentId0 = startSegmentId; - path.clear(); - path.segments.push_back(AssemblyPathSegment(startSegmentId, true)); - vector<uint64_t> lastIterationSegments; - std::set< pair<uint64_t, uint64_t> > previousPairs; // (reference segment, current segment). - while(true) { - - if(debug) { - cout << "Reference segment " << referenceSegmentId << - ", segmentId0 " << segmentId0 << endl; - } - - // Loop over outgoing or incoming links of the current segment. - // Find the link with the most common reads with the reference segment. - const auto linkIds = (direction == 0) ? linksBySource[segmentId0] : linksByTarget[segmentId0]; - if(linkIds.empty()) { - if(debug) { - cout << "No links in this direction." << endl; - } - break; - } - uint64_t linkIdBest = invalid<uint64_t>; - uint64_t commonOrientedReadCountBest = 0; - for(const uint64_t linkId: linkIds) { - - // If link separation is too negative, skip it. - // The goal here is to avoid cycles in paths. - const Link& link = links[linkId]; - if(link.separation < minLinkSeparation) { - continue; - } - - // Count the number of common oriented reads between the reference segment and this link. - uint64_t commonOrientedReadCount; - analyzeSegmentLinkPair(referenceSegmentId, linkId, commonOrientedReadCount); - - // If better than the one we have it, record it. - if(commonOrientedReadCount > commonOrientedReadCountBest) { - linkIdBest = linkId; - commonOrientedReadCountBest = commonOrientedReadCount; - } - } - if(commonOrientedReadCountBest < minCommonForLink) { - if(debug) { - cout << "No good links found." << endl; - } - break; - } - const uint64_t linkId = linkIdBest; - if(debug) { - cout << "Best link " << linkId << - ", " << commonOrientedReadCountBest << - " common oriented reads with the reference segment." 
<< endl; - } - - // Get the segment at the other side of this link. - const Link& link = links[linkId]; - const uint64_t segmentId1 = (direction==0) ? link.segmentId1 : link.segmentId0; - if(debug) { - cout << "segmentId1 " << segmentId1 << endl; - } - lastIterationSegments.push_back(segmentId1); - - // Check that we haven't been here before. - if(previousPairs.contains(make_pair(referenceSegmentId, segmentId1))) { - break; - } - previousPairs.insert(make_pair(referenceSegmentId, segmentId1)); - - // Check segmentId1 against the reference segment. - SegmentOrientedReadInformation info1; - getOrientedReadsOnSegment(segmentId1, info1); - SegmentPairInformation info; - analyzeSegmentPair( - referenceSegmentId, segmentId1, - infoReference, info1, - markers, info); - if(debug) { - cout << "Jaccard " << info.jaccard() << endl; - } - - // If the Jaccard similarity is high, this becomes the new reference segment. - if(info.commonCount >= minCommonForReference and info.jaccard() >= minJaccard) { - referenceSegmentId = segmentId1; - getOrientedReadsOnSegment(referenceSegmentId, infoReference); - const uint64_t lastPrimarySegmentId = path.segments.back().id; - if(debug) { - cout << "New reference segment is " << segmentId1 << endl; - cout << "Previous reference segment is " << lastPrimarySegmentId << endl; - } - for(const uint64_t segmentId: lastIterationSegments) { - path.segments.push_back(AssemblyPathSegment(segmentId, false)); - if(debug) { - cout << "Added segment " << segmentId << " to path." 
<< endl; - } - if(segmentId != segmentId1) { - if(direction == 0) { - path.segments.back().previousPrimarySegmentId = lastPrimarySegmentId; - path.segments.back().nextPrimarySegmentId = segmentId1; - } else { - path.segments.back().previousPrimarySegmentId = segmentId1; - path.segments.back().nextPrimarySegmentId = lastPrimarySegmentId; - } - } - } - path.segments.back().isPrimary = true; - lastIterationSegments.clear(); - } - - segmentId0 = segmentId1; - } - - - - if(true) { - cout << timestamp << "createAssemblyPath3 ends." << endl; - } -} - - - -// Count the number of common oriented reads between a segment and a link, -// without counting oriented reads that appear more than once on the -// segment or on the link. -void AssemblyGraph::analyzeSegmentLinkPair( - uint64_t segmentId, - uint64_t linkId, - uint64_t& commonOrientedReadCount -) const -{ - // The oriented reads in this segment, - // with some extra information that we don't care about here. - const auto segmentOrientedReads = assemblyGraphJourneyInfos[segmentId]; - - // The oriented reads in this link, - // with some extra information that we don't care about here. - const auto linkOrientedReads = transitions[linkId]; - - // Joint loop over oriented reads. - commonOrientedReadCount = 0; - const auto segmentBegin = segmentOrientedReads.begin(); - const auto segmentEnd = segmentOrientedReads.end(); - const auto linkBegin = linkOrientedReads.begin(); - const auto linkEnd = linkOrientedReads.end(); - auto itSegment = segmentBegin; - auto itLink = linkBegin; - while(itSegment != segmentEnd and itLink != linkEnd) { - - if(itSegment->first < itLink->first) { - ++itSegment; - continue; - } - if(itLink->first < itSegment->first) { - ++itLink; - continue; - } - SHASTA_ASSERT(itSegment->first == itLink->first); - - // If it appears more than once in the segment, skip it. 
- auto itSegmentNext = itSegment + 1; - if(itSegmentNext != segmentEnd and itSegmentNext->first == itSegment->first) { - ++itSegment; - ++itLink; - continue; - } - if(itSegment != segmentBegin) { - auto itSegmentPrevious = itSegment - 1; - if(itSegmentPrevious->first == itSegment->first) { - ++itSegment; - ++itLink; - continue; - } - } - - // If it appears more than once in the link, skip it. - auto itLinkNext = itLink + 1; - if(itLinkNext != linkEnd and itLinkNext->first == itLink->first) { - ++itSegment; - ++itLink; - continue; - } - if(itLink != linkBegin) { - auto itLinkPrevious = itLink - 1; - if(itLinkPrevious->first == itLink->first) { - ++itSegment; - ++itLink; - continue; - } - } - - // Ok, this is a common oriented read that appears only once - // in both the segment and the link. - ++commonOrientedReadCount; - ++itSegment; - ++itLink; - } - -} - - - -// Given a segment, use a BFS to move in the specified direction until -// we find a segment with sufficiently high Jaccard similarity -// and number of common reads. -// This returns invalid<uint64_t> if no such segment is found -// within the specified distance. -uint64_t AssemblyGraph::findSimilarSegmentBfs( - uint64_t segmentIdA, - uint64_t direction, // 0 = forward, 1 = backward - uint64_t maxDistance, - uint64_t minCommon, - double minJaccard) const -{ - const bool debug = true; - if(debug) { - cout << "findSimilarSegmentBfs starts " << segmentIdA << " " << direction << endl; - } - - // Sanity check. - SHASTA_ASSERT(maxDistance > 0); - - // Get the oriented reads on segmentIdA. - SegmentOrientedReadInformation infoA; - getOrientedReadsOnSegment(segmentIdA, infoA); - - // Initialize a BFS. - std::queue<uint64_t> q; - q.push(segmentIdA); - - // Keep track of segments we already encountered and their distance. - // Key = segmentId; - // Value = distance. - std::map<uint64_t, uint64_t> distanceMap; - distanceMap.insert(make_pair(segmentIdA, 0)); - - - - // BFS loop. 
- while(not q.empty()) { - - // Dequeue a segment. - const uint64_t segmentId0 = q.front(); - q.pop(); - const uint64_t distance0 = distanceMap[segmentId0]; - SHASTA_ASSERT(distance0 < maxDistance); - const uint64_t distance1 = distance0 + 1; - if(debug) { - cout << "dequeued " << segmentId0 << " " << distance0 << endl; - } - - // Loop over outgoing or incoming links. - const auto linkIds = (direction == 0) ? linksBySource[segmentId0] : linksByTarget[segmentId0]; - for(const uint64_t linkId: linkIds) { - const Link& link = links[linkId]; - - // Get the segment at the other side of this link. - const uint64_t segmentId1 = (direction==0) ? link.segmentId1 : link.segmentId0; - - // If we already found it, skip it. - if(distanceMap.contains(segmentId1)) { - continue; - } - - if(debug) { - cout << "found " << segmentId1 << " " << distance1 << endl; - } - - // Get the oriented reads on segmentId1. - SegmentOrientedReadInformation info1; - getOrientedReadsOnSegment(segmentId1, info1); - - // See how similar this is to segmentIdA. - SegmentPairInformation infoA1; - analyzeSegmentPair( - segmentIdA, segmentId1, - infoA, info1, - markers, infoA1); - - // If this satisfies our criteria, we are done. - if(infoA1.commonCount >= minCommon and - infoA1.jaccard() >= minJaccard) { - if(debug) { - cout << "findSimilarSegmentBFS returns " << segmentId1 << " " << direction << endl; - } - return segmentId1; - } - - // This segment did not satisfy our criteria, so we - // have to continue the BFS. - if(distance1 < maxDistance) { - q.push(segmentId1); - distanceMap.insert(make_pair(segmentId1, distance1)); - if(debug) { - cout << "enqueued " << segmentId1 << " " << distance1 << endl; - } - } - } - - } - - - - // If getting here, we did not find a segment that satisfies - // the requested criteria. 
- if(debug) { - cout << "findSimilarSegmentBfs returns invalid" << endl; - } - return invalid<uint64_t>; -} - - - -// Given a segment, move in the specified direction, -// in order of increasing distance in markers, until -// we find a segment with sufficiently high Jaccard similarity -// and number of common reads. -// This returns invalid<uint64_t> if no such segment is found -// within the specified distance. -uint64_t AssemblyGraph::findSimilarSegment( - uint64_t segmentIdA, - uint64_t direction, // 0 = forward, 1 = backward - uint64_t maxDistance, // In markers - uint64_t minLinkCoverage, - int32_t minLinkSeparation, - uint64_t minCommon, - double maxUnexplainedFraction, - double minJaccard, - vector<uint64_t>& segments) const -{ - const bool debug = false; - if(debug) { - cout << "findSimilarSegment begins, segmentIdA " << segmentIdA << endl; - } - // Sanity check. - SHASTA_ASSERT(maxDistance > 0); - - segments.clear(); - - // Get the oriented reads on segmentIdA. - SegmentOrientedReadInformation infoA; - getOrientedReadsOnSegment(segmentIdA, infoA); - - // (Offset, segmentId) for queued segments. - std::multimap<uint64_t, uint64_t> q; - q.insert(make_pair(0, segmentIdA)); - - // The segments that we already encountered. - std::set<uint64_t> visitedSegmentSet; - visitedSegmentSet.insert(segmentIdA); - - - - // Search loop. - while(not q.empty()) { - - // Dequeue the segment with the smallest offset. - const auto it0 = q.begin(); - const uint64_t segmentId0 = it0->second; - q.erase(it0); - - // Analyze against segmentIdA. - SegmentOrientedReadInformation info0; - getOrientedReadsOnSegment(segmentId0, info0); - SegmentPairInformation infoA0; - analyzeSegmentPair( - segmentIdA, segmentId0, - infoA, info0, - markers, infoA0); - - // Add it to our list of segments, if possible. 
- const double unexplainedFraction = infoA0.unexplainedFraction(0); - if(unexplainedFraction < maxUnexplainedFraction) { - segments.push_back(segmentId0); - } - - // If unexplained fraction and Jaccard similarity are low, we are done. - if(segmentId0 != segmentIdA) { - if((unexplainedFraction < maxUnexplainedFraction) and (infoA0.jaccard() >= minJaccard)) { - SHASTA_ASSERT(segments.back() == segmentId0); - return segmentId0; - } - } - - if(debug) { - cout << "Dequeued " << segmentId0 << endl; - } - - // Loop over outgoing or incoming links. - const auto linkIds = (direction == 0) ? linksBySource[segmentId0] : linksByTarget[segmentId0]; - for(const uint64_t linkId: linkIds) { - // If link coverage is too low, skip. - if(transitions.size(linkId) < minLinkCoverage) { - continue; - } - - // If link separation is too negative, skip it. - // The goal here is to avoid cycles in paths. - const Link& link = links[linkId]; - if(link.separation < minLinkSeparation) { - continue; - } - - // Get the segment at the other side of this link. - const uint64_t segmentId1 = (direction==0) ? link.segmentId1 : link.segmentId0; - if(debug) { - cout << "Found " << segmentId1 << endl; - } - - // If we already found it, skip it. - if(visitedSegmentSet.contains(segmentId1)) { - if(debug) { - cout << "Already found, skipping." << endl; - } - continue; - } - visitedSegmentSet.insert(segmentId1); - - // Get the oriented reads on segmentId1. - SegmentOrientedReadInformation info1; - getOrientedReadsOnSegment(segmentId1, info1); - - // Analyze similarity to segmentIdA. - SegmentPairInformation infoA1; - analyzeSegmentPair( - segmentIdA, segmentId1, - infoA, info1, - markers, infoA1); - - // If not enough common segments, skip it. - if(infoA1.commonCount < minCommon) { - if(debug) { - cout << "Not enough common reads." << endl; - } - continue; - } - - // Offset estimates are not reliable. - // Don't use them to rule out segments. -#if 0 - // If not in the expected direction, skip it. 
- uint64_t offset; - if(direction == 0) { - if(infoA1.offset < 0) { - if(debug) { - cout << "Not in the forward direction." << endl; - } - continue; - } else { - offset = uint64_t(infoA1.offset); - } - } else { - if(infoA1.offset > 0) { - if(debug) { - cout << "Not in the backward direction." << endl; - } - continue; - } else { - offset = uint64_t(-infoA1.offset); - } - } -#endif - - // If we went too far, skip it. - if(labs(infoA1.offset) > maxDistance) { - if(debug) { - cout << "Too far." << endl; - } - continue; - } - - // Queue it. - q.insert(make_pair(labs(infoA1.offset), segmentId1)); - if(debug) { - cout << "Queued " << segmentId1 << endl; - } - - } - - } - - - - // If getting here, we did not find a segment that satisfies - // the requested criteria. - return invalid<uint64_t>; -} - - - -// BFS with given begin/end. -// Does a BFS which starts at segmentIdA. -// and ends when segmentIdB is encountered. -// The BFS if forward if direction is 0 -// and backward if direction is 1. -// Computes a vector of all the segments encountered, -// excluding segmentIdA and segmentIdB, -// in the order in which they are encountered in the BFS. -void AssemblyGraph::targetedBfs( - uint64_t segmentIdA, - uint64_t segmentIdB, - uint64_t direction, - vector<uint64_t>& segments - ) const -{ - - // Initialize the BFS. - std::queue<uint64_t> q; - q.push(segmentIdA); - - // Keep track of segments we already encountered. - std::set<uint64_t> segmentSet; - segmentSet.insert(segmentIdA); - - - - // BFS loop. - segments.clear(); - while(not q.empty()) { - - // Dequeue a segment. - const uint64_t segmentId0 = q.front(); - q.pop(); - - // Loop over outgoing or incoming links. - const auto linkIds = (direction == 0) ? linksBySource[segmentId0] : linksByTarget[segmentId0]; - for(const uint64_t linkId: linkIds) { - const Link& link = links[linkId]; - - // Get the segment at the other side of this link. - const uint64_t segmentId1 = (direction==0) ? 
link.segmentId1 : link.segmentId0; - - // If we found segmentIdB, we are done. - if(segmentId1 == segmentIdB) { - break; - } - - // If we already found it, skip it. - if(segmentSet.contains(segmentId1)) { - continue; - } - - // Queue and store this segment. - q.push(segmentId1); - segments.push_back(segmentId1); - segmentSet.insert(segmentId1); - } - } - -} - - -// Assemble the assembly paths stored in the JaccardGraph. -void AssemblyGraph::assembleJaccardGraphPaths() -{ - const JaccardGraph& jaccardGraph = *jaccardGraphPointer; - ofstream fasta("JaccardGraphPaths.fasta"); - - uint64_t totalSequenceAssembled = 0; - for(uint64_t clusterId=0; clusterId<jaccardGraph.assemblyPaths.size(); clusterId++) { - const vector<uint64_t>& primarySegments = jaccardGraph.assemblyPaths[clusterId]; - AssemblyPath assemblyPath; - assembleJaccardGraphPath(primarySegments, assemblyPath); - - const auto& sequence = assemblyPath.rawSequence; - totalSequenceAssembled += sequence.size(); - fasta << ">" << clusterId << " " << sequence.size() << "\n"; - copy(sequence.begin(), sequence.end(), ostream_iterator<Base>(fasta)); - fasta << "\n"; - } - cout << "Assembled a total " << totalSequenceAssembled << " bases." << endl; -} - - - -void AssemblyGraph::assembleJaccardGraphPath( - const vector<uint64_t>& primarySegments, - AssemblyPath& assemblyPath) -{ - SHASTA_ASSERT(primarySegments.size() >= 2); - - const JaccardGraph& jaccardGraph = *jaccardGraphPointer; - - // Initialize the path and add the first primary segment. - assemblyPath.segments.clear(); - assemblyPath.links.clear(); - assemblyPath.segments.push_back(AssemblyPathSegment(primarySegments.front(), true)); - - // Add the remaining primary and secondary segments. - for(uint64_t i=1; i<primarySegments.size(); i++) { - const uint64_t primarySegment0 = primarySegments[i-1]; - const uint64_t primarySegment1 = primarySegments[i]; - - // Get the JaccardGraph vertices corresponding to these primary segments. 
- const JaccardGraph::vertex_descriptor v0 = jaccardGraph.vertexTable[primarySegment0]; - const JaccardGraph::vertex_descriptor v1 = jaccardGraph.vertexTable[primarySegment1]; - - // Get the JaccardGraph edge between these two vertices. - JaccardGraph::edge_descriptor e; - bool edgeWasFound = false; - tie(e, edgeWasFound) = edge(v0, v1, jaccardGraph); - SHASTA_ASSERT(edgeWasFound); - const JaccardGraphEdge& edge = jaccardGraph[e]; - - // Access the secondary segments on this edge. - const vector<uint64_t>& secondarySegmentIds = edge.segmentIds; - - // Add the secondary segments between these two primary segments. - for(const uint64_t segmentId: secondarySegmentIds) { - AssemblyPathSegment assemblyPathSegment(segmentId, false); - assemblyPathSegment.previousPrimarySegmentId = primarySegment0; - assemblyPathSegment.nextPrimarySegmentId = primarySegment1; - assemblyPath.segments.push_back(assemblyPathSegment); - - } - - // Add the next primary segment. - assemblyPath.segments.push_back(AssemblyPathSegment(primarySegment1, true)); - } - - // Assemble sequence for this path. - assemblyPath.assemble(*this); - -} - - - -// De Bruijn graph of the assembly graph journeys of all oriented reads. -// Each assembly graph journey is interpreted as a sequence of segment ids. -// Each vertex represents a sequence of K segment ids. -template<uint64_t K> void AssemblyGraph::createDeBruijnGraphTemplated() const -{ - // EXPOSE WHEN CODE STABILIZES. - const uint64_t minCoverage = 8; - - // Type used to store the sequence of a vertex (K segment ids). - using VertexSequence = array<uint64_t, K>; - - // Loop over all oriented reads to gather vertex sequences. - vector<VertexSequence> vertexSequences; - for(uint64_t i=0; i<assemblyGraphJourneys.size(); i++) { - - // Get the assembly graph journey for this oriented read. - const span<const AssemblyGraphJourneyEntry>& journey = assemblyGraphJourneys[i]; - const uint64_t journeyLength = journey.size(); - - // If too short, skip. 
- if(journeyLength < K) { - continue; - } - - // Extract sequences of length K from the journey. - // Loop over starting positions. - for(uint64_t j=0; j<=journeyLength-K; j++) { - - // Fill in the seqyence of length K starting here. - VertexSequence vertexSequence; - for(uint64_t k=0; k<K; k++) { - vertexSequence[k] = journey[j+k].segmentId; - } - // Store it. - vertexSequences.push_back(vertexSequence); - } - } - - // Count how many times each sequence was found. - vector<uint64_t> coverage; - deduplicateAndCount(vertexSequences, coverage); - SHASTA_ASSERT(vertexSequences.size() == coverage.size()); - - // Each sequence with sufficient coverage generates a vertex. - using Vertex = pair<VertexSequence, uint64_t>; // Stores sequence and coverage. - using Graph = boost::adjacency_list<boost::listS, boost::listS, boost::bidirectionalS, Vertex>; - using vertex_descriptor = Graph::vertex_descriptor; - Graph graph; - - for(uint64_t i=0; i<vertexSequences.size(); i++) { - const uint64_t c = coverage[i]; - if(c >= minCoverage) { - add_vertex(Vertex(vertexSequences[i], c), graph); - } - } - - // To generate edges, index the vertices by their (K-1)-prefix. - using Prefix = array<uint64_t, K-1>; - std::map<Prefix, vector<vertex_descriptor> > vertexMap; - BGL_FORALL_VERTICES_T(v, graph, Graph) { - const VertexSequence& sequence = graph[v].first; - Prefix prefix; - copy(sequence.begin(), sequence.begin()+K-1, prefix.begin()); - vertexMap[prefix].push_back(v); - } - - // Now we can generate the edges. - using Suffix = Prefix; - BGL_FORALL_VERTICES_T(v0, graph, Graph) { - const VertexSequence& sequence = graph[v0].first; - Suffix suffix; - copy(sequence.begin()+1, sequence.end(), suffix.begin()); - auto it = vertexMap.find(suffix); - if(it == vertexMap.end()) { - continue; - } - for(const vertex_descriptor v1: it->second) { - add_edge(v0, v1, graph); - } - } - cout << "The DeBruijn graph has " << num_vertices(graph) << " vertices and " << - num_edges(graph) << " edges." 
<< endl; - - // Write it out. - ofstream dot("DeBruijnGraph.dot"); - dot << "digraph DeBruijnGraph {\n"; - BGL_FORALL_VERTICES_T(v, graph, Graph) { - const uint64_t coverage = graph[v].second; - dot << "\"" << v << "\" [" - "tooltip=\"" << coverage << "\" " - "width=" << 0.001*double(coverage) << - "];\n"; - } - BGL_FORALL_EDGES_T(e, graph, Graph) { - const vertex_descriptor v0 = source(e, graph); - const vertex_descriptor v1 = target(e, graph); - dot << "\"" << v0 << "\"->\"" << v1 << "\";\n"; - } - dot << "}\n"; -} - - - -void AssemblyGraph::createDeBruijnGraph() const -{ - createDeBruijnGraphTemplated<3>(); -} - - - -void AssemblyGraph::assembleSegments() -{ - createNew(segmentSequences, "Mode3-SegmentSequences"); - createNew(segmentVertexOffsets, "Mode3-SegmentVertexOffsets"); - for(uint64_t segmentId=0; segmentId<markerGraphPaths.size(); segmentId++) { - assembleSegment(segmentId); - } -} -void AssemblyGraph::assembleSegment(uint64_t segmentId) -{ - // Assemble it. - AssembledSegment assembledSegment; - assembleMarkerGraphPath( - readRepresentation, - k, - markers, - markerGraph, - markerGraphPaths[segmentId], - false, - assembledSegment); - - // Store assembled sequence and vertex offsets. - segmentSequences.appendVector(assembledSegment.rawSequence); - segmentVertexOffsets.appendVector(assembledSegment.vertexOffsets); - -} - - - -// Assemble a link, given a set of allowed OrientedReadId(s). -// The returned sequence overrides: -// - The trim0 last bases of the preceding segment. -// - The trim1 first bases of the following segment. 
-void AssemblyGraph::assembleLink( - uint64_t linkId, - const vector<OrientedReadId>& allowedOrientedReadIds, - vector<Base>& sequence, // The entire MSA sequence - uint64_t& leftTrim, // The number of MSA sequence to be trimmed on the left for assembly - uint64_t& rightTrim, // The number of MSA sequence to be trimmed on the left for assembly - uint64_t& trim0, // The number of bases at the end of segment0 to be trimmed for assembly - uint64_t& trim1, // The number of bases at the beginning of segment1 to be trimmed for assembly - ostream& html -) const -{ - const bool debug = false; - const Link& link = links[linkId]; - - SHASTA_ASSERT(std::is_sorted(allowedOrientedReadIds.begin(), allowedOrientedReadIds.end())); - - // If the preceding and last segment are adjacent, - // assembling the link is trivial. - // We return an empty sequence which overrides - // the last k/2 bases of the preceding segment - // and the first k/2 bases of the following segment. - // This was the resulting segment sequences are exactly adjacent. - if(link.segmentsAreAdjacent) { - sequence.clear(); - leftTrim = 0; - rightTrim = 0; - trim0 = k / 2; - trim1 = k / 2; - return; - } - - // If getting here, the two segments of this link are not adjacent. - // Get some infomation we are going to need below. - const uint64_t segmentId0 = link.segmentId0; - const uint64_t segmentId1 = link.segmentId1; - const auto sequence0 = segmentSequences[segmentId0]; - const auto sequence1 = segmentSequences[segmentId1]; - SHASTA_ASSERT(not sequence0.empty()); - SHASTA_ASSERT(not sequence1.empty()); - const auto vertexOffsets0 = segmentVertexOffsets[segmentId0]; - const auto vertexOffsets1 = segmentVertexOffsets[segmentId1]; - - - // First, find: - // - The position in segmentId0 of the leftmost transition. - // - The position in segmentId1 of the rightmost transition. 
- uint64_t minEdgePosition0 = markerGraphPaths[segmentId0].size(); - uint64_t maxEdgePosition1 = 0; - for(const auto &p : transitions[linkId]) { - const OrientedReadId orientedReadId = p.first; - - // If not one of the allowed OrientedReadId(s), skip it. - if(not binary_search(allowedOrientedReadIds.begin(), allowedOrientedReadIds.end(), orientedReadId)) { - continue; - } - - // Access the transition from segmentId0 to segmentId1 for this oriented read. - const Transition &transition = p.second; - - minEdgePosition0 = min(minEdgePosition0, - uint64_t(transition[0].position)); - maxEdgePosition1 = max(maxEdgePosition1, - uint64_t(transition[1].position)); - } - - // When getting here: - // - minEdgePosition0 is the leftmost position of the transitions in path0. - // - maxEdgePosition1 is the rightmost position of the transitions in path1. - // These positions are edge positions in markerGraphPath0 and markerGraphPath1. - // We will do a multiple sequence alignment of the oriented reads, - // using the sequence of segmentId0 to extend to the left all reads to minEdgePosition0, - // and using the sequence of segmentId1 to extend to the right all reads to maxEdgePosition1, - - // Get the corresponding vertex positions in segmentId0 and segmentId1. - const uint64_t minVertexPosition0 = minEdgePosition0 + 1; - const uint64_t maxVertexPosition1 = maxEdgePosition1; - - // To compute an MSA anchored at both sides,we will extend the - // sequence of each read to the left/right using the sequence of - // adjacent segments. - - - // Now extract the portion of each oriented read sequence that - // will be used to assemble this link. - vector<OrientedReadId> orientedReadIdsForAssembly; - vector<vector<Base> > orientedReadsSequencesForAssembly; - for(const auto &p : transitions[linkId]) { - const OrientedReadId orientedReadId = p.first; - - // If not one of the allowed OrientedReadId(s), skip it. 
- if(not binary_search(allowedOrientedReadIds.begin(), allowedOrientedReadIds.end(), orientedReadId)) { - continue; - } - - // Access the transition from segmentId0 to segmentId1 for this oriented read. - const Transition &transition = p.second; - - // Get the ordinals of the last appearance of this oriented - // read on segmentId0 and the first on segmentId1, - // and the corresponding markers. - const uint32_t ordinal0 = transition[0].ordinals[1]; - const uint32_t ordinal1 = transition[1].ordinals[0]; - const CompressedMarker &marker0 = markers[orientedReadId.getValue()][ordinal0]; - const CompressedMarker &marker1 = markers[orientedReadId.getValue()][ordinal1]; - - // Get the positions of these markers on the oriented read. - // If using RLE, these are RLE positions. - const uint32_t position0 = marker0.position; - const uint32_t position1 = marker1.position; - - // Extract the sequence between these markers (including the markers). - vector<Base> orientedReadSequence; - for(uint64_t position = position0; - position < position1 + k; position++) { - const Base b = reads.getOrientedReadBase(orientedReadId, uint32_t(position)); - orientedReadSequence.push_back(b); - } - - // We need to extend the sequence of this read to the left, - // using segmentId0 sequence, up to minVertexPosition0, - // so the portions of all reads we will be using for the MSA - // all begin in the same place. - vector<Base> leftSequence; - vector<uint32_t> leftRepeatCounts; - const uint64_t vertexPosition0 = transition[0].position + 1; // Add 1 to get vertex position. 
- const uint64_t begin0 = - vertexOffsets0[minVertexPosition0]; - const uint64_t end0 = vertexOffsets0[vertexPosition0]; - for(uint64_t position = begin0; position != end0; position++) { - leftSequence.push_back(sequence0[position]); - } - - vector<Base> rightSequence; - const uint64_t vertexPosition1 = transition[1].position; - const uint64_t begin1 = vertexOffsets1[vertexPosition1] + k; - const uint64_t end1 = vertexOffsets1[maxVertexPosition1] + k; - for(uint64_t position = begin1; position != end1; position++) { - rightSequence.push_back(sequence1[position]); - } - - // Construct the extended sequence for this oriented read, - // to be used in the MSA. - vector<Base> orientedReadExtendedSequence; - const auto addToExtendedSequence = back_inserter(orientedReadExtendedSequence); - copy(leftSequence, addToExtendedSequence); - copy(orientedReadSequence, addToExtendedSequence); - copy(rightSequence, addToExtendedSequence); - - orientedReadIdsForAssembly.push_back(orientedReadId); - orientedReadsSequencesForAssembly.push_back(orientedReadExtendedSequence); - - if(debug) { - copy(orientedReadExtendedSequence, ostream_iterator<Base>(cout)); - cout << " " << orientedReadId << endl; - } - } - - - - // Compute the consensus sequence for the link. 
- if(html) { - html << "<h2>Link " << linkId << "</h2>\n"; - } - vector<Base> msaRleSequence; - computeLinkConsensusUsingSpoa( - orientedReadIdsForAssembly, - orientedReadsSequencesForAssembly, - consensusCaller, - debug, - html, - msaRleSequence); - - if(debug) { - cout << "Consensus RLE sequence length before trimming " << msaRleSequence.size() << endl; - cout << "Portion of segment on left involved in the MSA begins at position " << - vertexOffsets0[minVertexPosition0] << endl; - cout << "Portion of segment on right involved in the MSA ends at position " << - vertexOffsets1[maxVertexPosition1] + k << endl; - } - - // Count the number of identical (RLE) bases at the beginning of the - // link consensus sequence and of the segmentId0 sequence portion - // involved in assembling this link. - uint64_t identicalOnLeft = 0; - const uint64_t begin0 = vertexOffsets0[minVertexPosition0]; - const uint64_t end0 = sequence0.size(); - for(uint64_t i=begin0; (i!=end0 and (i-begin0)<msaRleSequence.size()); i++) { - if(msaRleSequence[i-begin0] == sequence0[i]) { - // cout << "*** " << begin0 << " " << end0 << " " << i << endl; - ++identicalOnLeft; - } else { - break; - } - } - if(debug) { - cout << "Identical on left: " << identicalOnLeft << endl; - } - - // Count the number of identical (RLE) bases at the end of the - // link consensus sequence and the beginning of segmentId1 . 
- uint64_t identicalOnRight = 0; - const uint64_t end1 = vertexOffsets1[maxVertexPosition1] + k; - for(uint64_t i=end1-1; ; i--) { - const uint64_t j = msaRleSequence.size() - (end1 - i); - if(msaRleSequence[j] == sequence1[i]) { - // cout << "*** " << i << " " << assembledSegment1.runLengthSequence[i] << " " << - // j << " " << consensusRleSequence[j] << endl; - ++identicalOnRight; - } else { - break; - } - if(i == 0) { - break; - } - if(j == 0) { - break; - } - } - identicalOnRight = min(identicalOnRight, msaRleSequence.size()-identicalOnLeft); - if(debug) { - cout << "Identical on right: " << identicalOnRight << endl; - } - - // Trim these identical bases from the link consensus sequence. - leftTrim = identicalOnLeft; - rightTrim = identicalOnRight; - - // Compute and store the number of bases to be trimmed at the end of segmentId0 - // and at the beginning of segmentId1. - trim0 = - sequence0.size() - - vertexOffsets0[minVertexPosition0] - - identicalOnLeft; - trim1 = - vertexOffsets1[maxVertexPosition1] + k - - identicalOnRight; -} - - - -void AssemblyGraph::computeLinkConsensusUsingSpoa( - const vector<OrientedReadId> orientedReadIds, - const vector< vector<Base> > rleSequences, - const ConsensusCaller&, - bool debug, - ostream& html, - vector<Base>& consensusRleSequence - ) -{ - SHASTA_ASSERT(0); -} diff --git a/src/mode3.hpp b/src/mode3.hpp deleted file mode 100644 index 39285d6..0000000 --- a/src/mode3.hpp +++ /dev/null @@ -1,710 +0,0 @@ -#ifndef SHASTA_MODE3_HPP -#define SHASTA_MODE3_HPP - -/******************************************************************************* - -Class mode3::AssemblyGraph is the class used for Mode 3 assembly. -Using GFA terminology, the graph consists of Segments and Links. - -A Segment corresponds to a linear sequence of edges, without branches, -in the marker graph. - -If an oriented read enters segment 1 immediately after exiting segment 0, -we say that there is a transition 0->1. 
If there is a sufficient -number of transitions 0->1, we create a link 0->1. - -*******************************************************************************/ - -// Shasta. -#include "invalid.hpp" -#include "MemoryMappedVectorOfVectors.hpp" -#include "MultithreadedObject.hpp" -#include "ReadId.hpp" -#include "shastaTypes.hpp" - -// Boost libraries. -#include <boost/graph/adjacency_list.hpp> - -// Standard library. -#include "array.hpp" -#include "memory.hpp" -#include "tuple.hpp" -#include "unordered_map" -#include "vector.hpp" - - - -namespace shasta { - namespace mode3 { - class AssemblyGraph; - class AssemblyGraphJourneyEntry; - class MarkerGraphJourneyEntry; - class AssemblyGraphJourneyInterval; - class AssemblyPath; - class JaccardGraph; - class JaccardGraphEdgeInfo; - class SegmentPairInformation; - class Transition; - - } - - // Some forward declarations of classes in the shasta namespace. - class AssembledSegment; - class Base; - class ConsensusCaller; - class Reads; - class CompressedMarker; - class MarkerGraph; - - extern template class MultithreadedObject<mode3::AssemblyGraph>; -} - - - -// The marker graph journey of an oriented read is the sequence -// of marker graph edges it encounters. -// (An oriented read encounters a marker graph edge -// if the oriented read appears in the marker intervals for the edge). -// The marker graph journey of an oriented read is not necessarily -// a path in the marker graph because the oriented read -// can "skip" marker graph edges due to errors. -// In other places in Shasta, journeys are called "pseudopaths". -// We describe the marker graph journey of each oriented read as a sequence -// of MarkerGraphJourneyEntry objects. -// The MarkerGraphJourneyEntry identifies a marker graph edge -// by the segmentId in the assembly graph and the position in that segment -// (that is, the first marker graph in the segment is at -// position 0, and so on). 
-class shasta::mode3::MarkerGraphJourneyEntry { -public: - uint64_t segmentId; - uint32_t position; - array<uint32_t, 2> ordinals; - - bool operator<(const MarkerGraphJourneyEntry& that) const - { - return ordinals[0] < that.ordinals[0]; - } - bool operator==(const MarkerGraphJourneyEntry& that) const - { - return - tie(segmentId, position, ordinals) == - tie(that.segmentId, that.position, that.ordinals); - } -}; - - - -// The assembly graph journey of an oriented read is the sequence -// of assembly graph segments (vertices) it encounters. -// The journey on an oriented read is not necessarily -// a path in the assembly graph because the oriented read -// can "skip" segments due to errors. -// We store the assembly graph journey of each oriented read as a sequence -// of AssemblyGraphJourneyEntry objects. -// The AssemblyGraphJourneyEntry stores the segment id and -// the first and last MarkerGraphJourneyEntry objects -// on the segment for the given oriented read. -// Indexed by OrientedReadId::getValue(). -// Note a segmentId can appear more than once in the assembly -// graph journey of an oriented read. This can happen -// if the oriented read "goes around" in a tangle caused by repeats. -class shasta::mode3::AssemblyGraphJourneyEntry { -public: - uint64_t segmentId; - - // The first and last MarkerGraphJourneyEntry that contributed to this - // AssemblyGraphJourneyEntry. - array<MarkerGraphJourneyEntry, 2> markerGraphJourneyEntries; -}; - - - -// A portion of the assembly graph journey of an oriented read. -class shasta::mode3::AssemblyGraphJourneyInterval { -public: - - OrientedReadId orientedReadId; - - // The first and last position in the assembly graph journey - // for this oriented read. 
- uint64_t first; - uint64_t last; - - - bool operator<(const AssemblyGraphJourneyInterval& that) const - { - return tie(orientedReadId, first) < tie(that.orientedReadId, that.first); - } - -}; - - - -// A transition is a sequence of two consecutive positions -// in the assembly graph journey of an oriented reads. -// In other words, it describes the transition of an oriented read -// from a segment to the next segment it encounters. -// Transitions are used to create edges in the AssemblyGraph (gfa links). -// Indexed by the linkId. For each link, they are sorted. -class shasta::mode3::Transition : public array<MarkerGraphJourneyEntry, 2> { -public: - Transition(const array<MarkerGraphJourneyEntry, 2>& x) : array<MarkerGraphJourneyEntry, 2>(x) {} - Transition() {} -}; - - - -// The AssemblyGraph is used to store the Mode 3 assembly graph, -// when it no longer needs to be changed, -// in memory mapped data structures. -class shasta::mode3::AssemblyGraph : - public MultithreadedObject<AssemblyGraph> { -public: - - // Initial construction. - AssemblyGraph( - const string& largeDataFileNamePrefix, - size_t largeDataPageSize, - size_t threadCount, - uint64_t readRepresentation, - uint64_t k, // Marker length - const Reads& reads, - const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, - const MarkerGraph&, - const ConsensusCaller& consensusCaller); - - // Constructor from binary data. - AssemblyGraph( - const string& largeDataFileNamePrefix, - uint64_t readRepresentation, - uint64_t k, // Marker length - const Reads& reads, - const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, - const MarkerGraph&, - const ConsensusCaller& consensusCaller); - - // Data and functions to handle memory mapped data. 
- const string& largeDataFileNamePrefix; - size_t largeDataPageSize; - string largeDataName(const string&) const; - template<class T> void createNew(T& t, const string& name) - { - t.createNew(largeDataName(name), largeDataPageSize); - } - template<class T> void accessExistingReadOnly(T& t, const string& name) - { - t.accessExistingReadOnly(largeDataName(name)); - } - - // References or copies for Assembler objects. - uint64_t readRepresentation; - uint64_t k; - const Reads& reads; - const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers; - const MarkerGraph& markerGraph; - const ConsensusCaller& consensusCaller; - - uint64_t readCount() const - { - return markers.size() / 2; - } - - // Each linear chain of marker graph edges generates a segment. - // The marker graph path corresponding to each segment is stored - // indexed by segment id. - MemoryMapped::VectorOfVectors<MarkerGraphEdgeId, uint64_t> markerGraphPaths; - void createSegmentPaths(); - - // Average marker graph edge coverage for each segment. - MemoryMapped::Vector<float> segmentCoverage; - void computeSegmentCoverage(); - - // Assembled sequence for each segment. - // For each segment we store the entire sequance, including - // the complete sequences of the first and last vertex. - // When writing to gfa, we skip the first and last k/2 bases. - MemoryMapped::VectorOfVectors<Base, uint64_t> segmentSequences; - MemoryMapped::VectorOfVectors<uint32_t, uint64_t> segmentVertexOffsets; // Filed in by assembleSegment. - void assembleSegments(); - void assembleSegment(uint64_t segmentId); - - // Keep track of the segment and position each marker graph edge corresponds to. - // For each marker graph edge, store in the marker graph edge table - // the corresponding segment id and position in the path, if any. - // Indexed by the edge id in the marker graph. - // This is needed when computing assembly graph journeys. 
- MemoryMapped::Vector< pair<uint64_t, uint32_t> > markerGraphEdgeTable; - void computeMarkerGraphEdgeTable(size_t threadCount); - void computeMarkerGraphEdgeTableThreadFunction(size_t threadId); - - - - // The marker graph journeys of all oriented reads. - // Indexed by OrientedReadId::getValue(). - // This is only stored temporarily and used to compute assembly graph journeys. - MemoryMapped::VectorOfVectors<MarkerGraphJourneyEntry, uint64_t> markerGraphJourneys; - void computeMarkerGraphJourneys(size_t threadCount); - void computeMarkerGraphJourneysPass1(size_t threadId); - void computeMarkerGraphJourneysPass2(size_t threadId); - void computeMarkerGraphJourneysPass12(uint64_t pass); - void sortMarkerGraphJourneys(size_t threadId); - - - - // The assembly graph journeys of all oriented reads. - // Indexed by OrientedReadId::getValue(). - MemoryMapped::VectorOfVectors<AssemblyGraphJourneyEntry, uint64_t> assemblyGraphJourneys; - void computeAssemblyGraphJourneys(); - void computeAssemblyGraphJourney( - const span<MarkerGraphJourneyEntry> markerGraphJourney, - vector<AssemblyGraphJourneyEntry>& assemblyGraphJourney); - - // Store appearances of segments in assembly graph journeys. - // For each segment, store pairs (orientedReadId, position in assembly graph journey). - // Indexed by the segmentId. - // For each segment, they are sorted. - MemoryMapped::VectorOfVectors<pair<OrientedReadId, uint64_t>, uint64_t> - assemblyGraphJourneyInfos; - void computeAssemblyGraphJourneyInfos(); - - //Coverage is th enumber of oriented reads that appear in this segment. - // This is not the same as average coverage on marker graph vertices or edges. - uint64_t coverage(uint64_t segmentId) const - { - return assemblyGraphJourneyInfos.size(segmentId); - } - - // Find out if a segment contains a given OrientedReadId. - // This returns true if assemblyGraphJourneyInfos[segmentId] - // contains an entry with the given OrientedReadId. 
- bool segmentContainsOrientedRead( - uint64_t segmentId, - OrientedReadId) const; - - - - using SegmentPair = pair<uint64_t, uint64_t>; - using Transitions = vector< pair<OrientedReadId, Transition> >; - std::map<SegmentPair, Transitions> transitionMap; - void findTransitions(std::map<SegmentPair, Transitions>& transitionMap); - - - - // The links. - class Link { - public: - uint64_t segmentId0; - uint64_t segmentId1; - - // Flag to indicate whether the two segments are adjacent. - // This is set if the last marker graph vertex of segmentId0 - // is the same as the first marker graph vertex of segmentId1. - // In that case the separation will be set to 0. - // However, the separation is just an estimate, so it - // could be 0 even when the segments are ot adjacent. - bool segmentsAreAdjacent; - - // Estimated separation in markers. - int32_t separation; - - - Link( - uint64_t segmentId0 = 0, - uint64_t segmentId1 = 0) : - segmentId0(segmentId0), - segmentId1(segmentId1) {} - }; - MemoryMapped::Vector<Link> links; - void createLinks( - const std::map<SegmentPair, Transitions>& transitionMap, - uint64_t minCoverage); - - // The transitions for each link. - // Indexed by linkId. - MemoryMapped::VectorOfVectors< pair<OrientedReadId, Transition>, uint64_t> transitions; - uint64_t linkCoverage(uint64_t linkId) const - { - return transitions.size(linkId); - } - - // Assemble a link, given a set of allowed OrientedReadId(s). - // The returned sequence overrides: - // - The trim0 last bases of the preceding segment. - // - The trim1 first bases of the following segment. 
- void assembleLink( - uint64_t linkId, - const vector<OrientedReadId>& allowedOrientedReadIds, - vector<Base>& sequence, // The entire MSA sequence - uint64_t& leftTrim, // The number of MSA sequence to be trimmed on the left for assembly - uint64_t& rightTrim, // The number of MSA sequence to be trimmed on the left for assembly - uint64_t& trim0, // The number of bases at the end of segment0 to be trimmed for assembly - uint64_t& trim1, // The number of bases at the beginning of segment1 to be trimmed for assembly - ostream& html - ) const; - - // Use spoa to compute consensus sequence for a link. - static void computeLinkConsensusUsingSpoa( - const vector<OrientedReadId> orientedReadIds, - const vector< vector<Base> > rleSequences, - const ConsensusCaller&, - bool debug, - ostream& html, - vector<Base>& consensusRleSequence - ); - - // The links for each source or target segments. - // Indexed by segment id. - MemoryMapped::VectorOfVectors<uint64_t, uint64_t> linksBySource; - MemoryMapped::VectorOfVectors<uint64_t, uint64_t> linksByTarget; - void createConnectivity(); - uint64_t findLink(uint64_t segmentId0, uint64_t segmentId1) const; - - - // Flag back-segments. - // This does not do a full blown search for locally strongly connected components. - // A segment is marked as a back-segment if: - // - It has only a single incoming link. - // - It has a single outgoing link. - // - The incoming and outgoing links both connect to/from the same segment. - void flagBackSegments(); - MemoryMapped::Vector<bool> isBackSegment; - - - - // Get the children or parents of a given segment. - // Only use links with at least a specified coverage. - void getChildren( - uint64_t segmentId, - uint64_t minimumLinkCoverage, - vector<uint64_t>& - ) const; - void getParents( - uint64_t segmentId, - uint64_t minimumLinkCoverage, - vector<uint64_t>& - ) const; - void getChildrenOrParents( - uint64_t segmentId, - uint64_t direction, // 0=forward (children), 1=backward (parents). 
- uint64_t minimumLinkCoverage, - vector<uint64_t>& - ) const; - - - // Find descendants of a given segment, up to a given distance in the graph. - void findDescendants( - uint64_t segmentId, - uint64_t maxDistance, - vector<uint64_t>& segmentIds - ) const; - - // BFS with given begin/end. - // Does a BFS which starts at segmentIdA. - // and ends when segmentIdB is encountered. - // The BFS if forward if direction is 0 - // and backward if direction is 1. - // Computes a vector of all the segments encountered, - // excluding segmentIdA and segmentIdB, - // in the order in which they are encountered in the BFS. - void targetedBfs( - uint64_t segmentIdA, - uint64_t segmentIdB, - uint64_t direction, - vector<uint64_t>& segments - ) const; - - void writeGfa(const string& baseName) const; - - // Find the distinct oriented reads that appear on the path - // of a segment. Also return the average edge coverage for the path. - double findOrientedReadsOnSegment( - uint64_t segmentId, - vector<OrientedReadId>&) const; - - - - // Get information about the oriented reads that appear on the - // marker graph path of a segment. - class SegmentOrientedReadInformation { - public: - - // The oriented reads on this segment, - // each storage with an average offset relative to the segment. - class Info { - public: - OrientedReadId orientedReadId; - - // The average offset, in markers, between the - // beginning of this oriented read and the - // beginning of the segment. - int32_t averageOffset; - }; - vector<Info> infos; - }; - void getOrientedReadsOnSegment( - uint64_t segmentId, - SegmentOrientedReadInformation&) const; - - // Oriented read information for each segment. - // This is only stored when needed. - vector<SegmentOrientedReadInformation> segmentOrientedReadInformation; - void storeSegmentOrientedReadInformation(size_t threadCount); - void storeSegmentOrientedReadInformationThreadFunction(size_t threadId); - - - - // Estimate the offset between two segments. 
- // Takes as input SegmentOrientedReadInformation objects - // for the two segments. - // Common oriented reads between the two segments are used - // to estimate the average offset, in markers, - // between the beginning of the segments. - // The number of common oriented reads - // is computed and stored in the last argument. - // If that is zero, the computed offset is not valid. - void estimateOffset( - const SegmentOrientedReadInformation& info0, - const SegmentOrientedReadInformation& info1, - int64_t& offset, - uint64_t& commonOrientedReadCount - ) const; - - - - // Analyze a pair of segments for common oriented reads, - // offsets, missing reads, etc. - void analyzeSegmentPair( - uint64_t segmentId0, - uint64_t segmentId1, - const SegmentOrientedReadInformation& info0, - const SegmentOrientedReadInformation& info1, - const MemoryMapped::VectorOfVectors<CompressedMarker, uint64_t>& markers, - SegmentPairInformation& - ) const; - - // Count the number of common oriented reads between a segment and a link, - // without counting oriented reads that appear more than once on the - // segment or on the link. - void analyzeSegmentLinkPair( - uint64_t segmentId, - uint64_t linkId, - uint64_t& commonOrientedReadCount - ) const; - - - -#if 0 - // Find segment pairs a sufficient number of common reads - // and with low unexplained fraction (in both directions) - // between segmentId0 and one of its descendants within the specified distance. - // This requires the vector segmentOrientedReadInformation above to be - // available. - void findSegmentPairs( - uint64_t segmentId0, - uint64_t maxDistance, - uint64_t minCommonReadCount, - double maxUnexplainedFraction, - vector<uint64_t>& segmentIds1 - ) const; - - - // Cluster the segments based on read composition. - // We find segment pairs a sufficient number of common reads - // and with low unexplained fraction (in both directions). 
- void clusterSegments(size_t threadCount, uint64_t minClusterSize); - class ClusterSegmentsData { - public: - - // The segment pairs found by each thread. - // In each pair, the lower number segment comes first. - vector< vector< pair<uint64_t, uint64_t> > > threadPairs; - }; - ClusterSegmentsData clusterSegmentsData; - void clusterSegmentsThreadFunction1(size_t threadId); - void addClusterPairs(size_t threadId, uint64_t segmentId0); -#endif - - // The cluster that each segment belongs to. - // Each connected component of the Jaccard graph corresponds to a cluster. - MemoryMapped::Vector<uint64_t> clusterIds; - - - - // Analyze a subgraph of the assembly graph. - - // Classes used in analyzeSubgraph. - class AnalyzeSubgraphClasses { - public: - - // A JourneySnippet describes a sequence of consecutive positions - // of the assembly graph journey of an oriented read. - // An OrientedReadId can have than more one JourneySnippet in a given subgraph, - // but this is not common. It can happen if the assembly graph contains a cycle. - class JourneySnippet { - public: - - // The OrientedReadId this refers to. - OrientedReadId orientedReadId; - - // The sequence of segments encountered. - vector<uint64_t> segmentIds; - - // The first and last position of this snippet - // in the assembly graph journey of this OrientedReadId. - uint64_t firstPosition; - uint64_t lastPosition() const - { - return firstPosition + segmentIds.size() - 1; - } - }; - - // A Cluster is a set of JourneySnippet's. - class Cluster { - public: - - // The snippets in this cluster. - vector<JourneySnippet> snippets; - uint64_t coverage() const - { - return snippets.size(); - } - - // The segments visited by the snippets of this cluster, - // each stored with its coverage (number of snippets); - vector< pair<uint64_t, uint64_t > > segments; - vector<uint64_t> getSegments() const; - - // Remove segments with coverage less than the specified value. 
- void cleanupSegments(uint64_t minClusterCoverage); - - // Construct the segments given the snippets. - void constructSegments(); - }; - - - - // The SnippetGraph is used by analyzeSubgraph2. - // A vertex represents a set of snippets and stores - // the corresponding snippet indexes. - // An edge x->y is created if there is at least one snippet in y - // that is an approximate subset of a snippet in x. - // Strongly connected components are condensed, so after that - //the graph is guaranteed to have no cycles. - class SnippetGraphVertex { - public: - vector<uint64_t> snippetIndexes; - uint64_t clusterId = invalid<uint64_t>; - SnippetGraphVertex() {} - SnippetGraphVertex(uint64_t snippetIndex) : - snippetIndexes(1, snippetIndex) {} - }; - using SnippetGraphBaseClass = - boost::adjacency_list<boost::setS, boost::listS, boost::bidirectionalS, SnippetGraphVertex>; - class SnippetGraph : public SnippetGraphBaseClass { - public: - uint64_t clusterCount = 0; - void findDescendants(const vertex_descriptor, vector<vertex_descriptor>&) const; - void writeGraphviz(const string& fileName) const; - }; - }; - - - - void analyzeSubgraph( - const vector<uint64_t>& segmentIds, - vector<AnalyzeSubgraphClasses::Cluster>&, - bool debug) const; - template<uint64_t N> void analyzeSubgraphTemplate( - const vector<uint64_t>& segmentIds, - vector<AnalyzeSubgraphClasses::Cluster>&, - bool debug) const; - - // Given a segment, use a BFS to move in the specified direction until - // we find a segment with sufficiently high Jaccard similarity - // and number of common reads. - // This returns invalid<uint64_t> if no such segment is found - // within the specified distance. 
- uint64_t findSimilarSegmentBfs( - uint64_t segmentId, - uint64_t direction, // 0 = forward, 1 = backward - uint64_t maxDistance, - uint64_t minCommon, - double minJaccard) const; - - // Given a segment, move in the specified direction, - // in order of increasing distance in markers, until - // we find a segment with sufficiently high Jaccard similarity - // and number of common reads. - // This returns invalid<uint64_t> if no such segment is found - // within the specified distance. - uint64_t findSimilarSegment( - uint64_t segmentId, - uint64_t direction, // 0 = forward, 1 = backward - uint64_t maxDistance, // In markers - uint64_t minLinkCoverage, - int32_t minLinkSeparation, - uint64_t minCommon, - double maxUnexplainedFraction, - double minJaccard, - vector<uint64_t>& segments) const; - - // Create an assembly path starting at a given segment. - void createAssemblyPath( - uint64_t segmentId, - uint64_t direction, // 0 = forward, 1 = backward - AssemblyPath& - ) const; - - // Compute link separation given a set of Transitions. - template<class Container> static double linkSeparation( - const Container& transitions, - uint64_t pathLength0) - { - double averageLinkSeparation = 0.; - - for(const pair<OrientedReadId, Transition>& p: transitions) { - const Transition& transition = p.second; - const MarkerGraphJourneyEntry& entry0 = transition[0]; - const MarkerGraphJourneyEntry& entry1 = transition[1]; - - SHASTA_ASSERT(entry1.ordinals[0] >= entry0.ordinals[1]); - - const int64_t linkSeparation = - int64_t(entry1.ordinals[0] - entry0.ordinals[1]) - - int64_t(pathLength0 - 1 - entry0.position) - - int64_t(entry1.position); - averageLinkSeparation += double(linkSeparation); - } - averageLinkSeparation /= double(transitions.size()); - - return averageLinkSeparation; - } - - // Jaccard graph. 
- shared_ptr<JaccardGraph> jaccardGraphPointer; - void createJaccardGraph(size_t threadCount); - void createJaccardGraphThreadFunction(size_t threadId); - void createJaccardGraphEdges( - uint64_t segmentId, - vector<JaccardGraphEdgeInfo>& edges); - void createJaccardGraphEdges( - uint64_t segmentId, - uint64_t direction, - vector<JaccardGraphEdgeInfo>& edges); - - // Assemble the assembly paths stored in the JaccardGraph. - void assembleJaccardGraphPaths(); - void assembleJaccardGraphPath(const vector<uint64_t>& primarySegments, AssemblyPath&); - - // De Bruijn graph of the assembly graph journeys of all oriented reads. - // Each assembly graph journey is interpreted as a sequence of segment ids. - void createDeBruijnGraph() const; - template<uint64_t K> void createDeBruijnGraphTemplated() const; -}; - - - - -#endif - diff --git a/src/removeReciprocalEdges.hpp b/src/removeReciprocalEdges.hpp new file mode 100644 index 0000000..4dbd1ac --- /dev/null +++ b/src/removeReciprocalEdges.hpp @@ -0,0 +1,34 @@ +#ifndef SHASTA_REMOVE_RECIPROCAL_ERDGES_HPP +#define SHASTA_REMOVE_RECIPROCAL_ERDGES_HPP + +#include <boost/graph/iteration_macros.hpp> +#include "vector.hpp" + +namespace shasta { + template<class Graph> void removeReciprocalEdges(Graph&); +} + + + +template<class Graph> void shasta::removeReciprocalEdges(Graph& graph) +{ + vector<typename Graph::edge_descriptor> edgesTobeRemoved; + + BGL_FORALL_EDGES_T(e, graph, Graph) { + const typename Graph::vertex_descriptor v0 = source(e, graph); + const typename Graph::vertex_descriptor v1 = target(e, graph); + + bool reverseEdgeExists = false; + tie(ignore, reverseEdgeExists) = boost::edge(v1, v0, graph); + if(reverseEdgeExists) { + edgesTobeRemoved.push_back(e); + } + } + + for(const typename Graph::edge_descriptor e: edgesTobeRemoved) { + boost::remove_edge(e, graph); + } + +} +#endif + diff --git a/src/seqan.hpp b/src/seqan.hpp index ffbb7d0..db56272 100644 --- a/src/seqan.hpp +++ b/src/seqan.hpp @@ -41,6 +41,20 @@ 
namespace shasta { bool freeOnRight, vector< pair<bool, bool> >& alignment); + // Same, banded. + template<class Iterator> + int64_t seqanAlign( + Iterator begin0, Iterator end0, + Iterator begin1, Iterator end1, + int64_t matchScore, + int64_t mismatchScore, + int64_t gapScore, + int64_t bandMin, + int64_t bandMax, + bool freeOnLeft, + bool freeOnRight, + vector< pair<bool, bool> >& alignment); + // Find out if the alignment computed by seqanAlign contains mismatches. template<class Iterator> bool containsMismatches( @@ -164,6 +178,117 @@ template<class Iterator> +// Same, banded. +template<class Iterator> + int64_t shasta::seqanAlign( + Iterator begin0, Iterator end0, + Iterator begin1, Iterator end1, + int64_t matchScore, + int64_t mismatchScore, + int64_t gapScore, + int64_t bandMin, + int64_t bandMax, + bool freeOnLeft, + bool freeOnRight, + vector< pair<bool, bool> >& alignment) +{ + // SeqAn does not handle empty sequences. + SHASTA_ASSERT(begin0 != end0); + SHASTA_ASSERT(begin1 != end1); + + // SeqAn types used below. + using namespace seqan; + using Int = typename Iterator::value_type; + using Sequence = String<Int>; + using StringSet = seqan::StringSet<Sequence>; + using DepStringSet = seqan::StringSet<Sequence, Dependent<> >; + using AlignGraph = Graph<seqan::Alignment<DepStringSet> >; + + // Fill in the sequences, adding 100 to all values + // because SeqAn uses 45 to represent gaps. + Sequence sequence0; + for(Iterator it=begin0; it!=end0; ++it) { + appendValue(sequence0, *it + 100); + } + Sequence sequence1; + for(Iterator it=begin1; it!=end1; ++it) { + appendValue(sequence1, *it + 100); + } + // Store them in a SeqAn string set. + StringSet sequences; + appendValue(sequences, sequence0); + appendValue(sequences, sequence1); + + + + // Compute the alignment. + // See https://docs.seqan.de/seqan/2.1.0/class_AlignConfig.html + // for meaning of AlignConfig. 
+ AlignGraph graph(sequences); + int64_t alignmentScore = 0; + if(freeOnLeft) { + if(freeOnRight) { + // Free on both sides. + alignmentScore = globalAlignment( + graph, + Score<int64_t, seqan::Simple>(matchScore, mismatchScore, gapScore), + AlignConfig<true, true, true, true>(), + int32_t(bandMin), int32_t(bandMax), + LinearGaps()); + } else { + // Free on left only. + alignmentScore = globalAlignment( + graph, + Score<int64_t, seqan::Simple>(matchScore, mismatchScore, gapScore), + AlignConfig<true, true, false, false>(), + int32_t(bandMin), int32_t(bandMax), + LinearGaps()); + } + }else { + if(freeOnRight) { + // Free on right only. + alignmentScore = globalAlignment( + graph, + Score<int64_t, seqan::Simple>(matchScore, mismatchScore, gapScore), + AlignConfig<false, false, true, true>(), + int32_t(bandMin), int32_t(bandMax), + LinearGaps()); + } else { + // Free on neither side. + alignmentScore = globalAlignment( + graph, + Score<int64_t, seqan::Simple>(matchScore, mismatchScore, gapScore), + AlignConfig<false, false, false, false>(), + int32_t(bandMin), int32_t(bandMax), + LinearGaps()); + } + } + + + + // Extract the alignment from the graph. + // This creates a single sequence consisting of the two rows + // of the alignment, concatenated. + Sequence align; + convertAlignment(graph, align); + const uint64_t totalAlignmentLength = seqan::length(align); + SHASTA_ASSERT((totalAlignmentLength % 2) == 0); + const uint64_t alignmentLength = totalAlignmentLength / 2; + + // Fill in the bool pairs representing the alignment. + alignment.resize(alignmentLength); + for(uint64_t i=0; i<alignmentLength; i++) { + auto& p = alignment[i]; + p.first = not (align[i] == 45); + p.second = not (align[i+alignmentLength] == 45); + } + + + return alignmentScore; +} + + + // Find out if the alignment computed by seqanAlign contains mismatches. 
template<class Iterator> bool shasta::containsMismatches( diff --git a/src/shastaTypes.hpp b/src/shastaTypes.hpp index 53f1272..b837c18 100644 --- a/src/shastaTypes.hpp +++ b/src/shastaTypes.hpp @@ -5,7 +5,9 @@ namespace shasta { - using KmerId = uint32_t; + using KmerId16 = uint32_t; + using KmerId32 = uint64_t; + using KmerId = KmerId32; using ReadId = uint32_t; using Strand = ReadId; diff --git a/src/transitiveReduction.hpp b/src/transitiveReduction.hpp index 0720920..5f5764d 100644 --- a/src/transitiveReduction.hpp +++ b/src/transitiveReduction.hpp @@ -11,25 +11,32 @@ // Standard library. #include "iterator.hpp" -using std::back_inserter; +#include <map> #include <queue> +#include <set> #include "vector.hpp" namespace shasta { + + // Version that requires the graph to use vecS. template<class Graph> void transitiveReduction(Graph&); + + // Less performant version without the above requirement. + template<class Graph> void transitiveReductionAny(Graph&); } // Transitive reduction of a directed graph without cycles. // Class Graph must be a boost::adjacency_list with -// the first three template arguments set to <listS, vecS, directedS>. +// the first three template arguments set to <listS, vecS, directedS or bidirectionalS>. // If the graph has cycles, this throws boost::not_a_dag. template<class Graph> void shasta::transitiveReduction(Graph &graph) { using namespace boost; using vertex_descriptor = typename Graph::vertex_descriptor; using edge_descriptor = typename Graph::edge_descriptor; + using edge_iterator = typename Graph::edge_iterator; // Check the Graph type. // Use C++20 concepts instead. 
@@ -42,16 +49,18 @@ template<class Graph> void shasta::transitiveReduction(Graph &graph) "shasta::transitiveReduction requires an adjacency_list " "with the second template argument set to boost::vecS."); static_assert( + std::is_same<typename Graph::directed_selector, directedS>::value + or std::is_same<typename Graph::directed_selector, bidirectionalS>::value, "shasta::transitiveReduction requires an adjacency_list " - "with the third template argument set to boost::bidirectionalS."); + "with the third template argument set to boost::directedS or boost::bidirectionalS."); // Use boost topological_sort to get a vector of vertex descriptors // in reverse toplogical order. vector<vertex_descriptor> sortedVertices; topological_sort(graph, back_inserter(sortedVertices)); - // Now construct a vector containg the rank of each vertex in topological order. + // Now construct a vector containing the rank of each vertex in topological order. vector<uint64_t> vertexRank(num_vertices(graph)); uint64_t rank = num_vertices(graph); for (const vertex_descriptor v : sortedVertices) { @@ -60,9 +69,14 @@ template<class Graph> void shasta::transitiveReduction(Graph &graph) // Find the edges that should be removed. vector<edge_descriptor> edgesToBeRemoved; - vector<bool> wasVisited(num_vertices(graph)); - BGL_FORALL_EDGES_T(e, graph, Graph) - { + vector<bool> wasVisited(num_vertices(graph), false); + vector<vertex_descriptor> visitedVertices; + edge_iterator it, itEnd; + tie(it, itEnd) = edges(graph); + while(it != itEnd) { + edge_iterator itNext = it; + ++itNext; + const edge_descriptor e = *it; const vertex_descriptor v0 = source(e, graph); const vertex_descriptor v1 = target(e, graph); @@ -80,11 +94,11 @@ template<class Graph> void shasta::transitiveReduction(Graph &graph) // Initialize the BFS. std::queue<vertex_descriptor> q; q.push(v0); - std::fill(wasVisited.begin(), wasVisited.end(), false); wasVisited[v0] = true; + visitedVertices.push_back(v0); // BFS loop. 
- while (not q.empty()) { + while(not q.empty()) { // Dequeue a vertex. const vertex_descriptor vv0 = q.front(); @@ -114,24 +128,136 @@ template<class Graph> void shasta::transitiveReduction(Graph &graph) if (vv1 == v1) { // We reached v1. Edge e can be removed and we can stop the BFS. - edgesToBeRemoved.push_back(e); + boost::remove_edge(e, graph); q = { }; + break; } else { // Continue the BFS. wasVisited[vv1] = true; + visitedVertices.push_back(vv1); q.push(vv1); } } } - } - // Remove the edges. - deduplicate(edgesToBeRemoved); - for (const edge_descriptor e : edgesToBeRemoved) { - remove_edge(e, graph); + // Prepare for the next iteration. + it = itNext; + + // Clean up. + for(const vertex_descriptor v: visitedVertices) { + wasVisited[v] = false; + } + visitedVertices.clear(); } + } +// Less performant version which works on any acyclic boost directed graph. +template<class Graph> void shasta::transitiveReductionAny(Graph &graph) + { + using namespace boost; + using vertex_descriptor = typename Graph::vertex_descriptor; + using edge_descriptor = typename Graph::edge_descriptor; + using edge_iterator = typename Graph::edge_iterator; + + // Map vertices to integers. + std::map<vertex_descriptor, uint64_t> vertexIndexMap; + uint64_t vertexIndex = 0; + BGL_FORALL_VERTICES_T(v, graph, Graph) { + vertexIndexMap.insert({v, vertexIndex++}); + } + + // Use boost topological_sort to get a vector of vertex descriptors + // in reverse topological order. + vector<vertex_descriptor> sortedVertices; + topological_sort( + graph, + back_inserter(sortedVertices), + boost::vertex_index_map(boost::make_assoc_property_map(vertexIndexMap))); + + // Store the rank of each vertex in topological order. + std::map<vertex_descriptor, uint64_t> vertexRank; + uint64_t rank = num_vertices(graph); + for (const vertex_descriptor v : sortedVertices) { + vertexRank.insert({v, --rank}); + } + + // Find the edges that should be removed. 
+ edge_iterator it, itEnd; + tie(it, itEnd) = edges(graph); + while(it != itEnd) { + edge_iterator itNext = it; + ++itNext; + const edge_descriptor e = *it; + const vertex_descriptor v0 = source(e, graph); + const vertex_descriptor v1 = target(e, graph); + + // Edge e should be removed if there is a path + // from v0 to v1 that does not use edge e. + + // Do a forward BFS starting at v0 and ending at v1 but: + // - Don't use edge e in the BFS. + // - Don't use any vertices that have topological order + // greater than the topological order of v1, + // because there can be no paths ending at v1 + // that use these vertices. + // If the BFS encounters v1, edge e can be removed. + + // Initialize the BFS. + std::queue<vertex_descriptor> q; + q.push(v0); + std::set<vertex_descriptor> visitedVertices; + visitedVertices.insert(v0); + + // BFS loop. + while(not q.empty()) { + + // Dequeue a vertex. + const vertex_descriptor vv0 = q.front(); + q.pop(); + + // Loop over its out-edges. + BGL_FORALL_OUTEDGES_T(vv0, ee, graph, Graph) + { + + // Don't use edge e in the BFS. + if (ee == e) { + continue; + } + + // Get the other vertex in edge ee. + const vertex_descriptor vv1 = target(ee, graph); + + // If vv1 was already visited in this BFS, skip it. + if(visitedVertices.contains(vv1)) { + continue; + } + + // If vv1 follows v1 in topological order, skip it. + if (vertexRank[vv1] > vertexRank[v1]) { + continue; + } + + if (vv1 == v1) { + // We reached v1. Edge e can be removed and we can stop the BFS. + boost::remove_edge(e, graph); + q = { }; + break; + } else { + // Continue the BFS. + visitedVertices.insert(vv1); + q.push(vv1); + } + } + } + + // Prepare for the next iteration. + it = itNext; + + } + +} + #endif diff --git a/srcMain/main.cpp b/srcMain/main.cpp index d8968f6..cee1120 100644 --- a/srcMain/main.cpp +++ b/srcMain/main.cpp @@ -1,4 +1,3 @@ -// Main program for the Shasta static executable. 
// The static executable provides // basic functionality and reduced performance. // For full functionality use the shared library built @@ -220,6 +219,18 @@ void shasta::main::assemble( "and is now required to run an assembly."); } + // Check --Kmers.k. + // Using Kmer=ShortBaseSequence16 limits it to 16 bases. + // But alignment methods adds 100 to KmerIds to deal + // with the Seqan gap value 45, so this means + // that we cannot use k=16. + // Therefore the maximum allowed value is 15. + // We also reject values that are grossly too low. + if(assemblerOptions.kmersOptions.k > 31 or assemblerOptions.kmersOptions.k < 6) { + throw runtime_error("Invalid value specified for --Kmers.k. " + "Must be between 6 and 31"); + } + // Check that we have at least one input file. if(assemblerOptions.commandLineOnlyOptions.inputFileNames.empty()) { cout << assemblerOptions.allOptionsDescription << endl; @@ -228,17 +239,19 @@ void shasta::main::assemble( } // Check assemblerOptions.minHashOptions.version. - if( assemblerOptions.minHashOptions.version!=0 and - assemblerOptions.minHashOptions.version!=1) { + if( assemblerOptions.minHashOptions.version!=0) { throw runtime_error("Invalid value " + to_string(assemblerOptions.minHashOptions.version) + - " specified for --MinHash.version. Must be 0 or 1."); + " specified for --MinHash.version. Must be 0."); } // Check assemblerOptions.minHashOptions minimum/maximum bucket size. - if( assemblerOptions.minHashOptions.maxBucketSize <= - assemblerOptions.minHashOptions.minBucketSize) { - throw runtime_error("MinHash maximum bucket size must be greater than minimum bucket size. 
" + const bool dynamicMinHashBucketRange = + (assemblerOptions.minHashOptions.minBucketSize == 0) and + (assemblerOptions.minHashOptions.maxBucketSize == 0); + if((not dynamicMinHashBucketRange) and (assemblerOptions.minHashOptions.maxBucketSize <= + assemblerOptions.minHashOptions.minBucketSize)) { + throw runtime_error("Invalid MinHash min/max bucket sizes specified. " "The following values were specified:" " minimum bucket size " + to_string(assemblerOptions.minHashOptions.minBucketSize) + @@ -258,9 +271,9 @@ void shasta::main::assemble( if( assemblerOptions.alignOptions.alignMethod < 0 or assemblerOptions.alignOptions.alignMethod == 2 or - assemblerOptions.alignOptions.alignMethod > 4) { + assemblerOptions.alignOptions.alignMethod > 5) { throw runtime_error("Align method " + to_string(assemblerOptions.alignOptions.alignMethod) + - " is not valid. Valid options are 0, 1, 3, and 4."); + " is not valid. Valid options are 0, 1, 3, 4, and 5."); } if(assemblerOptions.readGraphOptions.creationMethod != 0 and @@ -284,6 +297,10 @@ void shasta::main::assemble( assemblerOptions.readGraphOptions.strandSeparationMethod != 2) { throw runtime_error("--Assembly.mode 2 requires --ReadGraph.strandSeparationMethod 2."); } + if(assemblerOptions.assemblyOptions.mode == 3 and + assemblerOptions.readGraphOptions.strandSeparationMethod != 2) { + throw runtime_error("--Assembly.mode 3 requires --ReadGraph.strandSeparationMethod 2."); + } // Find absolute paths of the input files. // We will use them below after changing directory to the output directory. 
@@ -370,11 +387,12 @@ void shasta::main::assemble( assemblerOptions.commandLineOnlyOptions.memoryMode != "filesystem") { cout << "This run uses options \"--memoryBacking " << assemblerOptions.commandLineOnlyOptions.memoryBacking << " --memoryMode " << assemblerOptions.commandLineOnlyOptions.memoryMode << "\".\n" - "This could result in performance degradation.\n" - "For full performance, use \"--memoryBacking 2M --memoryMode filesystem\"\n" + "This could result in longer run time.\n" + "For faster assembly, use \"--memoryBacking 2M --memoryMode filesystem\"\n" "(root privilege via sudo required).\n" "Therefore the results of this run should not be used\n" - "for benchmarking purposes." << endl; + "for the purpose of benchmarking assembly time.\n" + "However the memory options don't affect assembly results in any way." << endl; } // Create the Assembler. @@ -391,11 +409,12 @@ void shasta::main::assemble( assemblerOptions.commandLineOnlyOptions.memoryMode != "filesystem") { cout << "This run used options \"--memoryBacking " << assemblerOptions.commandLineOnlyOptions.memoryBacking << " --memoryMode " << assemblerOptions.commandLineOnlyOptions.memoryMode << "\".\n" - "This could have resulted in performance degradation.\n" - "For full performance, use \"--memoryBacking 2M --memoryMode filesystem\"\n" + "This could result in longer run time.\n" + "For faster assembly, use \"--memoryBacking 2M --memoryMode filesystem\"\n" "(root privilege via sudo required).\n" "Therefore the results of this run should not be used\n" - "for benchmarking purposes." << endl; + "for the purpose of benchmarking assembly time.\n" + "However the memory options don't affect assembly results in any way." << endl; } // Write out the build id again. @@ -530,8 +549,10 @@ void shasta::main::assemble( cout << "This assembly will use " << threadCount << " threads." << endl; // Set up the consensus caller. 
- cout << "Setting up consensus caller " << - assemblerOptions.assemblyOptions.consensusCaller << endl; + if(assembler.getReads().representation == 1) { + cout << "Setting up consensus caller " << + assemblerOptions.assemblyOptions.consensusCaller << endl; + } assembler.setupConsensusCaller(assemblerOptions.assemblyOptions.consensusCaller); @@ -588,71 +609,25 @@ void shasta::main::assemble( performanceLog << timestamp << "Done loading reads from " << inputFileNames.size() << " files." << endl; performanceLog << "Read loading took " << seconds(t1-t0) << "s." << endl; + // Find duplicate reads and handle them according to the setting + // of --Reads.handleDuplicates. + assembler.findDuplicateReads(assemblerOptions.readsOptions.handleDuplicates); - - // Select the k-mers that will be used as markers. - switch(assemblerOptions.kmersOptions.generationMethod) { - case 0: - assembler.randomlySelectKmers( - assemblerOptions.kmersOptions.k, - assemblerOptions.kmersOptions.probability, 231); - break; - - case 1: - // Randomly select the k-mers to be used as markers, but - // excluding those that are globally overenriched in the input reads, - // as measured by total frequency in all reads. - assembler.selectKmersBasedOnFrequency( - assemblerOptions.kmersOptions.k, - assemblerOptions.kmersOptions.probability, 231, - assemblerOptions.kmersOptions.enrichmentThreshold, threadCount); - break; - - case 2: - // Randomly select the k-mers to be used as markers, but - // excluding those that are overenriched even in a single oriented read. - assembler.selectKmers2( - assemblerOptions.kmersOptions.k, - assemblerOptions.kmersOptions.probability, 231, - assemblerOptions.kmersOptions.enrichmentThreshold, threadCount); - break; - - case 3: - // Read the k-mers to be used as markers from a file. - if(assemblerOptions.kmersOptions.file.empty() or - assemblerOptions.kmersOptions.file[0] != '/') { - throw runtime_error("Option --Kmers.file must specify an absolute path. 
" - "A relative path is not accepted."); - } - assembler.readKmersFromFile( - assemblerOptions.kmersOptions.k, - assemblerOptions.kmersOptions.file); - break; - - case 4: - // Randomly select the k-mers to be used as markers, but - // excluding those that appear in two copies close to each other - // even in a single oriented read. - assembler.selectKmers4( - assemblerOptions.kmersOptions.k, - assemblerOptions.kmersOptions.probability, 231, - assemblerOptions.kmersOptions.distanceThreshold, threadCount); - break; - - default: - throw runtime_error("Invalid --Kmers generationMethod. " - "Specify a value between 0 and 4, inclusive."); - } - - + // Initialize the KmerChecker, which has the information needed + // to decide if a k-mer is a marker. + assembler.createKmerChecker(assemblerOptions.kmersOptions, threadCount); // Find the markers in the reads. assembler.findMarkers(0); - if(!assemblerOptions.readsOptions.palindromicReads.skipFlagging) { + // Gather marker KmerIds for all markers. + // They are used by LowHash and alignment computation. + // These will be kept until we are done computing alignments. + assembler.computeMarkerKmerIds(threadCount); - // Flag palindromic reads. - // These will be excluded from further processing. + // Flag palindromic reads. + // These will be excluded from further processing. + if(!assemblerOptions.readsOptions.palindromicReads.skipFlagging) { assembler.flagPalindromicReads( assemblerOptions.readsOptions.palindromicReads.maxSkip, assemblerOptions.readsOptions.palindromicReads.maxDrift, @@ -663,12 +638,11 @@ void shasta::main::assemble( threadCount); } - - // Find alignment candidates. if(assemblerOptions.minHashOptions.allPairs) { assembler.markAlignmentCandidatesAllPairs(); - } else if(assemblerOptions.minHashOptions.version == 0) { + } else { + SHASTA_ASSERT(assemblerOptions.minHashOptions.version == 0); // Already checked for that. 
assembler.findAlignmentCandidatesLowHash0( assemblerOptions.minHashOptions.m, assemblerOptions.minHashOptions.hashFraction, @@ -679,17 +653,6 @@ void shasta::main::assemble( assemblerOptions.minHashOptions.maxBucketSize, assemblerOptions.minHashOptions.minFrequency, threadCount); - } else { - SHASTA_ASSERT(assemblerOptions.minHashOptions.version == 1); // Already checked for that. - assembler.findAlignmentCandidatesLowHash1( - assemblerOptions.minHashOptions.m, - assemblerOptions.minHashOptions.hashFraction, - assemblerOptions.minHashOptions.minHashIterationCount, - 0, - assemblerOptions.minHashOptions.minBucketSize, - assemblerOptions.minHashOptions.maxBucketSize, - assemblerOptions.minHashOptions.minFrequency, - threadCount); } @@ -711,6 +674,9 @@ void shasta::main::assemble( assemblerOptions.alignOptions, threadCount); + // Marker KmerIds are freed here. + // They can always be recomputed from the reads when needed. + assembler.cleanupMarkerKmerIds(); // Create the read graph. @@ -943,14 +909,6 @@ void shasta::main::mode0Assembly( assemblerOptions.markerGraphOptions.highCoverageThreshold, assemblerOptions.markerGraphOptions.maxDistance, assemblerOptions.markerGraphOptions.edgeMarkerSkipThreshold); - if(assemblerOptions.markerGraphOptions.reverseTransitiveReduction) { - assembler.reverseTransitiveReduction( - assemblerOptions.markerGraphOptions.lowCoverageThreshold, - assemblerOptions.markerGraphOptions.highCoverageThreshold, - assemblerOptions.markerGraphOptions.maxDistance); - } - - // Prune the marker graph. assembler.pruneMarkerGraphStrongSubgraph( @@ -1118,52 +1076,54 @@ void shasta::main::mode3Assembly( const AssemblerOptions& assemblerOptions, uint32_t threadCount) { + // Mode 3 assembly requires reads in raw representation (not RLE). + SHASTA_ASSERT(assemblerOptions.readsOptions.representation == 0); + + // The marker length must be even. + SHASTA_ASSERT((assembler.assemblerInfo->k %2) == 0); + // Create marker graph vertices. 
+ // To create a complete marker graph, generate all vertices + // regardless of coverage, and allow duplicate markers on vertices. assembler.createMarkerGraphVertices( - assemblerOptions.markerGraphOptions.minCoverage, - assemblerOptions.markerGraphOptions.maxCoverage, - assemblerOptions.markerGraphOptions.minCoveragePerStrand, - assemblerOptions.markerGraphOptions.allowDuplicateMarkers, - assemblerOptions.markerGraphOptions.peakFinderMinAreaFraction, - assemblerOptions.markerGraphOptions.peakFinderAreaStartIndex, + 1, // minVertexCoverage + std::numeric_limits<uint64_t>::max(), // maxVertexCoverage + 0, // minVertexCoveragePerStrand + true, // allowDuplicateMarkers + std::numeric_limits<double>::signaling_NaN(), // For peak finder, unused because minVertexCoverage is not 0. + invalid<uint64_t>, // For peak finder, unused because minVertexCoverage is not 0. threadCount); assembler.findMarkerGraphReverseComplementVertices(threadCount); // Create marker graph edges. - // For assembly mode 1 we use createMarkerGraphEdgesStrict - // with minimum edge coverage (total and per strand). + // Use createMarkerGraphEdgesStrict so all oriented reads on an edge + // have exactly the same sequence. + // To create a complete marker graph, generate all edges + // regardless of coverage. assembler.createMarkerGraphEdgesStrict( - assemblerOptions.markerGraphOptions.minEdgeCoverage, - assemblerOptions.markerGraphOptions.minEdgeCoveragePerStrand, threadCount); + 0, // minEdgeCoverage + 0, // minEdgeCoveragePerStrand + threadCount); assembler.findMarkerGraphReverseComplementEdges(threadCount); // Coverage histograms for vertices and edges of the marker graph. assembler.computeMarkerGraphCoverageHistogram(); - // In mode 3 assembly, we don't add secondary edges. - - // Coverage histograms for vertices and edges of the marker graph. - assembler.computeMarkerGraphCoverageHistogram(); - - // Compute optimal repeat counts for each vertex of the marker graph. 
- if(assemblerOptions.readsOptions.representation == 1) { - assembler.assembleMarkerGraphVertices(threadCount); - } - - // Compute consensus sequence for all marker graph edges. - assembler.assembleMarkerGraphEdges( - threadCount, - assemblerOptions.assemblyOptions.markerGraphEdgeLengthThresholdForConsensus, - assemblerOptions.assemblyOptions.storeCoverageData or - assemblerOptions.assemblyOptions.storeCoverageDataCsvLengthThreshold>0, - true - ); - - // Run mode 3 assembly. - assembler.mode3Assembly( + // Assemble sequence for marker graph edges. + // This assembles MarkerGraph::edgeSequence which is + // different from what happens in other assembly modes. + // See the comments before MarkerGraph::edgeSequence + // for more information. + assembler.assembleMarkerGraphEdgesMode3(); + + // Flag primary marker graph edges. + assembler.flagPrimaryMarkerGraphEdges( + assemblerOptions.assemblyOptions.mode3Options.minPrimaryCoverage, + assemblerOptions.assemblyOptions.mode3Options.maxPrimaryCoverage, threadCount); - + // Run Mode 3 assembly. + assembler.mode3Assembly(threadCount, assemblerOptions.assemblyOptions.mode3Options, false); } @@ -1321,8 +1281,10 @@ void shasta::main::explore( Assembler assembler("Data/", false, 1, 0); // Set up the consensus caller. - cout << "Setting up consensus caller " << - assemblerOptions.assemblyOptions.consensusCaller << endl; + if(assembler.getReads().representation == 1) { + cout << "Setting up consensus caller " << + assemblerOptions.assemblyOptions.consensusCaller << endl; + } assembler.setupConsensusCaller(assemblerOptions.assemblyOptions.consensusCaller); // Access all available binary data. diff --git a/staticExecutable/CMakeLists.txt b/staticExecutable/CMakeLists.txt index 824eb8f..7926d84 100644 --- a/staticExecutable/CMakeLists.txt +++ b/staticExecutable/CMakeLists.txt @@ -5,7 +5,7 @@ project(shastaStaticExecutable) add_definitions(-std=c++20) # Compilation warnings. 
-add_definitions(-Wall -Wconversion -Wno-unused-result) +add_definitions(-Wall -Wconversion -Wno-unused-result -Wno-trigraphs -Wno-psabi) # Optimization and debug options. if(BUILD_DEBUG) @@ -62,14 +62,14 @@ if(X86_64) target_link_libraries( shastaStaticExecutable shastaStaticLibrary - atomic boost_system boost_program_options boost_chrono spoa png z + atomic boost_system boost_program_options boost_chrono boost_serialization spoa png z lapack blas gfortran quadmath -Wl,--whole-archive -lpthread -Wl,--no-whole-archive) else(X86_64) target_link_libraries( shastaStaticExecutable shastaStaticLibrary - atomic boost_system boost_program_options boost_chrono spoa png z + atomic boost_system boost_program_options boost_chrono boost_serialization spoa png z lapack blas gfortran -Wl,--whole-archive -lpthread -Wl,--no-whole-archive) endif(X86_64) diff --git a/staticLibrary/CMakeLists.txt b/staticLibrary/CMakeLists.txt index 8e001eb..afa9548 100644 --- a/staticLibrary/CMakeLists.txt +++ b/staticLibrary/CMakeLists.txt @@ -5,7 +5,7 @@ project(shastaStaticLibrary) add_definitions(-std=c++20) # Compilation warnings. -add_definitions(-Wall -Wconversion -Wno-unused-result -Wno-trigraphs) +add_definitions(-Wall -Wconversion -Wno-unused-result -Wno-trigraphs -Wno-psabi) # Optimization and debug options. if(BUILD_DEBUG) |