src/MarkerGraph.hpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332

#ifndef SHASTA_MARKER_GRAPH_HPP
#define SHASTA_MARKER_GRAPH_HPP

#include "MarkerInterval.hpp"
#include "MemoryMappedVectorOfVectors.hpp"
#include "MultithreadedObject.hpp"
#include "shastaTypes.hpp"
#include "Uint.hpp"

#include "cstdint.hpp"
#include "memory.hpp"

namespace shasta {

    class Base;
    class MarkerGraph;
    class CompressedCoverageData;

    extern template class MultithreadedObject<MarkerGraph>;
}


class shasta::MarkerGraph : public MultithreadedObject<MarkerGraph> {
public:

    using VertexId = MarkerGraphVertexId;
    using EdgeId = MarkerGraphEdgeId;
    static const VertexId invalidVertexId;
    static const EdgeId invalidEdgeId;

    // To save memory, store vertex ids using 5 bytes.
    // This allows for up to 2^40 = 1 Ti markers (both strands).
    // A human size run with 40x coverage and 10% markers
    // has around 25 G markers (both strands).
    using CompressedVertexId = Uint40;
    static const CompressedVertexId invalidCompressedVertexId;

    MarkerGraph();

    // The marker ids of the markers corresponding to
    // each vertex of the global marker graph.
    // Indexed by VertexId.
    // For a given vertex, the marker ids are sorted.
    // Stored as a shared pointer to permit easy replacement of the vertices.
    shared_ptr< MemoryMapped::VectorOfVectors<MarkerId, CompressedVertexId> > verticesPointer;
    void constructVertices()
    {
        verticesPointer = make_shared<MemoryMapped::VectorOfVectors<MarkerId, CompressedVertexId> >();
    }
    void destructVertices() {
        verticesPointer = 0;
    }


    // Vertices access functions.
    // Return the number of vertices.
    MemoryMapped::VectorOfVectors<MarkerId, CompressedVertexId>& vertices()
    {
        return *verticesPointer;
    }
    const MemoryMapped::VectorOfVectors<MarkerId, CompressedVertexId>& vertices() const
    {
        return *verticesPointer;
    }
    uint64_t vertexCount() const {
        return verticesPointer->size();
    }
    // Return the number of markers for a given vertex.
    uint64_t vertexCoverage(VertexId vertexId) const
    {
        return verticesPointer->size(vertexId);
    }
    // Return the marker ids for a given vertex.
    span<MarkerId> getVertexMarkerIds(VertexId vertexId) {
        return vertices()[vertexId];
    }
    span<const MarkerId> getVertexMarkerIds(VertexId vertexId) const {
        return vertices()[vertexId];
    }

    void remove();

    // The global marker graph vertex corresponding to each marker.
    // Indexed by MarkerId.
    // For markers that don't correspond to a marker graph vertex,
    // this stores invalidCompressedVertexId.
    MemoryMapped::Vector<CompressedVertexId> vertexTable;


    // This renumbers the vertex table to make sure that
    // vertices are numbered contiguously starting at 0.
    // This must be called after the vertexTable is changed,
    // as in Assembler::cleanupDuplicateMarkers.
    // After this is called, all other data structures
    // are inconsistent and need to be recreated.
    // The second version can be called if the maximum vertex id
    // present in the vertex table is already known, and is faster.
    // Returns the maximmum vertex id after renumbering.
    VertexId renumberVertexTable(size_t threadCount);
    VertexId renumberVertexTable(size_t threadCount, VertexId maxVertexId);
private:
    void renumberVertexTableThreadFunction1(size_t threadId);
    void renumberVertexTableThreadFunction2(size_t threadId);
    class RenumberVertexTableData {
    public:
        // Set to true for VertexId values represented in the starting vertexTable.
        MemoryMapped::Vector<bool> isPresent;

        // The new VertexId corresponding to each old VertexId.
        MemoryMapped::Vector<VertexId> newVertexId;
    };
    RenumberVertexTableData renumberVertexTableData;


    // Find the maximum valid VertexId in the vertex table.
    VertexId findMaxVertexTableEntry(size_t threadCount);
    void findMaxVertexTableEntryThreadFunction(size_t threadId);
    class FindMaxVertexTableEntryData {
    public:
        // The maximum VertexId found by each thread.
        vector<VertexId> threadMaxVertexId;
    };
    FindMaxVertexTableEntryData findMaxVertexTableEntryData;
public:


    // Recreate the vertices from the vertexTable.
    // This assumes that valid VertexId's in the vertex table
    // are numbered contiguously starting at 0 (call renumberVertexTable to ensure that).
    void createVerticesFromVertexTable(size_t threadCount, VertexId maxVertexId);
private:
    void createVerticesFromVertexTableThreadFunction1(size_t threadId);
    void createVerticesFromVertexTableThreadFunction2(size_t threadId);
    void createVerticesFromVertexTableThreadFunction3(size_t threadId);
    void createVerticesFromVertexTableThreadFunction4(size_t threadId);
    class CreateVerticesFromVertexTableData {
    public:
        // Like the vertices, but the second template argument is VertexId
        // instead of CompressedVertexId. This is necessariy to be able to
        // work on it in multilthreaded code efficiently.
        MemoryMapped::VectorOfVectors<MarkerId, VertexId> vertices;
    };
    CreateVerticesFromVertexTableData createVerticesFromVertexTableData;
public:


    // Remove marker graph vertices and update vertices and vertexTable.
    // After this is called, the only
    // two MarkerGraph field filled in are vertices and vertexTable.
    // Everything else has to be recreated.
    void removeVertices(
        const MemoryMapped::Vector<VertexId>& verticesToBeKept,
        uint64_t pageSize,
        uint64_t threadCount);
private:
    class RemoveVerticesData {
    public:
        const MemoryMapped::Vector<VertexId>* verticesToBeKept;
        shared_ptr<MemoryMapped::VectorOfVectors<MarkerId, CompressedVertexId> > newVerticesPointer;
    };
    RemoveVerticesData removeVerticesData;
    void removeVerticesThreadFunction1(size_t threadId);
    void removeVerticesThreadFunction2(size_t threadId);
    void removeVerticesThreadFunction3(size_t threadId);
public:


    // The reverse complement of each vertex.
    // Indexed by VertexId.
    MemoryMapped::Vector<VertexId> reverseComplementVertex;

    // The edges of the marker graph.
    class Edge {
    public:
        Uint40 source;  // The source vertex (index into globalMarkerGraphVertices).
        Uint40 target;  // The target vertex (index into globalMarkerGraphVertices).
        uint8_t coverage;   // (255 indicates 255 or more).

        // Flags used to mark the edge as removed from the marker graph.
        bool wasRemoved() const
        {
            return
                wasRemovedByTransitiveReduction ||
                wasPruned ||
                isLowCoverageCrossEdge ||
                isSuperBubbleEdge ||
                wasRemovedWhileSplittingSecondaryEdges
                ;
        }

        // Flag that is set if the edge was removed during
        // approximate transitive reduction by flagWeakMarkerGraphEdges.
        uint8_t wasRemovedByTransitiveReduction : 1;

        // Set if this edge was removed during pruning.
        uint8_t wasPruned : 1;

        // Set if this edge belongs to a bubble/superbubble that was removed.
        uint8_t isSuperBubbleEdge : 1;

        // Flag set if this edge corresponds to a low coverage cross edge
        // of the assembly graph.
        uint8_t isLowCoverageCrossEdge: 1;

        // Flag set if this edge was assembled.
        // If set, edgeConsensusOverlappingBaseCount and edgeConsensus
        // for this edge are set.
        uint8_t wasAssembled : 1;

        // Flag for secondary edges in assembly mode 1.
        uint8_t isSecondary;

        // This is set for secondary edges that are created and later split.
        // Assembly mode 2 only.
        uint8_t wasRemovedWhileSplittingSecondaryEdges : 1;

        // Unused.
        uint8_t flag6 : 1;

        void clearFlags()
        {
            wasRemovedByTransitiveReduction = 0;
            wasPruned = 0;
            isSuperBubbleEdge = 0;
            isLowCoverageCrossEdge = 0;
            wasAssembled = 0;
            isSecondary = 0;
            wasRemovedWhileSplittingSecondaryEdges = 0;
            flag6 = 0;
        }
        Edge() :
            source(MarkerGraph::invalidCompressedVertexId),
            target(MarkerGraph::invalidCompressedVertexId),
            coverage(0)
        {
            clearFlags();
        }
    };
    MemoryMapped::Vector<Edge> edges;
    const Edge* findEdge(Uint40 source, Uint40 target) const;
    EdgeId findEdgeId(Uint40 source, Uint40 target) const;

    // The MarkerIntervals for each of the above edges.
    MemoryMapped::VectorOfVectors<MarkerInterval, uint64_t> edgeMarkerIntervals;

    // The edges that each vertex is the source of.
    // Contains indexes into the above edges vector.
    MemoryMapped::VectorOfVectors<Uint40, uint64_t> edgesBySource;

    // The edges that each vertex is the target of.
    // Contains indexes into the above edges vector.
    MemoryMapped::VectorOfVectors<Uint40, uint64_t> edgesByTarget;

    // Compute in-degree or out-degree of a vertex,
    // counting only edges that were not removed.
    uint64_t inDegree(VertexId) const;
    uint64_t outDegree(VertexId) const;
    EdgeId getFirstNonRemovedOutEdge(VertexId) const;
    EdgeId getFirstNonRemovedInEdge(VertexId) const;

    // The reverse complement of each edge.
    // Indexed by EdgeId.
    MemoryMapped::Vector<EdgeId> reverseComplementEdge;

    // Return total coverage of an edge.
    uint64_t edgeCoverage(EdgeId edgeId) const
    {
        return edgeMarkerIntervals.size(edgeId);
    }

    // Return coverage for each strand for an edge.
    array<uint64_t, 2> edgeStrandCoverage(EdgeId edgeId) const
    {
        array<uint64_t, 2> coverage = {0, 0};
        for(const MarkerInterval& markerInterval: edgeMarkerIntervals[edgeId]) {
            ++coverage[markerInterval.orientedReadId.getStrand()];
        }
        return coverage;
    }

    // The consensus repeat counts of each vertex of the marker graph.
    // There are assemblerInfo->k entries for each vertex.
    // The first entry for a vertex is at index vertexId*assemblerInfo->k.
    MemoryMapped::Vector<uint8_t> vertexRepeatCounts;

    // Consensus sequence and repeat counts for each marker graph edge.
    // This excludes the sequence of flanking markers and their repeat counts.
    // Indexed by the marker graph edge id.
    // - For edges that were marked as removed,
    //   edgeConsensusOverlappingBaseCount is 0 and edgeConsensus is empty.
    // - For edges that were not marked as removed:
    //   * If the consensus sequence has one or more intervening bases
    //     between the flanking markers,
    //     edgeConsensusOverlappingBaseCount is 0 and edgeConsensus
    //     stores those intervening bases with their repeat count consensus.
    //   * Otherwise, edgeConsensus is empty and
    //     edgeConsensusOverlappingBaseCount stores the number of
    //     overlapping bases (for the consensus sequence)
    //     between the two flanking markers. This can be zero
    //     if the consensus sequence has the flanking markers
    //     exactly adjacent.
    MemoryMapped::VectorOfVectors<pair<Base, uint8_t>, uint64_t> edgeConsensus;
    MemoryMapped::Vector<uint8_t> edgeConsensusOverlappingBaseCount;


    // Details of vertex coverage.
    // These are not stored by default.
    // They can be used to calibrate the Bayesian model for repeat counts
    // and for some types of analyses.
    // Indeed by VertexId. For each vertex, contains pairs (position, CompressedCoverageData),
    // ordered by position.
    // Note that the bases at a given position are all identical by construction.
    MemoryMapped::VectorOfVectors<pair<uint32_t, CompressedCoverageData>, uint64_t>
        vertexCoverageData;

    // Details of edge coverage.
    // These are not stored by default.
    // They can be used to calibrate the Bayesian model for repeat counts
    // and for some types of analyses.
    // Indeed by EdgeId. For each edge, contains pairs (position, CompressedCoverageData),
    // ordered by position.
    MemoryMapped::VectorOfVectors<pair<uint32_t, CompressedCoverageData>, uint64_t>
        edgeCoverageData;
};

#endif