summaryrefslogtreecommitdiff
path: root/reference.h
blob: 9b8e2e7ab9da105a15909450453ff9c7c5a8e039 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/*
 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
 *
 * This file is part of Bowtie 2.
 *
 * Bowtie 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Bowtie 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef REFERENCE_H_
#define REFERENCE_H_

#include <stdexcept>
#include <fcntl.h>
#include <sys/stat.h>
#include <utility>
#ifdef BOWTIE_MM
#include <sys/mman.h>
#include <sys/shm.h>
#endif
#include "endian_swap.h"
#include "ref_read.h"
#include "sequence_io.h"
#include "mm.h"
#include "shmem.h"
#include "timer.h"
#include "sstring.h"
#include "btypes.h"


/**
 * Concrete reference representation that bulk-loads the reference from
 * the bit-pair-compacted binary file and stores it in memory also in
 * bit-pair-compacted format.  The user may request reference
 * characters either on a per-character bases or by "stretch" using
 * getBase(...) and getStretch(...) respectively.
 *
 * Most of the complexity in this class is due to the fact that we want
 * to represent references with ambiguous (non-A/C/G/T) characters but
 * we don't want to use more than two bits per base.  This means we
 * need a way to encode the ambiguous stretches of the reference in a
 * way that is external to the bitpair sequence.  To accomplish this,
 * we use the RefRecords vector, which is stored in the .3.ebwt index
 * file.  The bitpairs themselves are stored in the .4.ebwt index file.
 *
 * Once it has been loaded, a BitPairReference is read-only, and is
 * safe for many threads to access at once.
 */
class BitPairReference {

public:
	/**
	 * Load from .3.ebwt/.4.ebwt Bowtie index files.
	 */
	BitPairReference(
		const string& in,
		bool color,
		bool sanity = false,
		EList<string>* infiles = NULL,
		EList<SString<char> >* origs = NULL,
		bool infilesSeq = false,
		bool useMm = false,
		bool useShmem = false,
		bool mmSweep = false,
		bool verbose = false,
		bool startVerbose = false);

	~BitPairReference();

	/**
	 * Return a single base of the reference.  Calling this repeatedly
	 * is not an efficient way to retrieve bases from the reference;
	 * use loadStretch() instead.
	 *
	 * This implementation scans linearly through the records for the
	 * unambiguous stretches of the target reference sequence.  When
	 * there are many records, binary search would be more appropriate.
	 */
	int getBase(size_t tidx, size_t toff) const;

	/**
	 * Load a stretch of the reference string into memory at 'dest'.
	 *
	 * This implementation scans linearly through the records for the
	 * unambiguous stretches of the target reference sequence.  When
	 * there are many records, binary search would be more appropriate.
	 */
	int getStretchNaive(
		uint32_t *destU32,
		size_t tidx,
		size_t toff,
		size_t count) const;

	/**
	 * Load a stretch of the reference string into memory at 'dest'.
	 *
	 * This implementation scans linearly through the records for the
	 * unambiguous stretches of the target reference sequence.  When
	 * there are many records, binary search would be more appropriate.
	 */
	int getStretch(
		uint32_t *destU32,
		size_t tidx,
		size_t toff,
		size_t count
		ASSERT_ONLY(, SStringExpandable<uint32_t>& destU32_2)) const;

	/**
	 * Return the number of reference sequences.
	 */
	TIndexOffU numRefs() const {
		return nrefs_;
	}

	/**
	 * Return the approximate length of a reference sequence (it might leave
	 * off some Ns on the end).
	 *
	 * TODO: Is it still true that it might leave off Ns?
	 */
	TIndexOffU approxLen(TIndexOffU elt) const {
		assert_lt(elt, nrefs_);
		return refLens_[elt];
	}

	/**
	 * Return true iff buf_ and all the vectors are populated.
	 */
	bool loaded() const {
		return loaded_;
	}
	
	/**
	 * Given a reference sequence id, return its offset into the pasted
	 * reference string; i.e., return the number of unambiguous nucleotides
	 * preceding it.
	 */
	TIndexOffU pastedOffset(TIndexOffU idx) const {
		return refOffs_[idx];
	}

	/**
	 * Parse the input fasta files, populating the szs list and writing the
	 * .3.ebwt and .4.ebwt portions of the index as we go.
	 */
	static std::pair<size_t, size_t>
	szsFromFasta(
		EList<FileBuf*>& is,
		const string& outfile,
		bool bigEndian,
		const RefReadInParams& refparams,
		EList<RefRecord>& szs,
		bool sanity);
	
protected:

	uint32_t byteToU32_[256];

	EList<RefRecord> recs_;       /// records describing unambiguous stretches
	// following two lists are purely for the binary search in getStretch
	EList<TIndexOffU> cumUnambig_; // # unambig ref chars up to each record
	EList<TIndexOffU> cumRefOff_;  // # ref chars up to each record
	EList<TIndexOffU> refLens_;    /// approx lens of ref seqs (excludes trailing ambig chars)
	EList<TIndexOffU> refOffs_;    /// buf_ begin offsets per ref seq
	EList<TIndexOffU> refRecOffs_; /// record begin/end offsets per ref seq
	uint8_t *buf_;      /// the whole reference as a big bitpacked byte array
	uint8_t *sanityBuf_;/// for sanity-checking buf_
	TIndexOffU bufSz_;    /// size of buf_
	TIndexOffU bufAllocSz_;
	TIndexOffU nrefs_;    /// the number of reference sequences
	bool     loaded_;   /// whether it's loaded
	bool     sanity_;   /// do sanity checking
	bool     useMm_;    /// load the reference as a memory-mapped file
	bool     useShmem_; /// load the reference into shared memory
	bool     verbose_;
	ASSERT_ONLY(SStringExpandable<uint32_t> tmp_destU32_);
};

#endif