summaryrefslogtreecommitdiff
path: root/ONElib.h
blob: 319ef6bb0789b2bb99bc909b8b874473a65a949c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
/******************************************************************************************
 *
 *  File: ONElib.h
 *    Header for ONE file reading and writing
 *
 *  Authors: Richard Durbin (rd109@cam.ac.uk), Gene Myers (myers@mpi-cbg.de)
 *  Copyright (C) Richard Durbin, Gene Myers, 2019-
 *
 * HISTORY:
 * Last edited: Dec  3 06:08 2022 (rd109)
 * * Dec  3 06:01 2022 (rd109): remove oneWriteHeader(), switch to stdarg for oneWriteComment etc.
 *   * Dec 27 09:46 2019 (gene): style edits
 *   * Created: Sat Feb 23 10:12:43 2019 (rd109)
 *
 *****************************************************************************************/

#ifndef ONE_DEFINED
#define ONE_DEFINED

#include <stdio.h>    // for FILE etc.
#include <stdarg.h>   // for formatted writing in oneWriteComment(), oneAddProvenance()
#include <string.h>   // for strlen() used by the oneNextString() macro
#include <inttypes.h> // for standard size int types and their PRI print macros
#include <stdbool.h>  // for standard bool types
#include <limits.h>   // for INT_MAX etc.
#include <pthread.h>

/***********************************************************************************
 *
 *    DATA TYPES
 *
 **********************************************************************************/

// Basic Types
#ifndef U8_DEFINED
#define U8_DEFINED

// Short aliases for the fixed-width signed integer types, plus an unsigned byte type.
//   The whole group is skipped when U8_DEFINED is already defined (presumably by another
//   header that provides the same aliases).
typedef int8_t        I8;
typedef int16_t       I16;
typedef int32_t       I32;
typedef int64_t       I64;
typedef unsigned char U8;

#endif // U8_DEFINED

// The field types that may appear in a line definition.  Numbering starts at 1, so the
//   value 0 is not a valid OneType.
typedef enum { oneINT = 1, oneREAL, oneCHAR, oneSTRING,
	       oneINT_LIST, oneREAL_LIST, oneSTRING_LIST, oneDNA } OneType;
// Printable name of each OneType, indexed by the enum value; the array is defined
//   elsewhere, with the contents shown in the comment below (entry 0 is a null pointer).
extern char* oneTypeString[] ; 
// = { 0, "INT", "REAL", "CHAR", "STRING", "INT_LIST", "REAL_LIST", "STRING_LIST", "DNA" } ;

// Value of one field of a line.  Which member is meaningful depends on the type of
//   the field; the members are normally reached through the oneInt / oneReal / oneChar /
//   oneLen macros rather than directly.
typedef union
  { I64    i;   // integer field value (oneInt)
    double r;   // real field value (oneReal)
    char   c;   // character field value (oneChar)
    I64    len; // For lists : top 8 bits encode excess bytes, low 56 length
  } OneField;

// One provenance entry of a file header (the OneFile 'provenance' array holds
//   count['!'] of these).
//   NOTE(review): 'program' and 'version' presumably correspond to the 'prog' and 'version'
//   arguments of oneAddProvenance(), 'command' to its formatted text and 'date' to a
//   timestamp - confirm against ONElib.c.
typedef struct
  { char *program;
    char *version;
    char *command;
    char *date;
  } OneProvenance;

// One reference entry of a file header.  The same record type is used for both the
//   OneFile 'reference' array (count['<'] entries) and the 'deferred' array
//   (count['>'] entries).
typedef struct
  { char *filename; 
    I64   count;     // NOTE(review): presumably the 'count' given to oneAddReference() - confirm
  } OneReference;

// Count statistics kept per line type (see the 'accum' and 'given' members of OneInfo).
//   NOTE(review): the meaning of the individual members is not spelled out in this header.
//   Presumably count = number of lines of the type, max = largest list length,
//   total = summed list length, and groupCount / groupTotal the corresponding per-group
//   figures - confirm against ONElib.c before relying on this.
typedef struct
  { I64 count;
    I64 max;
    I64 total;
    I64 groupCount;
    I64 groupTotal;
  } OneCounts;

  // OneCodecs are a private package for binary one file compression.
  // Because OneCodec is an alias of void, a codec can only be handled through
  // a OneCodec * pointer; its contents are not visible to users of this header.

typedef void OneCodec; // forward declaration of opaque type for compression codecs

  // DNAcodec is a special pre-existing compressor one should use for DNA.
  // It compresses every base to 2-bits, where any non-ACGT letter is
  // effectively converted to an A.  Compression is case insensitive,
  // but decompression always delivers lower-case.

extern  OneCodec *DNAcodec;

  // Record for a particular line type.  There is at most one list element.
  //   It gathers the count statistics, the field layout, the list buffer and the
  //   list-compression state for that line type.

typedef struct
  { OneCounts accum;            // counts read or written to this moment
    OneCounts given;            // counts read from header
    I64       gCount;           // used internally to calculate groupCount and groupTotal
    I64       gTotal;           //   (companion of gCount)
    I64       oCount;           // # of objects in prefix before first group (if any)
    I64       oTotal;           // + of objects in prefix (these 2 are for thread parallel apps)
                                //   NOTE(review): the leading '+' may be a typo for '#' - confirm

    int       nField;           // number of fields
    OneType  *fieldType;        // type of each field
    int       listEltSize;      // size of list field elements (if present, else 0)
    int       listField;        // field index of list
    char     *comment;          // the comment on the definition line in the schema
    
    bool      isUserBuf;        // flag for whether buffer is owned by user
    I64       bufSize;          // system buffer and size if not user supplied
    void     *buffer;           //   (the buffer that bufSize refers to; holds the list of the line)

    OneCodec *listCodec;       // compression codec and flags
    bool      isUseListCodec;  // on once enough data collected to train associated codec
    char      binaryTypePack;   // binary code for line type, bit 8 set.
                                //     bit 0: list compressed
                                //   NOTE(review): "bit 8" presumably means the top bit (0x80), as
                                //     binary line codes are >= 128; the format notes at the end of
                                //     this header describe two flag bits (fields, list) - confirm
                                //     which description is current.
    I64       listTack;         // accumulated training data for this thread's list codec (master)
  } OneInfo;

  // The schema type - the first record is the header spec, then a linked list of primary classes.
  //   Each record after the first describes one primary file type.

typedef struct OneSchema
  {
    char      *primary ;        // name of the primary file type (schema 'P' line)
    int        nSecondary ;     // number of entries in 'secondary'
    char     **secondary ;      // names of the secondary file types (schema 'S' lines)
    OneInfo   *info[128] ;      // per-linetype records; NOTE(review): presumably indexed by the
                                //   line type character, as OneFile.info is - confirm
    int        nFieldMax ;      // NOTE(review): presumably the largest nField over all line types
    char       objectType ;     // line character of the object type (schema 'O' line)
    char       groupType ;      // line character of the group type (schema 'G' line), if any
    struct OneSchema *nxt ;     // next record in the linked list
  } OneSchema ;

// Singly linked list of text items; OneFile.headerText points to such a list, holding
//   the arbitrary descriptive text that goes with the header.
typedef struct OneHeaderText
  { char *text ;                  // the text of this item
    struct OneHeaderText *nxt ;   // next item in the list
  } OneHeaderText ;

  // The main OneFile type - this is the primary handle used by the end user.
  //   The members fall into three groups: one flag the user may set, state the user may
  //   read but must not modify, and state that is private to the package.

typedef struct
  {
    // this field may be set by the user

    bool           isCheckString;      // set if want to validate string char by char

    // these fields may be read by user - but don't change them!

    char          *fileType;           // primary file type
    char          *subType;            // secondary file type (subtype), if any
    char           lineType;           // current lineType
    char           objectType;         // line designation character for primary objects
    char           groupType;          // line designation character for groups (optional)
    I64            line;               // current line number
    I64            byte;               // current byte position when writing binary
    I64            object;             // current object - incremented when object line read
    I64            group;              // current group - incremented when group line read
    OneProvenance *provenance;         // if non-zero then count['!'] entries
    OneReference  *reference;          // if non-zero then count['<'] entries
    OneReference  *deferred;           // if non-zero then count['>'] entries
    OneField      *field;              // used to hold the current line - accessed by macros
    OneInfo       *info[128];          // all the per-linetype information
    I64            codecTrainingSize;  // amount of data to see before building codec

    // fields below here are private to the package

    FILE  *f;                      // the underlying stdio stream

    bool   isWrite;                // true if open for writing
    bool   isHeaderOut;            // true if header already written
    bool   isBinary;               // true if writing a binary file
    bool   inGroup;                // set once inside a group
    bool   isLastLineBinary;       // needed to deal with newlines on ascii files
    bool   isIndexIn;              // index read in
    bool   isBig;                  // are we on a big-endian machine?
    bool   isNoAsciiHeader;        // backdoor for ONEview to avoid writing header in ascii

    char   lineBuf[128];           // working buffers
    char   numberBuf[32];
    int    nFieldMax;
    I64    codecBufSize;
    char  *codecBuf;
    I64    nBits;                  // number of bits of list currently in codecBuf
    I64    intListBytes;           // number of bytes per integer in the compacted INT_LIST
    I64    linePos;                // current line position
    OneHeaderText *headerText;     // arbitrary descriptive text that goes with the header

    char   binaryTypeUnpack[256];  // invert binary line code to ASCII line character.
    int    share;                  // index if slave of threaded write, +nthreads > 0 if master
    int    isFinal;                // oneFinalizeCounts has been called on file
    pthread_mutex_t fieldLock;     // Mutexes to protect training accumulation stats when threaded
    pthread_mutex_t listLock;
  } OneFile;                      //   the footer will be in the concatenated result.
  // NOTE(review): the trailing comment on the closing line above looks like a fragment that
  //   has been separated from its original sentence - confirm, then move or remove it.


/***********************************************************************************
 *
 *    ROUTINES FOR READING & WRITING ONE FILES IN BOTH ASCII & BINARY (TRANSPARENTLY)
 *
 **********************************************************************************/

//  CREATING AND DESTROYING SCHEMAS

// Build a schema from the schema file at 'path', or from schema text held in memory.
//   The schema syntax and a worked example are given in the notes that follow.
//   NOTE(review): the result on failure is not stated in this header - presumably NULL; confirm.
OneSchema *oneSchemaCreateFromFile (char *path) ;
OneSchema *oneSchemaCreateFromText (char *text) ;

  // These functions create a schema handle that can be used to open One-code data files 
  //   for reading and writing.  A schema file is itself a One-code file, consisting of
  //   a set of objects, one per primary file type.  Valid lines in this file are:
  //      P <primary file type>   // a short string
  //      S <secondary file type> // a short string - any number of these
  //      O <char> <field_list>   // definition of object type
  //      G <char> <field_list>   // definition of group type - first field must be an int
  //      D <char> <field_list>   // definition of line
  //   <char> must be a lower or upper case letter.
  //   <field_list> is a list of field types from:
  //      CHAR, INT, REAL, STRING, INT_LIST, REAL_LIST, STRING_LIST, DNA
  //      Only one list type (STRING, *_LIST or DNA) is allowed per line type.
  //   All the D lines following an O line apply to that object type.
  //   By convention comments on each line explain the definition.
  //   Example, with lists and strings preceded by their length in OneCode style
  //      P 3 seq                            this is a sequence file
  //      O S 1 3 DNA                        the DNA sequence - each S line starts an object
  //      D Q 1 6 STRING                     the phred encoded quality score + ASCII 33
  //      D N 4 4 REAL 4 REAL 4 REAL 4 REAL  signal to noise ratio in A, C, G, T channels
  //      G g 2 3 INT 6 STRING               group designator: number of objects, name
  // The ...FromText() alternative writes the text to a temp file and reads it with 
  //   oneSchemaCreateFromFile(). This allows code to set the schema.
  // Internally a schema is a linked list of OneSchema objects, with the first holding
  //   the (hard-coded) schema for the header and footer, and the remainder each 
  //   corresponding to one primary file type.

// Dispose of a schema obtained from oneSchemaCreateFromFile() / oneSchemaCreateFromText().
//   NOTE(review): presumably releases the whole linked list starting at 'schema' - confirm.
void oneSchemaDestroy (OneSchema *schema) ;

//  READING ONE FILES:

OneFile *oneFileOpenRead (const char *path, OneSchema *schema, char *type, int nthreads) ;

  // Open ONE file 'path', either binary or ascii encoded, for reading.
  //   If the file doesn't have a header, then 'type' must be specified,
  //   otherwise, if 'type' is non-zero it must match the header type.
  //   All header information (if present) is read.
  // 'schema' is also optional.  If it is NULL then the file must contain its own schema.  
  //   If 'schema' is present then it must support 'type', and if the file contains its 
  //   own schema, then that must be a subset of the one for this type in 'schema'.
  // If nthreads > 1 then nthreads OneFiles are generated as an array and the pointer
  //   to the first, called the master, is returned.  The other nthreads-1 files are
  //   called slaves.  The package routines are aware of when a OneFile argument is a
  //   slave or master in a parallel group.  The master receives provenance, counts, etc.
  //   The slaves only read data and have the virtue of sharing indices and codecs with
  //   the master if relevant.
  // NOTE(review): the result when the file cannot be opened is not stated here -
  //   presumably NULL; confirm.

bool oneFileCheckSchema (OneFile *vf, char *textSchema) ;

  // Checks if file schema is consistent with text schema.  Mismatches are reported to stderr.
  // Filetype and all linetypes in text must match.  File schema can contain additional linetypes.
  // A false result signals a mismatch, as in the usage example:
  // e.g. if (! oneFileCheckSchema (vf, "P 3 seq\nD S 1 3 DNA\nD Q 1 6 STRING\nD P 0\n")) die () ;
  // This is provided to enable a program to ensure that its assumptions about data layout
  // are satisfied.

char oneReadLine (OneFile *vf) ;

  // Read the next ONE formatted line returning the line type of the line, or 0
  //   if at the end of the data section.  The content macros immediately below are
  //   used to access the information of the line most recently read: its fields are
  //   held in vf->field, and its list (if any) in the buffer of that line type.

// Helpers behind the list access macros below (oneString, oneIntList, oneDNA2bit, ...);
//   they return the list of the current line as an untyped pointer, which the macros
//   cast to the element type.  User code would normally go through the macros.
void   *_oneList (OneFile *vf) ;                // lazy codec decompression if required
void   *_oneCompressedList (OneFile *vf) ;      // lazy codec compression if required

// Accessors for the current line of a OneFile 'vf'.
//   oneInt / oneReal / oneChar  - field x of the line; these expand to lvalues, so they
//                                 can also be assigned to when preparing a line for writing.
//   _LF                         - internal: index of the list field of the current line type.
//   oneLen                      - list length, i.e. the low 56 bits of the list field's len.
//   oneString ... oneRealList   - the list of the line from _oneList(), cast to its element type.
//   oneDNA2bit                  - the list from _oneCompressedList(), as 2-bit packed bytes.
//   oneNextString               - the string that follows s, i.e. just past its terminating NUL
//                                 (the vf argument is not used); requires strlen() from <string.h>.
// Every expansion is wrapped in parentheses, and the s argument of oneNextString is
//   parenthesized, so the macros keep their meaning inside larger expressions such as
//   oneString(vf)[0] or oneNextString(vf, cond ? a : b).
#define oneInt(vf,x)        ((vf)->field[x].i)
#define oneReal(vf,x)       ((vf)->field[x].r)
#define oneChar(vf,x)       ((vf)->field[x].c)
#define _LF(vf)             ((vf)->info[(int)(vf)->lineType]->listField)
#define oneLen(vf)          ((vf)->field[_LF(vf)].len & 0xffffffffffffffll)
#define oneString(vf)       ((char *) _oneList(vf))
#define oneDNAchar(vf)      ((char *) _oneList(vf))
#define oneDNA2bit(vf)      ((U8 *) _oneCompressedList(vf))
#define oneIntList(vf)      ((I64 *) _oneList(vf))
#define oneRealList(vf)     ((double *) _oneList(vf))
#define oneNextString(vf,s) ((s) + strlen(s) + 1)

  // Access field information.  The index x of a list object is not required as there is
  //   only one list per line, stored in ->buffer.
  //   A "string list" is implicitly supported, get the first string with oneString, and
  //   subsequent strings sequentially with oneNextString, e.g.:
  //
  //       char *s = oneString(vf);
  //       for (i = 0; i < oneLen(vf); i++)
  //         { // do something with i'th string
  //           s = oneNextString(vf,s);
  //         }

char *oneReadComment (OneFile *vf);

  // Can be called after oneReadLine() to read any optional comment text after the fixed fields.
  // Returns NULL if there is no comment.
  // NOTE(review): ownership and lifetime of the returned string are not stated here -
  //   confirm in ONElib.c before keeping the pointer or freeing it.

//  WRITING ONE FILES:

OneFile *oneFileOpenWriteNew (const char *path, OneSchema *schema, char *type,
			      bool isBinary, int nthreads);
OneFile *oneFileOpenWriteFrom (const char *path, OneFile *vfIn,
			       bool isBinary, int nthreads);

  // Create a new oneFile that will be written to 'path'.  For the 'New' variant supply
  //   the file type, subtype (if non-zero), and whether it should be binary or ASCII.
  //   For the 'From' variant, specify binary or ASCII, schema and all other header 
  //   information is inherited from 'vfIn', where the count stats are from vfIn's 
  //   accumulation (assumes vfIn has been fully read or written) if 'useAccum' is true, 
  //   and from vfIn's header otherwise.
  // NOTE(review): neither prototype above has a subtype or a 'useAccum' parameter, so
  //   those two remarks appear to describe an earlier signature - confirm the current
  //   behaviour in ONElib.c.
  // If nthreads > 1 then nthreads OneFiles are generated as an array and the pointer
  //   to the first, called the master, is returned.  The other nthreads-1 files are
  //   called slaves.  The package routines are aware of when a OneFile argument is a
  //   slave or master in a parallel group.  The slaves are expected to only write data
  //   lines, with the master adding provenance, producing the header, and then some
  //   segment of the initial data lines.  Upon close the final result is effectively
  //   the concatenation of the master, followed by the output of each slave in sequence.

bool oneInheritProvenance (OneFile *vf, OneFile *source);
bool oneInheritReference  (OneFile *vf, OneFile *source);
bool oneInheritDeferred   (OneFile *vf, OneFile *source);

  // Add all provenance/reference/deferred entries in source to header of vf.  Must be
  //   called before first call to oneWriteLine.
  // NOTE(review): the meaning of the bool result is not stated here - presumably true
  //   on success; confirm.

bool oneAddProvenance (OneFile *vf, char *prog, char *version, char *format, ...);
bool oneAddReference  (OneFile *vf, char *filename, I64 count);
bool oneAddDeferred   (OneFile *vf, char *filename);

  // Append provenance/reference/deferred to header information.  Must be called before
  //   first call to oneWriteLine.
  // oneAddProvenance takes a format string followed by a variable argument list
  //   (handled with <stdarg.h>); NOTE(review): presumably printf-style, producing the
  //   command text of the provenance entry - confirm.
  // NOTE(review): the meaning of the bool result is not stated here - presumably true
  //   on success; confirm.

  // For ASCII output, if you want the header to contain count information then you must
  //   create and fill the relevant OneCounts objects before the first call to oneWriteLine.
  //   For BINARY output, the OneCounts information is accumulated and written automatically.

void oneWriteLine (OneFile *vf, char lineType, I64 listLen, void *listBuf);

  // Set up a line for output just as it would be returned by oneReadLine and then call
  //   this routine to output the line (ASCII or binary).
  // 'lineType' is the line character of the line to write.
  // Use the macros above on the l.h.s. of assignments to fill fields (e.g. oneInt(vf,2) = 3).
  // For lists, give the length in the listLen argument, and either place the list data in your
  //   own buffer and give it as listBuf, or put in the line's buffer and set listBuf == NULL.

void oneWriteLineFrom (OneFile *vf, OneFile *source) ; // copies a line from source into vf
void oneWriteLineDNA2bit (OneFile *vf, char lineType, I64 listLen, U8 *dnaBuf);

// Minor variants of oneWriteLine().
// Use oneWriteLineDNA2bit for DNA lists if your DNA is already 2-bit compressed;
//   'dnaBuf' then holds the packed sequence.
// NOTE(review): whether 'listLen' counts bases or packed bytes is not stated here - confirm.

void oneWriteComment (OneFile *vf, char *format, ...); // can not include newline \n chars

  // Adds a comment to the current line. Extends line in ascii, adds special line type in binary.
  // The comment text is given as a format string plus a variable argument list (handled
  //   with <stdarg.h>); NOTE(review): presumably printf-style - confirm.

// CLOSING FILES (FOR BOTH READ & WRITE)

void oneFileClose (OneFile *vf);

  // Close vf (opened either for reading or writing). Finalizes counts, merges threaded files,
  // and writes footer if binary. Frees all non-user memory associated with vf.

//  GOTO & BUFFER MANAGEMENT

void oneUserBuffer (OneFile *vf, char lineType, void *buffer);

  // A buffer is used to capture the list element of each line type that has one.
  //   This routine allows you to reassign the buffer of line type 'lineType' to one
  //   you've allocated, or to revert to a default system buffer if 'buffer' = NULL.
  //   The previous buffer (if any) is freed.  The user must ensure that a buffer they
  //   supply is large enough.  Note that this buffer is overwritten with each new line
  //   read of the given type.

bool oneGotoObject (OneFile *vf, I64 i);

  // Goto i'th object in the file. This only works on binary files, which have an index.
  // NOTE(review): the bool result presumably reports whether the move succeeded - confirm.

I64  oneGotoGroup  (OneFile *vf, I64 i);

  // Goto the first object in group i. Return the size (in objects) of the group, or 0
  //   if an error (i out of range or vf has no group type). Only works for binary files.

/***********************************************************************************
 *
 *    A BIT ABOUT THE FORMAT OF BINARY FILES
 *
 **********************************************************************************/

 //   <bin file> <- <ASCII Prolog> <$-line> <binary data> <footer> <^-line> <footer-size:int64>
 //
 // '$'-line flags file is binary and gives endian
 // The data block ends with a blank line consisting of '\n'
 //
 // EWM: Removed '-' line, simply write off_t to footer start
 //
 //   <ASCII Prolog> <- <'1'-line> [<'2'-line>] ( <'!'-line> | <'<'-line> | <'>'-line> )*
 //
 // The ASCII prolog contains the type, subtype, provenance, reference, and deferred lines
 //   in the ASCII format.  The ONE count statistic lines for each data line type are found
 //   in the footer along with binary ';' and ':' lines that encode their compressors as
 //   needed.  The footer also contains binary '&' and '*' lines that encode the object index
 //   and group indices, respectively.
 //
 //   <Binary line> <- <Binary line code + tags> <fields> [<list data>]
 //
 // Line codes are >= 128 for binary encoded lines.  The two low-order bits of these are flags,
 //   so each binary-encoded line type has 4 codes and a table maps these to the ASCII code.
 //   Bit 0 indicates if the fields of the line type are compressed, and Bit 1 indicates if
 //   the list data (if present) is compressed.
 //
 // If a field is a list, then the field array element for that field is the list's length
 //   where the low 56 bits encode length, and the high 8 bits encode the # of high-order
 //   0-bytes in every list element if an INT_LIST (0 otherwise).

#endif  // ONE_DEFINED

/******************* end of file **************/