/******************************************************************************************
 *
 *  File: ONElib.h
 *    Header for ONE file reading and writing
 *
 *  Authors: Richard Durbin (rd109@cam.ac.uk), Gene Myers (myers@mpi-cbg.de)
 *  Copyright (C) Richard Durbin, Gene Myers, 2019-
 *
 * HISTORY:
 * Last edited: Dec  3 06:08 2022 (rd109)
 * * Dec  3 06:01 2022 (rd109): remove oneWriteHeader(), switch to stdarg for oneWriteComment etc.
 * * Dec 27 09:46 2019 (gene): style edits
 * * Created: Sat Feb 23 10:12:43 2019 (rd109)
 *
 *****************************************************************************************/

#ifndef ONE_DEFINED
#define ONE_DEFINED

#include <stdio.h>    // for FILE etc.
#include <stdarg.h>   // for formatted writing in oneWriteComment(), oneAddProvenance()
#include <inttypes.h> // for standard size int types and their PRI print macros
#include <stdbool.h>  // for standard bool types
#include <limits.h>   // for INT_MAX etc.
#include <string.h>   // for strlen(), used by the oneNextString() macro
#include <pthread.h>  // for pthread_mutex_t (OneFile.fieldLock, OneFile.listLock)

/***********************************************************************************
 *
 *    DATA TYPES
 *
 **********************************************************************************/

// Basic Types

#ifndef U8_DEFINED
#define U8_DEFINED

typedef int8_t        I8;
typedef int16_t       I16;
typedef int32_t       I32;
typedef int64_t       I64;
typedef unsigned char U8;

#endif // U8_DEFINED

// Field types that can appear on a line.  Numbering starts at 1, so 0 is never a valid type.

typedef enum {
  oneINT = 1,
  oneREAL,
  oneCHAR,
  oneSTRING,
  oneINT_LIST,
  oneREAL_LIST,
  oneSTRING_LIST,
  oneDNA
} OneType;

extern char *oneTypeString[];
// = { 0, "INT", "REAL", "CHAR", "STRING", "INT_LIST", "REAL_LIST", "STRING_LIST", "DNA" } ;

// Value of a single field of the current line.

typedef union {
  I64    i;
  double r;
  char   c;
  I64    len;  // For lists : top 8 bits encode excess bytes, low 56 length
} OneField;

typedef struct {
  char *program;
  char *version;
  char *command;
  char *date;
} OneProvenance;

typedef struct {
  char *filename;
  I64   count;
} OneReference;

typedef struct {
  I64 count;
  I64 max;
  I64 total;
  I64 groupCount;
  I64 groupTotal;
} OneCounts;

// OneCodecs are a private package for binary one file compression

typedef void OneCodec;  // forward declaration of opaque type for compression codecs

// DNAcodec is a special pre-existing compressor one should use for DNA.
// It compresses every base to 2-bits, where any non-ACGT letter is
// effectively converted to an A.  Compression is case insensitive,
// but decompression always delivers lower-case.

extern OneCodec *DNAcodec;

// Record for a particular line type.  There is at most one list element.

typedef struct {
  OneCounts accum;           // counts read or written to this moment
  OneCounts given;           // counts read from header
  I64       gCount;          // used internally to calculate groupCount and groupTotal
  I64       gTotal;
  I64       oCount;          // # of objects in prefix before first group (if any)
  I64       oTotal;          // + of objects in prefix (these 2 are for thread parallel apps)

  int       nField;          // number of fields
  OneType  *fieldType;       // type of each field
  int       listEltSize;     // size of list field elements (if present, else 0)
  int       listField;       // field index of list
  char     *comment;         // the comment on the definition line in the schema

  bool      isUserBuf;       // flag for whether buffer is owned by user
  I64       bufSize;         // system buffer and size if not user supplied
  void     *buffer;

  OneCodec *listCodec;       // compression codec and flags
  bool      isUseListCodec;  // on once enough data collected to train associated codec

  char      binaryTypePack;  // binary code for line type, bit 8 set.
                             //     bit 0: list compressed
  I64       listTack;        // accumulated training data for this thread's codeCodec (master)
} OneInfo;

// the schema type - the first record is the header spec, then a linked list of primary classes

typedef struct OneSchema {
  char             *primary;
  int               nSecondary;
  char            **secondary;
  OneInfo          *info[128];
  int               nFieldMax;
  char              objectType;
  char              groupType;
  struct OneSchema *nxt;
} OneSchema;

typedef struct OneHeaderText {
  char                 *text;
  struct OneHeaderText *nxt;
} OneHeaderText;

// The main OneFile type - this is the primary handle used by the end user

typedef struct {
  // this field may be set by the user

  bool           isCheckString;      // set if want to validate string char by char

  // these fields may be read by user - but don't change them!

  char          *fileType;
  char          *subType;
  char           lineType;           // current lineType
  char           objectType;         // line designation character for primary objects
  char           groupType;          // line designation character for groups (optional)
  I64            line;               // current line number
  I64            byte;               // current byte position when writing binary
  I64            object;             // current object - incremented when object line read
  I64            group;              // current group - incremented when group line read
  OneProvenance *provenance;         // if non-zero then count['!'] entries
  OneReference  *reference;          // if non-zero then count['<'] entries
  OneReference  *deferred;           // if non-zero then count['>'] entries
  OneField      *field;              // used to hold the current line - accessed by macros
  OneInfo       *info[128];          // all the per-linetype information
  I64            codecTrainingSize;  // amount of data to see before building codec

  // fields below here are private to the package

  FILE *f;

  bool  isWrite;                 // true if open for writing
  bool  isHeaderOut;             // true if header already written
  bool  isBinary;                // true if writing a binary file
  bool  inGroup;                 // set once inside a group
  bool  isLastLineBinary;        // needed to deal with newlines on ascii files
  bool  isIndexIn;               // index read in
  bool  isBig;                   // are we on a big-endian machine?
  bool  isNoAsciiHeader;         // backdoor for ONEview to avoid writing header in ascii

  char  lineBuf[128];            // working buffers
  char  numberBuf[32];
  int   nFieldMax;
  I64   codecBufSize;
  char *codecBuf;
  I64   nBits;                   // number of bits of list currently in codecBuf
  I64   intListBytes;            // number of bytes per integer in the compacted INT_LIST
  I64   linePos;                 // current line position

  OneHeaderText *headerText;     // arbitrary descriptive text that goes with the header

  char  binaryTypeUnpack[256];   // invert binary line code to ASCII line character.
  int   share;                   // index if slave of threaded write, +nthreads > 0 if master
  int   isFinal;                 // oneFinalizeCounts has been called on file
  pthread_mutex_t fieldLock;     // Mutexes to protect training accumulation stats when threaded
  pthread_mutex_t listLock;
} OneFile;                       //   the footer will be in the concatenated result.

/***********************************************************************************
 *
 *    ROUTINES FOR READING & WRITING ONE FILES IN BOTH ASCII & BINARY (TRANSPARENTLY)
 *
 **********************************************************************************/

//  CREATING AND DESTROYING SCHEMAS

OneSchema *oneSchemaCreateFromFile (char *path);
OneSchema *oneSchemaCreateFromText (char *text);

  // These functions create a schema handle that can be used to open One-code data files
  //   for reading and writing.  A schema file is itself a One-code file, consisting of
  //   a set of objects, one per primary file type.  Valid lines in this file are:
  //      P <primary file type>   // a short string
  //      S <secondary file type> // a short string - any number of these
  //      O <char> <field_list>   // definition of object type
  //      G <char> <field_list>   // definition of group type - first field must be an int
  //      D <char> <field_list>   // definition of line
  //   <char> must be a lower or upper case letter.
  //   <field_list> is a list of field types from:
  //      CHAR, INT, REAL, STRING, INT_LIST, REAL_LIST, STRING_LIST, DNA
  //   Only one list type (STRING, *_LIST or DNA) is allowed per line type.
  //   All the D lines following an O line apply to that object type.
  //   By convention comments on each line explain the definition.
  //   Example, with lists and strings preceded by their length in OneCode style
  //      P 3 seq                            this is a sequence file
  //      O S 1 3 DNA                        the DNA sequence - each S line starts an object
  //      D Q 1 6 STRING                     the phred encoded quality score + ASCII 33
  //      D N 4 4 REAL 4 REAL 4 REAL 4 REAL  signal to noise ratio in A, C, G, T channels
  //      G g 2 3 INT 6 STRING               group designator: number of objects, name
  // The ...FromText() alternative writes the text to a temp file and reads it with
  //   oneSchemaCreateFromFile(). This allows code to set the schema.
  // Internally a schema is a linked list of OneSchema objects, with the first holding
  //   the (hard-coded) schema for the header and footer, and the remainder each
  //   corresponding to one primary file type.

void oneSchemaDestroy (OneSchema *schema);

//  READING ONE FILES:

OneFile *oneFileOpenRead (const char *path, OneSchema *schema, char *type, int nthreads);

  // Open ONE file 'path', either binary or ascii encoded, for reading.
  //   If the file doesn't have a header, then 'type' must be specified,
  //   otherwise, if 'type' is non-zero it must match the header type.
  //   All header information (if present) is read.
  // 'schema' is also optional.  If it is NULL then the file must contain its own schema.
  //   If 'schema' is present then it must support 'type', and if the file contains its
  //   own schema, then that must be a subset of the one for this type in 'schema'.
  // If nthreads > 1 then nthreads OneFiles are generated as an array and the pointer
  //   to the first, called the master, is returned.  The other nthreads-1 files are
  //   called slaves.  The package routines are aware of when a OneFile argument is a
  //   slave or master in a parallel group.  The master receives provenance, counts, etc.
  //   The slaves only read data and have the virtue of sharing indices and codecs with
  //   the master if relevant.

bool oneFileCheckSchema (OneFile *vf, char *textSchema);

  // Checks if file schema is consistent with text schema.  Mismatches are reported to stderr.
  // Filetype and all linetypes in text must match.  File schema can contain additional linetypes.
  // e.g. if (! oneFileCheckSchema (vf, "P 3 seq\nD S 1 3 DNA\nD Q 1 6 STRING\nD P 0\n")) die () ;
  // This is provided to enable a program to ensure that its assumptions about data layout
  // are satisfied.

char oneReadLine (OneFile *vf);

  // Read the next ONE formatted line returning the line type of the line, or 0
  //   if at the end of the data section.  The content macros immediately below are
  //   used to access the information of the line most recently read.

void *_oneList (OneFile *vf);            // lazy codec decompression if required
void *_oneCompressedList (OneFile *vf);  // lazy codec compression if required

  // Every expansion below is fully parenthesized so that it composes with postfix and
  //   binary operators at the call site, e.g. oneString(vf)[0] or oneIntList(vf)[i].
  //   oneInt/oneReal/oneChar remain lvalues, so they can be assigned to before writing a line.
  // oneNextString ignores its 'vf' argument and evaluates 's' twice, so do not pass an
  //   expression with side effects as 's'.

#define oneInt(vf,x)        ((vf)->field[x].i)
#define oneReal(vf,x)       ((vf)->field[x].r)
#define oneChar(vf,x)       ((vf)->field[x].c)
#define _LF(vf)             ((vf)->info[(int)(vf)->lineType]->listField)
#define oneLen(vf)          ((vf)->field[_LF(vf)].len & 0xffffffffffffffll)
#define oneString(vf)       ((char *) _oneList(vf))
#define oneDNAchar(vf)      ((char *) _oneList(vf))
#define oneDNA2bit(vf)      ((U8 *) _oneCompressedList(vf))
#define oneIntList(vf)      ((I64 *) _oneList(vf))
#define oneRealList(vf)     ((double *) _oneList(vf))
#define oneNextString(vf,s) ((s) + strlen(s) + 1)

  // Access field information.  The index x of a list object is not required as there is
  //   only one list per line, stored in ->buffer.
  //   A "string list" is implicitly supported, get the first string with oneString, and
  //   subsequent strings sequentially with oneNextString, e.g.:
  //
  //       char *s = oneString(vf);
  //       for (i = 0; i < oneLen(vf); i++)
  //         { // do something with i'th string
  //           s = oneNextString(vf,s);
  //         }

char *oneReadComment (OneFile *vf);

  // Can be called after oneReadLine() to read any optional comment text after the fixed fields.
  // Returns NULL if there is no comment.

//  WRITING ONE FILES:

OneFile *oneFileOpenWriteNew (const char *path, OneSchema *schema, char *type,
                              bool isBinary, int nthreads);
OneFile *oneFileOpenWriteFrom (const char *path, OneFile *vfIn,
                               bool isBinary, int nthreads);

  // Create a new oneFile that will be written to 'path'.  For the 'New' variant supply
  //   the file type, subtype (if non-zero), and whether it should be binary or ASCII.
  //   For the 'From' variant, specify binary or ASCII, schema and all other header
  //   information is inherited from 'vfIn', where the count stats are from vfIn's
  //   accumulation (assumes vfIn has been fully read or written) if 'useAccum' is true,
  //   and from vfIn's header otherwise.
  //   NOTE(review): neither prototype above takes a 'useAccum' argument, so the sentence
  //   about it looks stale - confirm against the implementation.
  // If nthreads > 1 then nthreads OneFiles are generated as an array and the pointer
  //   to the first, called the master, is returned.  The other nthreads-1 files are
  //   called slaves.  The package routines are aware of when a OneFile argument is a
  //   slave or master in a parallel group.  The slaves are expected to only write data
  //   lines, with the master adding provenance, producing the header, and then some
  //   segment of the initial data lines.  Upon close the final result is effectively
  //   the concatenation of the master, followed by the output of each slave in sequence.

bool oneInheritProvenance (OneFile *vf, OneFile *source);
bool oneInheritReference  (OneFile *vf, OneFile *source);
bool oneInheritDeferred   (OneFile *vf, OneFile *source);

  // Add all provenance/reference/deferred entries in source to header of vf.  Must be
  //   called before first call to oneWriteLine.

bool oneAddProvenance (OneFile *vf, char *prog, char *version, char *format, ...);
bool oneAddReference  (OneFile *vf, char *filename, I64 count);
bool oneAddDeferred   (OneFile *vf, char *filename);

  // Append provenance/reference/deferred to header information.  Must be called before
  //   first call to oneWriteLine.

  // For ASCII output, if you want the header to contain count information then you must
  //   create and fill the relevant OneCounts objects before the first call to oneWriteLine.
  //   For BINARY output, the OneCounts information is accumulated and written automatically.

void oneWriteLine (OneFile *vf, char lineType, I64 listLen, void *listBuf);

  // Set up a line for output just as it would be returned by oneReadLine and then call
  //   this routine to output the line (ASCII or binary).
  // Use the macros above on the l.h.s. of assignments to fill fields (e.g. oneInt(vf,2) = 3).
  // For lists, give the length in the listLen argument, and either place the list data in your
  //   own buffer and give it as listBuf, or put in the line's buffer and set listBuf == NULL.

void oneWriteLineFrom (OneFile *vf, OneFile *source);  // copies a line from source into vf
void oneWriteLineDNA2bit (OneFile *vf, char lineType, I64 listLen, U8 *dnaBuf);

  // Minor variants of oneWriteLine().
  // Use oneWriteLineDNA2bit for DNA lists if your DNA is already 2-bit compressed.

void oneWriteComment (OneFile *vf, char *format, ...);  // cannot include newline \n chars

  // Adds a comment to the current line. Extends line in ascii, adds special line type in binary.

//  CLOSING FILES (FOR BOTH READ & WRITE)

void oneFileClose (OneFile *vf);

  // Close vf (opened either for reading or writing). Finalizes counts, merges threaded files,
  //   and writes footer if binary. Frees all non-user memory associated with vf.

//  GOTO & BUFFER MANAGEMENT

void oneUserBuffer (OneFile *vf, char lineType, void *buffer);

  // A buffer is used to capture the list element of each line type that has one.
  //   This routine allows you to reassign the buffer to one you've allocated, or
  //   to revert to a default system buffer if 'buffer' = NULL.  The previous buffer
  //   (if any) is freed.  The user must ensure that a buffer they supply is large
  //   enough.  BTW, this buffer is overwritten with each new line read of the given type.

bool oneGotoObject (OneFile *vf, I64 i);

  // Goto i'th object in the file. This only works on binary files, which have an index.

I64 oneGotoGroup (OneFile *vf, I64 i);

  // Goto the first object in group i. Return the size (in objects) of the group, or 0
  //   if an error (i out of range or vf has no group type). Only works for binary files.

/***********************************************************************************
 *
 *    A BIT ABOUT THE FORMAT OF BINARY FILES
 *
 **********************************************************************************/

// <- <$-line>