summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Tille <tille@debian.org>2023-02-06 11:01:05 +0100
committerAndreas Tille <tille@debian.org>2023-02-06 11:01:05 +0100
commite8be2f3779f775961835ee94fbc656318cce9603 (patch)
treefc788d379f6f2037f176cca5f035313ec9bf478c
parentd9021b19ee773f17ce95f2893bbb9f03139bb91a (diff)
parent48aaed49b80fe221fb7ce4f21f67919f1ee12cc1 (diff)
Record daligner (1.0+git20221215.bd26967-1) in archive suite sid
-rw-r--r--LA2ONE.c (renamed from LAdump.c)370
-rw-r--r--LAa2b.c104
-rw-r--r--LAb2a.c107
-rw-r--r--Makefile16
-rw-r--r--ONE2LA.c275
-rw-r--r--ONElib.c3924
-rw-r--r--ONElib.h410
-rw-r--r--README.md115
-rw-r--r--debian/changelog7
-rw-r--r--debian/control2
-rw-r--r--debian/patches/cflags.patch2
-rw-r--r--debian/patches/cppflags.patch26
-rw-r--r--debian/patches/cross.patch22
-rw-r--r--debian/patches/destdir-install.patch2
-rw-r--r--debian/patches/lddflags.patch22
-rw-r--r--dumpLA.c177
16 files changed, 4942 insertions, 639 deletions
diff --git a/LAdump.c b/LA2ONE.c
index 1cccf76..a69db40 100644
--- a/LAdump.c
+++ b/LA2ONE.c
@@ -20,9 +20,114 @@
#include "DB.h"
#include "align.h"
+#include "ONElib.h"
static char *Usage =
- "[-cdtlo] <src1:db|dam> [<src2:db|dam>] <align:las> [<reads:FILE> | <reads:range> ...]";
+ "[-cto] <src1:db|dam> [<src2:db|dam>] <align:las> [<reads:FILE> | <reads:range> ...]";
+
+static char *One_Schema =
+ "P 3 dal This is a 1-code las file from daligner\n"
+ "D X 1 3 INT Data prolog: trace spacing\n"
+ "O P 2 3 INT 8 INT_LIST A-read and B-read list\n"
+ ". All per B-read/l.a.s:\n"
+ "D O 1 6 STRING Orientation [+-]\n"
+ "D C 1 6 STRING Chain directive [>+-.]\n"
+ "D A 1 8 INT_LIST A-read alignment intervals: (ab,ae)\n"
+ "D B 1 8 INT_LIST B-read alignment intervals: (bb,be)\n"
+ "D D 1 8 INT_LIST Diffs in alignment\n"
+ "D L 2 3 INT 8 INT_LIST A-read length and then each B-read length\n"
+ ". One line per B-read:\n"
+ "D T 1 8 INT_LIST Trace segment length\n"
+ "D Q 1 8 INT_LIST Trace segment diffs\n";
+
+static Overlap *ovls;
+static uint16 *trace;
+static int64 *list;
+static char *string;
+static OneFile *file1;
+static DAZZ_READ *read1, *read2;
+
+static int OVERLAP;
+static int DOCOORDS;
+static int DOTRACE;
+
+static void output_pile(Overlap *optr)
+{ int i, k;
+ Overlap *o;
+
+ i = 0;
+ for (o = ovls; o < optr; o++)
+ list[i++] = o->bread+1;
+ oneInt(file1,0) = ovls->aread+1;
+ oneWriteLine(file1,'P',i,list);
+
+ i = 0;
+ for (o = ovls; o < optr; o++)
+ if (COMP(o->flags))
+ string[i++] = 'c';
+ else
+ string[i++] = 'n';
+ oneWriteLine(file1,'O',i,string);
+
+ i = 0;
+ for (o = ovls; o < optr; o++)
+ if (CHAIN_NEXT(o->flags))
+ string[i++] = '-';
+ else if (BEST_CHAIN(o->flags))
+ string[i++] = '>';
+ else if (CHAIN_START(o->flags))
+ string[i++] = '+';
+ else
+ string[i++] = '.';
+ oneWriteLine(file1,'C',i,string);
+
+ if (DOCOORDS)
+ { i = 0;
+ for (o = ovls; o < optr; o++)
+ { list[i++] = o->path.abpos;
+ list[i++] = o->path.aepos;
+ }
+ oneWriteLine(file1,'A',i,list);
+
+ i = 0;
+ for (o = ovls; o < optr; o++)
+ { list[i++] = o->path.bbpos;
+ list[i++] = o->path.bepos;
+ }
+ oneWriteLine(file1,'B',i,list);
+
+ oneInt(file1,0) = read1[ovls->aread].rlen;
+ i = 0;
+ for (o = ovls; o < optr; o++)
+ list[i++] = read2[o->bread].rlen;
+ oneWriteLine(file1,'L',i,list);
+
+ i = 0;
+ for (o = ovls; o < optr; o++)
+ list[i++] = o->path.diffs;
+ oneWriteLine(file1,'D',i,list);
+ }
+
+ if (DOTRACE)
+ { uint16 *trace;
+ int tlen;
+
+ for (o = ovls; o < optr; o++)
+ { trace = (uint16 *) o->path.trace;
+ tlen = o->path.tlen;
+
+ i = 0;
+ for (k = 1; k < tlen; k += 2)
+ list[i++] = trace[k];
+ oneWriteLine(file1,'T',i,list);
+
+ i = 0;
+ for (k = 0; k < tlen; k += 2)
+ list[i++] = trace[k];
+ oneWriteLine(file1,'Q',i,list);
+ }
+ }
+}
static int ORDER(const void *l, const void *r)
{ int x = *((int *) l);
@@ -33,32 +138,50 @@ static int ORDER(const void *l, const void *r)
int main(int argc, char *argv[])
{ DAZZ_DB _db1, *db1 = &_db1;
DAZZ_DB _db2, *db2 = &_db2;
- Overlap _ovl, *ovl = &_ovl;
+ OneSchema *schema;
+ char *command;
FILE *input;
- int64 novl;
- int tspace, tbytes, small;
- int trmax;
+ int64 novl, omax, tmax;
+ int tspace, tbytes;
int reps, *pts;
int input_pts;
- int OVERLAP;
- int DOCOORDS, DODIFFS, DOTRACE, DOLENS;
int ISTWO;
- // Process options
+ // Process options and capture command line for provenance
{ int i, j, k;
int flags[128];
- ARG_INIT("LAdump")
+ ARG_INIT("LA2ONE")
+
+ { int n, t;
+ char *c;
+
+ n = 0;
+ for (t = 1; t < argc; t++)
+ n += strlen(argv[t])+1;
+
+ command = Malloc(n+1,"Allocating command string");
+ if (command == NULL)
+ exit (1);
+
+ c = command;
+ if (argc >= 1)
+ { c += sprintf(c,"%s",argv[1]);
+ for (t = 2; t < argc; t++)
+ c += sprintf(c," %s",argv[t]);
+ }
+ *c = '\0';
+ }
j = 1;
for (i = 1; i < argc; i++)
if (argv[i][0] == '-')
switch (argv[i][1])
{ default:
- ARG_FLAGS("ocdtl")
+ ARG_FLAGS("cto")
break;
}
else
@@ -67,9 +190,7 @@ int main(int argc, char *argv[])
OVERLAP = flags['o'];
DOCOORDS = flags['c'];
- DODIFFS = flags['d'];
DOTRACE = flags['t'];
- DOLENS = flags['l'];
if (DOTRACE)
DOCOORDS = 1;
@@ -77,22 +198,12 @@ int main(int argc, char *argv[])
if (argc <= 2)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
fprintf(stderr,"\n");
- fprintf(stderr," P #a #b #o #c -");
- fprintf(stderr," (#a,#b^#o) have an LA between them where #o is 'n' or 'c' and\n");
- fprintf(stderr," ");
- fprintf(stderr," #c is '>' (start of best chain), '+' (start of alternate chain),\n");
- fprintf(stderr," ");
- fprintf(stderr," '-' (continuation of chain), or '.' (no chains in file).\n");
+ fprintf(stderr," Output pile reads, orientation, and chains by default");
+ fprintf(stderr," (P, O, C lines)\n");
fprintf(stderr,"\n");
- fprintf(stderr," -c: C #ab #ae #bb #be - #a[#ab,#ae] aligns with #b^#o[#bb,#be]\n");
- fprintf(stderr," -d: D # - there are # differences in the LA\n");
- fprintf(stderr," -t: T #n -");
- fprintf(stderr," there are #n trace point intervals for the LA\n");
- fprintf(stderr," (#d #y )^#n -");
- fprintf(stderr," there are #d difference aligning the #y bp's of B with the\n");
- fprintf(stderr," next fixed-size interval of A\n");
- fprintf(stderr," -l: L #la #lb -");
- fprintf(stderr," #la is the length of the a-read and #lb that of the b-read\n");
+ fprintf(stderr," -c: Ootput also aligned intervals, read lengths, and diffs");
+ fprintf(stderr," (B, E, L, and D lines)\n");
+ fprintf(stderr," -t: Output also traces (T and Q lines)\n");
fprintf(stderr,"\n");
fprintf(stderr," -o: Output proper overlaps only\n");
@@ -283,23 +394,25 @@ int main(int argc, char *argv[])
SYSTEM_READ_ERROR
if (tspace <= TRACE_XOVR && tspace != 0)
- { small = 1;
- tbytes = sizeof(uint8);
- }
+ tbytes = sizeof(uint8);
else
- { small = 0;
- tbytes = sizeof(uint16);
- }
+ tbytes = sizeof(uint16);
free(pwd);
free(root);
}
- // Scan to count sizes of things
+ schema = oneSchemaCreateFromText(One_Schema);
+ file1 = oneFileOpenWriteNew("-",schema,"dal",true,1);
+ oneAddProvenance(file1,Prog_Name,"1.0","%s >?.dal",command);
+
+ // Scan to determine max trace length and max pile size
- { int j, al, tlen;
- int in, npt, idx, ar;
- int64 novls, odeg, omax, sdeg, smax, tmax, ttot;
+ { int in, npt, idx;
+ int j, ar, al;
+ int tlen;
+ int64 odeg;
+ Overlap _ovl, *ovl = &_ovl;
in = 0;
npt = pts[0];
@@ -307,9 +420,8 @@ int main(int argc, char *argv[])
// For each record do
- trmax = 0;
- novls = omax = smax = ttot = tmax = 0;
- sdeg = odeg = 0;
+ omax = tmax = 0;
+ odeg = 0;
al = 0;
for (j = 0; j < novl; j++)
@@ -319,8 +431,6 @@ int main(int argc, char *argv[])
{ Read_Overlap(input,ovl);
tlen = ovl->path.tlen;
fseeko(input,tlen*tbytes,SEEK_CUR);
- if (tlen > trmax)
- trmax = tlen;
// Determine if it should be displayed
@@ -359,82 +469,107 @@ int main(int argc, char *argv[])
}
if (ar != al)
- { if (sdeg > smax)
- smax = sdeg;
- if (odeg > omax)
+ { if (odeg > omax)
omax = odeg;
- sdeg = odeg = 0;
al = ar;
}
-
- novls += 1;
- odeg += 1;
- sdeg += tlen;
- ttot += tlen;
+ odeg += 1;
if (tlen > tmax)
tmax = tlen;
}
-
- if (sdeg > smax)
- smax = sdeg;
if (odeg > omax)
omax = odeg;
-
- printf("+ P %lld\n",novls);
- printf("%% P %lld\n",omax);
- if (DOTRACE)
- { printf("+ T %lld\n",ttot);
- printf("%% T %lld\n",smax);
- printf("@ T %lld\n",tmax);
- printf("X %d\n",tspace);
- }
}
// Read the file and display selected records
- { int j, k;
- uint16 *trace;
- int in, npt, idx, ar;
- DAZZ_READ *read1, *read2;
+ { int j;
+ Overlap *optr;
+ uint16 *tptr;
+ int in, npt, idx, ar, last;
rewind(input);
fread(&novl,sizeof(int64),1,input);
fread(&tspace,sizeof(int),1,input);
- trace = (uint16 *) Malloc(sizeof(uint16)*trmax,"Allocating trace vector");
- if (trace == NULL)
+ ovls = Malloc(sizeof(Overlap)*omax,"Allocating alignment array");
+ trace = Malloc(sizeof(uint16)*omax*tmax,"Allocating trace buffer");
+ string = Malloc(sizeof(int64)*omax,"Allocating 1-string");
+ if (tmax > 2*omax)
+ list = Malloc(sizeof(int64)*tmax,"Allocating 1-list");
+ else
+ list = Malloc(sizeof(int64)*omax*2,"Allocating 1-list");
+ if (ovls == NULL || trace == NULL || string == NULL || list == NULL)
exit (1);
read1 = db1->reads;
read2 = db2->reads;
+ if (DOTRACE)
+ { oneInt(file1,0) = tspace;
+ oneWriteLine(file1,'X',0,NULL);
+ }
+
+ // For each record do
+
in = 0;
npt = pts[0];
idx = 1;
- // For each record do
-
+ optr = ovls;
+ tptr = trace;
+ last = -1;
for (j = 0; j < novl; j++)
// Read it in
- { Read_Overlap(input,ovl);
- ovl->path.trace = (void *) trace;
- Read_Trace(input,ovl,tbytes);
+ { Read_Overlap(input,optr);
+ ar = optr->aread+1;
- // Determine if it should be displayed
-
- ar = ovl->aread+1;
if (in)
- { while (ar > npt)
- { npt = pts[idx++];
- if (ar < npt)
- { in = 0;
- break;
+
+ { if (ar == last)
+ { optr->path.trace = (void *) tptr;
+ Read_Trace(input,optr,tbytes);
+ if (tbytes == 1)
+ Decompress_TraceTo16(optr);
+ tptr += sizeof(uint16)*optr->path.tlen;
+ optr += 1;
+ }
+
+ else
+ { output_pile(optr);
+
+ while (ar > npt)
+ { npt = pts[idx++];
+ if (ar < npt)
+ { in = 0;
+ break;
+ }
+ npt = pts[idx++];
+ }
+
+ if (in)
+ { ovls[0] = *optr++;
+ tptr = trace;
+ optr = ovls;
+ last = ar;
+
+ optr->path.trace = (void *) tptr;
+ Read_Trace(input,optr,tbytes);
+ if (tbytes == 1)
+ Decompress_TraceTo16(optr);
+ tptr += sizeof(uint16)*optr->path.tlen;
+ optr += 1;
+ }
+ else
+ { fseeko(input,optr->path.tlen*tbytes,SEEK_CUR);
+ optr = ovls;
+ tptr = trace;
}
- npt = pts[idx++];
}
}
+
else
{ while (ar >= npt)
{ npt = pts[idx++];
@@ -444,64 +579,39 @@ int main(int argc, char *argv[])
}
npt = pts[idx++];
}
- }
- if (!in)
- continue;
- // If -o check display only overlaps
+ if (in)
+ { last = ar;
- if (OVERLAP)
- { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0)
- continue;
- if (ovl->path.aepos != db1->reads[ovl->aread].rlen &&
- ovl->path.bepos != db2->reads[ovl->bread].rlen)
- continue;
- }
-
- // Display it
-
- printf("P %d %d",ovl->aread+1,ovl->bread+1);
- if (COMP(ovl->flags))
- printf(" c");
- else
- printf(" n");
- if (CHAIN_NEXT(ovl->flags))
- printf(" -");
- else if (BEST_CHAIN(ovl->flags))
- printf(" >");
- else if (CHAIN_START(ovl->flags))
- printf(" +");
- else
- printf(" .");
- printf("\n");
-
- if (DOLENS)
- printf("L %d %d\n",read1[ovl->aread].rlen,read2[ovl->bread].rlen);
-
- if (DOCOORDS)
- printf("C %d %d %d %d\n",ovl->path.abpos,ovl->path.aepos,ovl->path.bbpos,ovl->path.bepos);
-
- if (DODIFFS)
- printf("D %d\n",ovl->path.diffs);
-
- if (DOTRACE)
- { uint16 *trace = (uint16 *) ovl->path.trace;
- int tlen = ovl->path.tlen;
-
- if (small)
- Decompress_TraceTo16(ovl);
- printf("T %d\n",tlen>>1);
- for (k = 0; k < tlen; k += 2)
- printf(" %d %d\n",trace[k],trace[k+1]);
+ optr->path.trace = (void *) tptr;
+ Read_Trace(input,optr,tbytes);
+ if (tbytes == 1)
+ Decompress_TraceTo16(optr);
+ tptr += sizeof(uint16)*optr->path.tlen;
+ optr += 1;
+ }
+ else
+ fseeko(input,optr->path.tlen*tbytes,SEEK_CUR);
}
}
+ if (in)
+ output_pile(optr);
+
+ free(string);
+ free(list);
free(trace);
+ free(ovls);
}
+ oneFileClose(file1);
+ oneSchemaDestroy(schema);
+
Close_DB(db1);
if (ISTWO)
Close_DB(db2);
+ free(command);
+
exit (0);
}
diff --git a/LAa2b.c b/LAa2b.c
deleted file mode 100644
index 5f68780..0000000
--- a/LAa2b.c
+++ /dev/null
@@ -1,104 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "DB.h"
-#include "align.h"
-
-int main(int argc, char *argv[])
-{ char code, which;
- int64 total;
- int aread, bread;
- char orient, chain;
- int alen, blen;
- int ab, ae, bb, be;
- int diffs;
- int len;
- int tspace, small;
- uint8 *tbuffer = NULL;
- uint16 *sbuffer = NULL;
-
- (void) argv;
-
- // Process arguments
-
- if (argc > 1)
- { fprintf(stderr,"Usage: LAa2b <(ascii) >(binary)\n");
- exit (1);
- }
-
- while (scanf(" %c",&code) == 1) // Header lines
- if (code == '@' || code == '+' || code == '%')
- { scanf(" %c %lld",&which,&total);
- fwrite(&code,sizeof(char),1,stdout);
- fwrite(&which,sizeof(char),1,stdout);
- fwrite(&total,sizeof(int64),1,stdout);
- if (code == '@')
- { tbuffer = (uint8 *) malloc(2*total*sizeof(uint16));
- sbuffer = (uint16 *) tbuffer;
- }
- }
- else
- { ungetc(code,stdin);
- break;
- }
- small = 0;
- if (tbuffer != NULL)
- { if (code != 'X')
- { fprintf(stderr,"LAa2b: .las dump has traces but no X-line\n");
- exit (1);
- }
- scanf(" X %d",&tspace);
- small = (tspace <= TRACE_XOVR && tspace != 0);
- fwrite(&code,sizeof(char),1,stdout);
- fwrite(&tspace,sizeof(int),1,stdout);
- }
-
- while (scanf(" %c",&code) == 1) // For each data line do
- { fwrite(&code,sizeof(char),1,stdout);
- switch (code)
- { case 'P': // Alignment pair
- scanf(" %d %d %c %c",&aread,&bread,&orient,&chain);
- fwrite(&aread,sizeof(int),1,stdout);
- fwrite(&bread,sizeof(int),1,stdout);
- fwrite(&orient,sizeof(char),1,stdout);
- fwrite(&chain,sizeof(char),1,stdout);
- break;
- case 'L': // Read lengths
- scanf(" %d %d",&alen,&blen);
- fwrite(&len,sizeof(int),1,stdout);
- fwrite(&blen,sizeof(int),1,stdout);
- break;
- case 'C': // Coordinate intervals
- scanf(" %d %d %d %d",&ab,&ae,&bb,&be);
- fwrite(&ab,sizeof(int),1,stdout);
- fwrite(&ae,sizeof(int),1,stdout);
- fwrite(&bb,sizeof(int),1,stdout);
- fwrite(&be,sizeof(int),1,stdout);
- break;
- case 'D': // Differences
- scanf(" %d",&diffs);
- fwrite(&diffs,sizeof(int),1,stdout);
- break;
- case 'T': // Mask
- if (tbuffer == NULL)
- { fprintf(stderr,"LAa2b: .las dump has traces but no @ T-line\n");
- exit (1);
- }
- scanf(" %d",&len);
- fwrite(&len,sizeof(int),1,stdout);
- len *= 2;
- if (small)
- { for (int i = 0; i < len; i += 2)
- scanf(" %hhd %hhd",tbuffer+i,tbuffer+(i+1));
- fwrite(tbuffer,sizeof(uint8),len,stdout);
- }
- else
- { for (int i = 0; i < len; i += 2)
- scanf(" %hd %hd",sbuffer+i,sbuffer+(i+1));
- fwrite(sbuffer,sizeof(uint16),len,stdout);
- }
- }
- }
-
- exit (0);
-}
diff --git a/LAb2a.c b/LAb2a.c
deleted file mode 100644
index 0c0dbf8..0000000
--- a/LAb2a.c
+++ /dev/null
@@ -1,107 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "DB.h"
-#include "align.h"
-
-int main(int argc, char *argv[])
-{ char code, which;
- int64 total;
- int aread, bread;
- char orient, chain;
- int alen, blen;
- int ab, ae, bb, be;
- int diffs;
- int len;
- int tspace, small;
- uint8 *tbuffer = NULL;
- uint16 *sbuffer = NULL;
-
- (void) argv;
-
- // Process arguments
-
- if (argc > 1)
- { fprintf(stderr,"Usage: LAa2b <(ascii) >(binary)\n");
- exit (1);
- }
-
- if (fread(&code,sizeof(char),1,stdin) == 0)
- code = 0;
-
- while (code == '@' || code == '+' || code == '%')
- { fread(&which,sizeof(char),1,stdin);
- fread(&total,sizeof(int64),1,stdin);
- printf("%c %c %lld\n",code,which,total);
- if (code == '@')
- { tbuffer = (uint8 *) malloc(2*total*sizeof(uint16));
- sbuffer = (uint16 *) tbuffer;
- }
-
- if (fread(&code,sizeof(char),1,stdin) == 0)
- code = 0;
- }
-
- small = 0;
- if (tbuffer != NULL && code != 0)
- { if (code != 'X')
- { fprintf(stderr,"LAb2a: .las dump has traces but no X-info\n");
- exit (1);
- }
- fread(&tspace,sizeof(int),1,stdin);
- small = (tspace <= TRACE_XOVR && tspace != 0);
- printf("X %d\n",tspace);
- }
-
- while (code != 0) // For each data item do
- { switch (code)
- { case 'P': // Alignment pair
- fread(&aread,sizeof(int),1,stdin);
- fread(&bread,sizeof(int),1,stdin);
- fread(&orient,sizeof(char),1,stdin);
- fread(&chain,sizeof(char),1,stdin);
- printf("%c %d %d %c %c\n",code,aread,bread,orient,chain);
- break;
- case 'L': // Read lengths
- scanf(" %d %d",&alen,&blen);
- fread(&len,sizeof(int),1,stdin);
- fread(&blen,sizeof(int),1,stdin);
- printf("%c %d %d\n",code,alen,blen);
- break;
- case 'C': // Coordinate intervals
- fread(&ab,sizeof(int),1,stdin);
- fread(&ae,sizeof(int),1,stdin);
- fread(&bb,sizeof(int),1,stdin);
- fread(&be,sizeof(int),1,stdin);
- printf("%c %d %d %d %d\n",code,ab,ae,bb,be);
- break;
- case 'D': // Differences
- fread(&diffs,sizeof(int),1,stdin);
- printf("%c %d\n",code,diffs);
- break;
- case 'T': // Mask
- if (tbuffer == NULL)
- { fprintf(stderr,"LAb2a: .las dump has traces but no @ T-info\n");
- exit (1);
- }
- fread(&len,sizeof(int),1,stdin);
- printf("%c %d\n",code,len);
- len *= 2;
- if (small)
- { fread(tbuffer,sizeof(uint8),len,stdin);
- for (int i = 0; i < len; i += 2)
- printf(" %d %d\n",tbuffer[i],tbuffer[i+1]);
- }
- else
- { fread(sbuffer,sizeof(uint16),len,stdin);
- for (int i = 0; i < len; i += 2)
- printf(" %d %d\n",sbuffer[i],sbuffer[i+1]);
- }
- }
-
- if (fread(&code,sizeof(char),1,stdin) == 0)
- code = 0;
- }
-
- exit (0);
-}
diff --git a/Makefile b/Makefile
index d3be2ea..2ac6359 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ DEST_DIR = ~/bin
CFLAGS += -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing
-ALL = daligner HPC.daligner LAsort LAmerge LAsplit LAcat LAshow LAdump LAcheck LAa2b LAb2a dumpLA
+ALL = daligner HPC.daligner LAsort LAmerge LAsplit LAcat LAshow LA2ONE LAcheck ONE2LA
all: $(ALL)
@@ -24,8 +24,8 @@ LAmerge: LAmerge.c align.h DB.c DB.h QV.c QV.h
LAshow: LAshow.c align.c align.h DB.c DB.h QV.c QV.h
$(CC) $(CPPFLAGS) $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS)
-LAdump: LAdump.c align.c align.h DB.c DB.h QV.c QV.h
- $(CC) $(CPPFLAGS) $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm $(LDFLAGS)
+LA2ONE: LA2ONE.c align.c align.h DB.c DB.h QV.c QV.h ONElib.c ONElib.h
+ $(CC) $(CPPFLAGS) $(CFLAGS) -o LA2ONE LA2ONE.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS)
LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h
$(CC) $(CPPFLAGS) $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS)
@@ -36,14 +36,8 @@ LAsplit: LAsplit.c align.h DB.c DB.h QV.c QV.h
LAcheck: LAcheck.c align.c align.h DB.c DB.h QV.c QV.h
$(CC) $(CPPFLAGS) $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS)
-LAa2b: LAa2b.c align.c align.h DB.c DB.h QV.c QV.h
- $(CC) $(CPPFLAGS) $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm $(LDFLAGS)
-
-LAb2a: LAb2a.c align.c align.h DB.c DB.h QV.c QV.h
- $(CC) $(CPPFLAGS) $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm $(LDFLAGS)
-
-dumpLA: dumpLA.c align.c align.h DB.c DB.h QV.c QV.h
- $(CC) $(CPPFLAGS) $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm $(LDFLAGS)
+ONE2LA: ONE2LA.c align.c align.h DB.c DB.h QV.c QV.h ONElib.c ONElib.h
+ $(CC) $(CPPFLAGS) $(CFLAGS) -o ONE2LA ONE2LA.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS)
clean:
rm -f $(ALL)
diff --git a/ONE2LA.c b/ONE2LA.c
new file mode 100644
index 0000000..77b4bae
--- /dev/null
+++ b/ONE2LA.c
@@ -0,0 +1,275 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "DB.h"
+#include "align.h"
+#include "ONElib.h"
+
+static char *One_Schema =
+ "P 3 dal\n"
+ "D X 1 3 INT\n" // Data prolog: trace spacing
+ "O P 2 3 INT 8 INT_LIST\n" // A-read and B-read list
+ // All per B-read
+ "D O 1 6 STRING\n" // orientation [+-]
+ "D C 1 6 STRING\n" // chain directive [>+-.]
+ "D A 1 8 INT_LIST\n" // (ab,ae)
+ "D B 1 8 INT_LIST\n" // (be,be)
+ "D L 2 3 INT 8 INT_LIST\n" // la and then each lb
+ "D D 1 8 INT_LIST\n" // diff
+ // One line per B-read
+ "D T 1 8 INT_LIST\n" // trace segment length
+ "D Q 1 8 INT_LIST\n"; // trace segment diffs
+
+int main(int argc, char *argv[])
+{ int64 novls, Pmax, Tmax, psize;
+ Overlap *ovls, *otop, *o;
+ void *trace, *ttop;
+ int aread;
+ int has[128];
+ int tspace, small, tbytes;
+
+ OneFile *file1;
+ OneSchema *schema;
+ int64 *list;
+ char *string, *command;
+
+ int t, i, j, k;
+
+ // Process arguments and capture command line for provenance
+
+ { int n, t;
+ char *c;
+
+ n = 0;
+ for (t = 1; t < argc; t++)
+ n += strlen(argv[t])+1;
+
+ command = Malloc(n+1,"Allocating command string");
+ if (command == NULL)
+ exit (1);
+
+ c = command;
+ if (argc >= 1)
+ { c += sprintf(c,"%s",argv[1]);
+ for (t = 2; t < argc; t++)
+ c += sprintf(c," %s",argv[t]);
+ }
+ *c = '\0';
+ }
+
+ if (argc != 2)
+ { fprintf(stderr,"Usage: ONE2LA <align:dal> > (.las)\n");
+ exit (1);
+ }
+
+ { char *pwd, *root, *path;
+ FILE *output;
+
+ pwd = PathTo(argv[1]);
+ root = Root(argv[1],".dal");
+ path = Catenate(pwd,"/",root,".dal");
+ if ((output = fopen(path,"r")) == NULL)
+ { free(root);
+ root = Root(argv[1],".1dal");
+ path = Catenate(pwd,"/",root,".1dal");
+ if ((output = fopen(path,"r")) == NULL)
+ { fprintf(stderr,"ONE2LA: Cannot open %s for reading\n",argv[1]);
+ exit (1);
+ }
+ }
+ fclose(output);
+ free(root);
+ free(pwd);
+
+ schema = oneSchemaCreateFromText(One_Schema);
+
+ file1 = oneFileOpenRead(path,schema,"dal",1);
+
+ oneAddProvenance(file1,"ONE2LA","1.0","%s >?.las",command);
+ }
+
+ if (file1->info['A']->given.count == 0)
+ { fprintf(stderr,"ONE2LA: .dal file does not contatin coordinate information\n");
+ exit (1);
+ }
+ if (file1->info['T']->given.count == 0)
+ { fprintf(stderr,"ONE2LA: .dal file does not contatin trace information\n");
+ exit (1);
+ }
+
+ t = oneReadLine(file1);
+ if (t == 0 || t != 'X')
+ { fprintf(stderr,"ONE2LA: .dal data segment does not begine with an 'X'-line\n");
+ exit (1);
+ }
+
+ tspace = oneInt(file1,0);
+ if (tspace <= TRACE_XOVR && tspace != 0)
+ { small = 1;
+ tbytes = 1;
+ }
+ else
+ { small = 0;
+ tbytes = 2;
+ }
+
+ Pmax = file1->info['P']->given.max;
+ Tmax = file1->info['T']->given.max;
+
+ trace = Malloc(sizeof(uint16)*Tmax*Pmax,"Allocating trace buffer");
+ ovls = Malloc(sizeof(Overlap)*Pmax,"Allocating overlap vector");
+ list = Malloc(sizeof(int64)*(Pmax+Tmax),"Allocating integer list");
+ string = Malloc(Pmax+1,"Allocating max string");
+
+ novls = file1->info['P']->given.total;
+ fwrite(&novls,sizeof(int64),1,stdout);
+ fwrite(&tspace,sizeof(int),1,stdout);
+
+ while ((t = oneReadLine(file1)) != 0)
+ { if (t != 'P')
+ { fprintf(stderr,"ONE2LA: Pile data does not begin with a P-line\n");
+ exit(1);
+ }
+ psize = oneLen(file1);
+ list = oneIntList(file1);
+ aread = oneInt(file1,0)-1;
+ for (i = 0; i < psize; i++)
+ { ovls[i].aread = aread;
+ ovls[i].bread = list[i]-1;
+ }
+
+ ttop = trace;
+ otop = ovls + psize;
+ has['O'] = has['C'] = has['A'] = has['B'] = has['L'] = has['D'] = has['T'] = has['Q'] = 0;
+ for (o = ovls; o < otop; o++)
+ { o->flags = 0;
+ o->path.tlen = -1;
+ }
+
+
+ for (j = 0; j < 6+2*psize; j++)
+ { t = oneReadLine(file1);
+
+ if (t == 0)
+ { fprintf(stderr,"ONE2LA: Pile object not followed by sufficient auxilliary lines\n");
+ exit (1);
+ }
+ if (has[t] > 0 && t != 'T' && t != 'Q')
+ { fprintf(stderr,"ONE2LA: Pile has more than one '%c' line\n",t);
+ exit (1);
+ }
+ has[t] += 1;
+ if (t == 'A' || t == 'B')
+ { if (oneLen(file1) != 2*psize)
+ { fprintf(stderr,"ONE2LA: %c-line has incorrect list length\n",t);
+ exit (1);
+ }
+ }
+ else if (t != 'T' && t != 'Q')
+ { if (oneLen(file1) != psize)
+ { fprintf(stderr,"ONE2LA: %c-line has incorrect list length\n",t);
+ exit (1);
+ }
+ }
+ else
+ { if (has[t] > psize)
+ { fprintf(stderr,"ONE2LA: Too many %c-lines for pile\n",t);
+ exit (1);
+ }
+ }
+
+ switch (t)
+ { case 'O':
+ string = oneString(file1);
+ i = 0;
+ for (o = ovls; o < otop; o++)
+ if (string[i++] == 'c')
+ o->flags |= COMP_FLAG;
+ break;
+ case 'C':
+ string = oneString(file1);
+ i = 0;
+ for (o = ovls; o < otop; o++)
+ if (string[i] == '-')
+ o->flags |= NEXT_FLAG;
+ else if (string[i] == '>')
+ o->flags |= BEST_FLAG;
+ else if (string[i] == '+')
+ o->flags |= START_FLAG;
+ break;
+ case 'A':
+ list = oneIntList(file1);
+ i = 0;
+ for (o = ovls; o < otop; o++)
+ { o->path.abpos = list[i++];
+ o->path.aepos = list[i++];
+ }
+ break;
+ case 'B':
+ list = oneIntList(file1);
+ i = 0;
+ for (o = ovls; o < otop; o++)
+ { o->path.bbpos = list[i++];
+ o->path.bepos = list[i++];
+ }
+ break;
+ case 'L':
+ break;
+ case 'D':
+ list = oneIntList(file1);
+ i = 0;
+ for (o = ovls; o < otop; o++)
+ o->path.diffs = list[i++];
+ break;
+ case 'T':
+ case 'Q':
+ list = oneIntList(file1);
+ o = ovls + (has[t]-1);
+ if (o->path.tlen >= 0)
+ { if (o->path.tlen != 2*oneLen(file1))
+ { fprintf(stderr,"LA2ONE: T and Q line lengths do not correspond\n");
+ exit (1);
+ }
+ }
+ else
+ { o->path.tlen = 2*oneLen(file1);
+ o->path.trace = ttop;
+ ttop += o->path.tlen*tbytes;
+ }
+ if (t == 'Q')
+ k = 0;
+ else
+ k = 1;
+ if (tbytes == 1)
+ { uint8 *t8 = (uint8 *) o->path.trace;
+ for (i = 0; k < o->path.tlen; k += 2)
+ t8[k] = list[i++];
+ }
+ else
+ { uint16 *t16 = (uint16 *) o->path.trace;
+ for (i = 0; k < o->path.tlen; k += 2)
+ t16[k] = list[i++];
+ }
+ break;
+ default:
+ fprintf(stderr,"LA2ONE: Unrecognized line type '%c'\n",t);
+ exit (1);
+ }
+ }
+
+ if (has['T'] != psize || has['Q'] != psize)
+ { fprintf(stderr,"ONE2LA: # of pile traces != pile size\n");
+ exit (1);
+ }
+
+ for (o = ovls; o < otop; o++)
+ Write_Overlap(stdout,o,tbytes);
+ }
+
+ oneSchemaDestroy(schema);
+
+ free(command);
+
+ exit (0);
+}
diff --git a/ONElib.c b/ONElib.c
new file mode 100644
index 0000000..a47e3bd
--- /dev/null
+++ b/ONElib.c
@@ -0,0 +1,3924 @@
+/*****************************************************************************************
+ *
+ * file: ONElib.c
+ * implementation for ONElib.h
+ *
+ * Author: Richard Durbin (rd109@cam.ac.uk)
+ * Copyright (C) Richard Durbin, Cambridge University and Eugene Myers 2019-
+ *
+ * HISTORY:
+ * Last edited: Dec 4 23:57 2022 (rd109)
+ * * Apr 23 00:31 2020 (rd109): global rename of VGP to ONE, Vgp to One, vgp to one
+ * * Apr 20 11:27 2020 (rd109): added VgpSchema to make schema dynamic
+ * * Dec 27 09:46 2019 (gene): style edits + compactify code
+ * * Jul 8 04:28 2019 (rd109): refactored to use info[]
+ * * Created: Thu Feb 21 22:40:28 2019 (rd109)
+ *
+ ****************************************************************************************/
+
+#include <sys/errno.h>
+#include <sys/types.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <stdarg.h>
+#include <time.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/uio.h>
+#include <math.h>
+
+#ifdef DEBUG
+#include <assert.h>
+#else
+#define assert(x) 0
+#endif
+
+#include "ONElib.h"
+
+// set major and minor code versions
+
+#define MAJOR 1
+#define MINOR 1
+
+// utilities with implementation at the end of the file
+
+static void die(char *format, ...); // print message to stderr and exit -1
+static void *myalloc(size_t size); // allocate block, die if malloc fails
+static void *mycalloc(size_t number, size_t size); // allocate & zero # objects of size
+#define new(n,type) (type *) myalloc((n)*sizeof(type)) // actually use these not myalloc
+#define new0(n,type) (type *) mycalloc((n),sizeof(type))
+
+// global required for parallelisation
+
+static pthread_mutex_t mutexInit = PTHREAD_MUTEX_INITIALIZER;
+
+// forward declarations of serialisation functions lower in the file
+// RD 220818: I think that many of int below should be I64, e.g. for len, ilen etc.
+
+OneCodec *vcCreate();
+void vcAddToTable(OneCodec *vc, int len, char *bytes);
+void vcAddHistogram(OneCodec *vc, OneCodec *vh);
+void vcCreateCodec(OneCodec *vc, int partial);
+void vcDestroy(OneCodec *vc);
+int vcMaxSerialSize();
+int vcSerialize(OneCodec *vc, void *out);
+OneCodec *vcDeserialize(void *in);
+int vcEncode(OneCodec *vc, int ilen, char *ibytes, char *obytes);
+int vcDecode(OneCodec *vc, int ilen, char *ibytes, char *obytes);
+
+// forward declarations of 64-bit integer encoding/decoding
+
+static inline int ltfWrite (I64 x, FILE *f) ;
+static inline I64 ltfRead (FILE *f) ;
+
+/***********************************************************************************
+ *
+ * ONE_FILE CREATION & DESTRUCTION
+ *
+ **********************************************************************************/
+
+char* oneTypeString[] = { 0, "INT", "REAL", "CHAR", "STRING",
+ "INT_LIST", "REAL_LIST", "STRING_LIST", "DNA" } ;
+
+/******************* OneInfo ********************/
+
+static OneInfo *infoCreate (int nField)
+{ OneInfo *vi = new0 (1, OneInfo) ;
+ vi->nField = nField ;
+ if (nField) vi->fieldType = new (nField, OneType) ;
+ return vi;
+}
+
+static OneInfo *infoDeepCopy (OneInfo *vi0)
+{ OneInfo *vi = new (1, OneInfo) ;
+ *vi = *vi0 ;
+ if (vi->nField)
+ { vi->fieldType = new (vi->nField, OneType) ;
+ memcpy (vi->fieldType, vi0->fieldType, vi->nField*sizeof(OneType)) ;
+ }
+ if (vi->listCodec && vi->listCodec != DNAcodec) vi->listCodec = vcCreate() ;
+ if (vi->comment) vi->comment = strdup (vi0->comment) ;
+ return vi ;
+}
+
+static bool infoCheckFields (OneInfo *vi, OneFile *vf)
+{ // check field types against the STRING_LIST in vf
+ char *s = oneString(vf) ;
+ int i ;
+ if (vi->nField != oneLen(vf)) return false ;
+ for (i = 0 ; i < vi->nField ; ++i, s = oneNextString(vf,s))
+ if (strcmp (oneTypeString[vi->fieldType[i]], s)) return false ;
+ return true ;
+}
+
+static void infoDestroy (OneInfo *vi)
+{ if (vi->buffer && ! vi->isUserBuf) free (vi->buffer) ;
+ if (vi->listCodec) vcDestroy (vi->listCodec) ;
+ if (vi->fieldType) free (vi->fieldType) ;
+ if (vi->comment) free (vi->comment) ;
+ free (vi);
+}
+
+/******************* OneSchema ********************/
+
+// a utility to set the OneInfo list information
+
+static int listEltSize[9] = { 0, 0, 0, 0, 1, sizeof(I64), sizeof(double), 1, 1 } ;
+
+static void schemaAddInfoFromArray (OneSchema *vs, int n, OneType *a, char t, char type)
+{
+ // use during the bootstrap, while parsing .def files, and while parsing ~ lines in other files
+
+ if (vs->info[(int) t])
+ die ("duplicate schema specification for linetype %c in filetype %s", t, vs->primary) ;
+ if (isalpha(t) && type == 'G')
+ { if (vs->groupType) die ("second group type in schema for filetype %s", vs->primary) ;
+ vs->groupType = t ;
+ }
+ else if (isalpha(t) && type == 'O')
+ { if (vs->objectType) die ("second object type in schema for filetype %s", vs->primary) ;
+ vs->objectType = t ;
+ }
+ else if (vs->primary && (type != 'D' || !isalpha(t))) // allow non-alphabetic lines in header
+ die ("non-alphabetic linetype %c (ascii %d) in schema for filetype %s",t,t,vs->primary) ;
+
+ if (n > vs->nFieldMax) vs->nFieldMax = n ;
+
+ OneInfo *vi = infoCreate (n) ;
+
+ memcpy (vi->fieldType, a, n*sizeof(OneType)) ;
+ int i ;
+ for (i = 0 ; i < n ; ++i)
+ if (a[i] >= oneSTRING)
+ { if (vi->listEltSize)
+ die ("OneFile schema error; multiple list types for linetype definition %c", t) ;
+ vi->listEltSize = listEltSize[vi->fieldType[i]] ;
+ vi->listField = i ;
+ if (a[i] == oneDNA)
+ { vi->listCodec = DNAcodec ; vi->isUseListCodec = true ; }
+ else
+ vi->listCodec = vcCreate () ; // always make a listCodec for any list type
+ }
+
+ if (t >= 'A' && t <= 'Z') vi->binaryTypePack = ((t-'A') << 1) | (char) 0x80 ;
+ else if (t >= 'a' && t <= 'z') vi->binaryTypePack = ((26+t-'a') << 1) | (char) 0x80 ;
+ else if (t == ';') vi->binaryTypePack = (52 << 2) | (char) 0x80 ;
+ else if (t == '&') vi->binaryTypePack = (53 << 2) | (char) 0x80 ;
+ else if (t == '*') vi->binaryTypePack = (54 << 2) | (char) 0x80 ;
+ else if (t == '/') vi->binaryTypePack = (55 << 2) | (char) 0x80 ;
+ else if (t == '.') vi->binaryTypePack = (56 << 2) | (char) 0x80 ;
+
+ vs->info[(int)t] = vi ;
+}
+
+static void schemaAddInfoFromLine (OneSchema *vs, OneFile *vf, char t, char type)
+{ // assumes field specification is in the STRING_LIST of the current vf line
+ // need to set vi->comment separately
+
+ static OneType a[32] ;
+ int i ;
+ OneType j ;
+ char *s = oneString(vf) ;
+ int n = oneLen(vf) ;
+
+ if (n > 32)
+ die ("line specification %d fields too long - need to recompile", n) ;
+
+ for (i = 0 ; i < n ; ++i, s = oneNextString(vf,s))
+ { a[i] = 0 ;
+ for (j = oneINT ; j <= oneDNA ; ++j)
+ if (!strcmp (s, oneTypeString[j])) a[i] = j ;
+ if (!a[i])
+ die ("ONE schema error: bad field %d of %d type %s in line %d type %c",
+ i, n, s, vf->line, t) ;
+ }
+
+ schemaAddInfoFromArray (vs, n, a, t, type) ;
+
+ if (oneReadComment (vf))
+ vs->info[(int)t]->comment = strdup (oneReadComment(vf)) ;
+}
+
+static OneSchema *schemaLoadRecord (OneSchema *vs, OneFile *vf)
+{ char *s;
+
+ // parse a schema specfication line from vf and add into vs
+ // return value is vs unless a new primary type is declared, in which case vs->nxt
+
+ switch (vf->lineType)
+ {
+ case '.': // ignore - blank or comment line in schema file
+ break ;
+ case 'P':
+ if (vs->primary && !vs->objectType)
+ die ("schema: file type %s has no object type", vs->primary) ;
+ if (oneLen(vf) == 0) die ("schema: primary name must have at least one letter") ;
+ OneSchema *vsNxt = new0 (1, OneSchema) ;
+ vs->nxt = vsNxt ;
+ vs = vsNxt ;
+ s = oneString(vf);
+ vs->primary = new (oneLen(vf)+1, char) ;
+ strcpy (vs->primary, s) ;
+ vs->nFieldMax = 4 ; // needed for header
+ break ;
+ case 'S':
+ if (oneLen(vf) == 0) die ("schema: secondary name must have at least one letter") ;
+ if (vs->nSecondary)
+ { char **temp = vs->secondary ;
+ vs->secondary = new (vs->nSecondary+1, char*) ;
+ memcpy (vs->secondary, temp, vs->nSecondary*sizeof(char*)) ;
+ free (temp) ;
+ }
+ else
+ vs->secondary = new (1, char*) ;
+ s = oneString(vf);
+ vs->secondary[vs->nSecondary] = new0 (oneLen(vf)+1, char) ;
+ strcpy (vs->secondary[vs->nSecondary++], s) ;
+ break ;
+ case 'G': // group type
+ case 'O': // object type
+ case 'D': // standard record type
+ schemaAddInfoFromLine (vs, vf, oneChar(vf,0), vf->lineType) ;
+ break ;
+ default:
+ die ("unrecognized schema line %d starting with %c", vf->line, vf->lineType) ;
+ }
+
+ return vs ;
+}
+
+static void oneFileDestroy (OneFile *vf) ; // need a forward declaration here
+
+OneSchema *oneSchemaCreateFromFile (char *filename)
+{
+ FILE *fs = fopen (filename, "r") ;
+ if (!fs) return 0 ;
+ fclose(fs);
+
+ OneSchema *vs = new0 (1, OneSchema) ;
+
+ OneFile *vf = new0 (1, OneFile) ; // shell object to support bootstrap
+ // bootstrap specification of linetypes to read schemas
+ { OneInfo *vi ;
+ vi = vf->info['P'] = infoCreate (1) ; // to define the schema for parsing a .def file
+ vi->fieldType[0] = oneSTRING ; vi->listEltSize = 1 ; vi->listField = 0 ;
+ vi = vf->info['O'] = infoCreate (2) ; // object type specification
+ vi->fieldType[0] = oneCHAR ;
+ vi->fieldType[1] = oneSTRING_LIST ; vi->listEltSize = 1 ; vi->listField = 1 ;
+ vi = vf->info['D'] = infoCreate (2) ; // line type specification
+ vi->fieldType[0] = oneCHAR ;
+ vi->fieldType[1] = oneSTRING_LIST ; vi->listEltSize = 1 ; vi->listField = 1 ;
+ vf->info['/'] = infoCreate (0) ; // to store comments
+ vf->field = new (2, OneField) ;
+ }
+
+ // first load the universal header and footer (non-alphabetic) line types
+ // do this by writing their schema into a temporary file and parsing it into the base schema
+ { errno = 0 ;
+ static char template[64] ;
+#define VALGRIND_MACOS
+#ifdef VALGRIND_MACOS // MacOS valgrind is missing functions to make temp files it seems
+ sprintf (template, "/tmp/OneSchema.%d", getpid()) ;
+ vf->f = fopen (template, "w+") ;
+ if (errno) die ("failed to open temporary file %s errno %d\n", template, errno) ;
+#else
+ strcpy (template, "/tmp/OneSchema.XXXXXX") ;
+ int fd = mkstemp (template) ;
+ if (errno) die ("failed to open temporary file %s errno %d\n", template, errno) ;
+ vf->f = fdopen (fd, "w+") ;
+ if (errno) die ("failed to assign temporary file to stream: errno %d\n", errno) ;
+#endif
+ unlink (template) ; // this ensures that the file is removed on closure
+ if (errno) die ("failed to remove temporary file %s errno %d\n", template, errno) ;
+ }
+
+ // NB if you change the header spec and add a record with more than 4 fields,
+ // change the assignment of ->nFieldMax in the 'P' section of schemaLoadRecord() above
+
+ fprintf (vf->f, "D 1 3 6 STRING 3 INT 3 INT first line: 3-letter type, major, minor version\n") ;
+ fprintf (vf->f, "D 2 1 6 STRING subtype: 3-letter subtype\n") ;
+ fprintf (vf->f, "D # 2 4 CHAR 3 INT linetype, count\n") ;
+ fprintf (vf->f, "D @ 2 4 CHAR 3 INT linetype, list max\n") ;
+ fprintf (vf->f, "D + 2 4 CHAR 3 INT linetype, list total\n") ;
+ fprintf (vf->f, "D %% 4 4 CHAR 4 CHAR 4 CHAR 3 INT group, #/+, linetype, value\n") ;
+ fprintf (vf->f, "D ! 1 11 STRING_LIST provenance: program, version, command, date\n") ;
+ fprintf (vf->f, "D < 2 6 STRING 3 INT reference: filename, object count\n") ;
+ fprintf (vf->f, "D > 1 6 STRING deferred: filename\n") ;
+ fprintf (vf->f, "D ~ 3 4 CHAR 4 CHAR 11 STRING_LIST embedded schema linetype definition\n") ;
+ fprintf (vf->f, "D . 0 blank line, anywhere in file\n") ;
+ fprintf (vf->f, "D $ 1 3 INT binary file - goto footer: isBigEndian\n") ;
+ fprintf (vf->f, "D ^ 0 binary file: end of footer designation\n") ;
+ fprintf (vf->f, "D - 1 3 INT binary file: offset of start of footer\n") ;
+ fprintf (vf->f, "D & 1 8 INT_LIST binary file: object index\n") ;
+ fprintf (vf->f, "D * 1 8 INT_LIST binary file: group index\n") ;
+ fprintf (vf->f, "D ; 2 4 CHAR 6 STRING binary file: list codec\n") ;
+ fprintf (vf->f, "D / 1 6 STRING binary file: comment\n") ;
+ if (fseek (vf->f, 0, SEEK_SET)) die ("ONE schema failure: cannot rewind tmp file") ;
+ while (oneReadLine (vf))
+ schemaLoadRecord (vs, vf) ;
+
+ // next reuse the temp file to load the schema for reading schemas
+ if (fseek (vf->f, 0, SEEK_SET)) die ("ONE schema failure: cannot rewind tmp file") ;
+ fprintf (vf->f, "P 3 def this is the primary file type for schemas\n") ;
+ fprintf (vf->f, "O P 1 6 STRING primary type name\n") ;
+ fprintf (vf->f, "D S 1 6 STRING secondary type name\n") ;
+ fprintf (vf->f, "D G 2 4 CHAR 11 STRING_LIST define linetype for groupType\n") ;
+ fprintf (vf->f, "D O 2 4 CHAR 11 STRING_LIST define linetype for objectType\n") ;
+ fprintf (vf->f, "D D 2 4 CHAR 11 STRING_LIST define linetype for other records\n") ;
+ fprintf (vf->f, "\n") ; // terminator
+ if (fseek (vf->f, 0, SEEK_SET)) die ("ONE schema failure: cannot rewind tmp file") ;
+ OneSchema *vs0 = vs ; // need this because loadInfo() updates vs on reading P lines
+ vf->line = 0 ;
+ while (oneReadLine (vf))
+ vs = schemaLoadRecord (vs, vf) ;
+ OneSchema *vsDef = vs ; // will need this to destroy it once the true schema is read
+ oneFileDestroy (vf) ; // this will also effectively remove the temp file on closing
+
+ // finally read the schema itself
+ if (!(vf = oneFileOpenRead (filename, vs0, "def", 1)))
+ return 0 ;
+ vs = vs0 ; // set back to vs0, so next filetype spec will replace vsDef
+ vs->nxt = 0 ;
+ oneSchemaDestroy (vsDef) ; // no longer need this, and can destroy because unlinked from vs0
+ while (oneReadLine (vf))
+ vs = schemaLoadRecord (vs, vf) ;
+ oneFileDestroy (vf) ;
+
+ return vs0 ;
+}
+
+static char *schemaFixNewlines (const char *text)
+{ // replace literal "\n" by '\n' chars in text
+ char *newText = strdup (text) ;
+ char *s = newText, *t = s ;
+ while (*s)
+ if (*s == '\\' && s[1] == 'n')
+ { *t++ = '\n' ; s += 2 ; }
+ else
+ *t++ = *s++ ;
+ *t = 0 ;
+ return newText ;
+}
+
+OneSchema *oneSchemaCreateFromText (char *text) // write to temp file and call CreateFromFile()
+{
+ static char template[64] ;
+ sprintf (template, "/tmp/OneTextSchema-%d.def", getpid()) ;
+
+ errno = 0 ;
+ FILE *f = fopen (template, "w") ;
+ char *fixedText = schemaFixNewlines (text) ;
+ fprintf (f, "%s\n", fixedText) ;
+ free (fixedText) ;
+ fclose (f) ;
+ if (errno) die ("failed to write temporary file %s errno %d\n", template, errno) ;
+
+ OneSchema *vs = oneSchemaCreateFromFile (template) ;
+
+ errno = 0 ;
+ unlink (template) ; // delete temporary file - not ideal: will remain if schemaCreate crashes
+ if (errno) die ("failed to remove temporary file %s errno %d\n", template, errno) ;
+
+ return vs ;
+}
+
+static OneSchema *oneSchemaCreateDynamic (char *fileType, char *subType)
+{ // this is clean, but it seems a bit wasteful to create a temp file
+ char text[32] ;
+ assert (fileType && strlen(fileType) > 0) ;
+ assert (!subType || strlen(subType) > 0) ;
+ if (subType)
+ sprintf (text, "P %ld %s\nS %ld %s\n", strlen(fileType),fileType, strlen(subType), subType) ;
+ else
+ sprintf (text, "P %ld %s\n", strlen(fileType), fileType) ;
+ OneSchema *vs = oneSchemaCreateFromText (text) ;
+ return vs ;
+}
+
+void oneSchemaDestroy (OneSchema *vs)
+{ int i ;
+ while (vs)
+ { for (i = 0 ; i < 128 ; ++i) if (vs->info[i]) infoDestroy (vs->info[i]) ;
+ if (vs->nSecondary)
+ { for (i = 0 ; i < vs->nSecondary ; ++i) free (vs->secondary[i]) ;
+ free (vs->secondary) ;
+ }
+ free(vs->primary);
+ OneSchema *t = vs->nxt ;
+ free (vs) ;
+ vs = t ;
+ }
+}
+
+/*************************************/
+
+static inline void setCodecBuffer (OneInfo *vi)
+{
+ vi->bufSize = vcMaxSerialSize() + 1; // +1 for added but unused 0-terminator
+ vi->buffer = new (vi->bufSize, void);
+}
+
+static OneFile *oneFileCreate (OneSchema **vsp, char *type)
+{ // searches through the linked list of vs to find type, either as primary or a secondary
+ // if found fills and returns vf, else returns 0
+
+ int i, j ;
+ OneFile *vf = new0 (1, OneFile) ;
+ char *secondary = 0 ;
+ OneSchema *vs = *vsp ;
+
+ // fprintf (stderr, "oneFileCreate vs %lx type %s\n", (unsigned long)vs, type) ;
+
+ // transfer header info
+ for (i = 0 ; i < 128 ; ++i)
+ if (vs->info[i]) vf->info[i] = infoDeepCopy (vs->info[i]) ;
+
+ // find type in schema
+ while ((vs = vs->nxt))
+ if (!strcmp (type, vs->primary))
+ break ;
+ else if (vs->nSecondary)
+ { for (j = 0 ; j < vs->nSecondary ; ++j)
+ if (!strcmp (type, vs->secondary[j])) break ;
+ if (j < vs->nSecondary) { secondary = vs->secondary[j] ; break ; }
+ }
+ if (!vs)
+ { oneFileDestroy (vf) ;
+ return 0 ; // failed to find a match
+ }
+
+ // transfer info from matched schema
+ for (i = 0 ; i < 128 ; ++i)
+ if (vs->info[i]) vf->info[i] = infoDeepCopy (vs->info[i]) ;
+
+ // build binaryTypeUnpack[]
+ for (i = 0 ; i < 128 ; ++i)
+ if (vf->info[i] && vf->info[i]->binaryTypePack)
+ { U8 x = vf->info[i]->binaryTypePack ;
+ vf->binaryTypeUnpack[x] = i ;
+ vf->binaryTypeUnpack[x+1] = i ;
+ }
+
+ // set other information
+ vf->objectType = vs->objectType ;
+ vf->groupType = vs->groupType ;
+ vf->fileType = new (strlen(vs->primary)+1, char);
+ strcpy (vf->fileType, vs->primary) ;
+ if (secondary)
+ { vf->subType = new (strlen(secondary)+1, char);
+ strcpy (vf->subType, secondary) ;
+ }
+ vf->nFieldMax = vs->nFieldMax ;
+ vf->field = new (vf->nFieldMax, OneField) ;
+
+ // setup for compression
+
+ vf->codecTrainingSize = 100000;
+ setCodecBuffer (vf->info[';']) ;
+
+ // determine endian of machine
+ { int t = 1;
+ char *b = (char *) (&t);
+ vf->isBig = (*b == 0);
+ }
+
+ *vsp = vs ;
+ return vf ;
+}
+
+static void provRefDefCleanup (OneFile *vf)
+{ int n ;
+
+ if (vf->provenance)
+ { OneProvenance *p = vf->provenance ;
+ for (n = vf->info['!']->accum.count ; n-- ; p++)
+ { free (p->program) ;
+ free (p->version) ;
+ free (p->command) ;
+ free (p->date) ;
+ }
+ free (vf->provenance) ;
+ }
+ if (vf->reference)
+ { OneReference *r = vf->reference ;
+ for (n = vf->info['<']->accum.count ; n-- ; r++)
+ free (r->filename) ;
+ free (vf->reference) ;
+ }
+ if (vf->deferred)
+ { OneReference *r = vf->deferred ;
+ for (n = vf->info['>']->accum.count ; n-- ; r++)
+ free (r->filename) ;
+ free (vf->deferred) ;
+ }
+}
+
+static void oneFileDestroy (OneFile *vf)
+{ int i, j;
+ OneInfo *li, *lx;
+
+ if (vf->share)
+ { for (i = 0; i < 128 ; i++)
+ { lx = vf->info[i];
+ if (lx != NULL)
+ { for (j = 1; j < vf->share; j++)
+ { li = vf[j].info[i];
+ if (li != lx) // the index OneInfos are shared
+ { if (li->listCodec == lx->listCodec) li->listCodec = NULL;
+ infoDestroy(li);
+ }
+ }
+ }
+ }
+
+ for (j = 1; j < vf->share; j++)
+ { provRefDefCleanup (&vf[j]) ;
+ if (vf[j].codecBuf != NULL) free (vf[j].codecBuf);
+ if (vf[j].f != NULL) fclose (vf[j].f);
+ }
+ }
+
+ provRefDefCleanup (vf) ;
+ if (vf->codecBuf != NULL) free (vf->codecBuf);
+ if (vf->f != NULL && vf->f != stdout) fclose (vf->f);
+
+ for (i = 0; i < 128 ; i++)
+ if (vf->info[i] != NULL)
+ infoDestroy (vf->info[i]);
+
+ if (vf->field) free (vf->field) ;
+
+ if (vf->headerText)
+ { OneHeaderText *t = vf->headerText ;
+ while (t)
+ { free (t->text) ;
+ { OneHeaderText *nxt = t->nxt ; free (t) ; t = nxt ; }
+ }
+ }
+
+ free(vf->fileType);
+ free(vf->subType);
+
+ free (vf) ;
+}
+
+/***********************************************************************************
+ *
+ * ASCII PARSING UTILITIES: error reporting, lexical level
+ *
+ **********************************************************************************/
+
+void parseError (OneFile *vf, char *format, ...)
+{ va_list args;
+
+ fprintf (stderr, "ONE PARSE ERROR ");
+
+ va_start (args, format);
+ vfprintf (stderr, format, args);
+ va_end (args);
+
+ vf->lineBuf[vf->linePos] = '\0';
+ fprintf (stderr, ", line %" PRId64 ": %s\n", vf->line, vf->lineBuf);
+
+ exit (1);
+}
+
+static inline char vfGetc(OneFile *vf)
+{ char c = getc(vf->f);
+ if (vf->linePos < 127)
+ vf->lineBuf[vf->linePos++] = c;
+ return c;
+}
+
+static inline void eatWhite (OneFile *vf)
+{ char x = vfGetc(vf);
+ if (x == ' ') // 200414: removed option to have tab instead of space
+ return;
+ parseError (vf, "failed to find expected space separation character");
+}
+
+static inline char readChar(OneFile *vf)
+{ eatWhite(vf);
+ return vfGetc(vf);
+}
+
+static inline char *readBuf(OneFile *vf)
+{ char x, *cp, *endBuf;
+
+ eatWhite (vf);
+ endBuf = vf->numberBuf + 32;
+ for (cp = vf->numberBuf; cp < endBuf ; cp++)
+ { x = vfGetc(vf);
+ if (isspace(x) || x == '\0' || x == EOF)
+ break;
+ *cp = x;
+ }
+ if (cp >= endBuf)
+ { cp[-1] = 0;
+ parseError (vf, "overlong item %s", vf->numberBuf);
+ }
+ else
+ { ungetc (x, vf->f);
+ vf->linePos -= 1;
+ *cp = 0;
+ }
+ return vf->numberBuf;
+}
+
+static inline I64 readInt(OneFile *vf)
+{ char *e, *b;
+ I64 x;
+
+ b = readBuf(vf);
+ x = strtoll(b, &e, 10);
+ if (e == b)
+ parseError (vf, "empty int field");
+ if (*e != '\0')
+ parseError (vf, "bad int");
+ return x;
+}
+
+static inline double readReal(OneFile *vf)
+{ char *e, *b;
+ double x;
+
+ b = readBuf(vf);
+ x = strtod (b, &e);
+ if (e == b)
+ parseError (vf, "empty real field");
+ if (*e != '\0')
+ parseError (vf, "bad real");
+ return (x);
+}
+
+static inline void readString(OneFile *vf, char *buf, I64 n)
+{ eatWhite (vf);
+ if (vf->isCheckString)
+ { char *cp = buf;
+ --cp;
+ while (n-- && (*++cp = vfGetc (vf)))
+ if (*cp == '\n' || *cp == EOF)
+ break;
+ if (++n)
+ parseError (vf, "line too short %d", buf);
+ *++cp = 0;
+ }
+ else
+ { if ((I64) fread (buf, 1, n, vf->f) != n)
+ die ("ONE parse error: failed to read %d byte string", n);
+ buf[n] = 0 ;
+ }
+}
+
+static inline void readFlush (OneFile *vf) // reads to the end of the line and stores as comment
+{ char x;
+ int n = 0;
+ OneInfo *li = vf->info['/'] ;
+
+ // check the first character - if it is newline then done
+ x = getc (vf->f) ;
+ if (x == '\n')
+ return ;
+ else if (x != ' ')
+ parseError (vf, "comment not separated by a space") ;
+
+ // else the remainder of the line is a comment
+ if (!li->bufSize)
+ { li->bufSize = 1024 ;
+ li->buffer = new (li->bufSize, char) ;
+ }
+ while ((x = getc (vf->f)) && x != '\n')
+ if (x == EOF)
+ parseError (vf, "premature end of file");
+ else
+ { if ((n+1) >= li->bufSize)
+ { char *s = new (2*li->bufSize, char) ;
+ memcpy (s, li->buffer, li->bufSize) ;
+ free (li->buffer) ;
+ li->buffer = s ;
+ li->bufSize *= 2 ;
+ }
+ ((char*)li->buffer)[n] = x ;
+ ++n ;
+ }
+ ((char*)li->buffer)[n] = 0 ; // string terminator
+}
+
+
+/***********************************************************************************
+ *
+ * LIST BUFFER & COUNT MANAGEMENT: error reporting, lexical level
+ *
+ **********************************************************************************/
+
+ // Ensure line type t buffer can handles size+nStrings, and accumulate counts
+
+static inline void updateCountsAndBuffer (OneFile *vf, char t, I64 size, I64 nStrings)
+{ OneInfo *li;
+
+ li = vf->info[(int) t];
+ li->accum.total += size;
+ if (size > li->accum.max)
+ li->accum.max = size;
+ size += nStrings; // need to allocate space for terminal 0s
+ if ( ! li->isUserBuf && size > li->bufSize) // expand buffer
+ { if (li->buffer != NULL) free (li->buffer);
+ li->bufSize = size;
+ li->buffer = new (size*li->listEltSize, void);
+ }
+}
+
+ // Called when a new group starts or eof, accumulate group counts since last group start
+
+static inline void updateGroupCount(OneFile *vf, bool isGroupLine)
+{ int i;
+ OneInfo *li;
+ OneCounts *ci;
+
+ for (i = 'A'; i <= 'Z' ; i++)
+ { li = vf->info[i];
+ if (li != NULL)
+ { ci = &(li->accum);
+ if (vf->inGroup)
+ { if (ci->groupCount < ci->count - li->gCount)
+ ci->groupCount = ci->count - li->gCount;
+ if (ci->groupTotal < ci->total - li->gTotal)
+ ci->groupTotal = ci->total - li->gTotal;
+ }
+ else
+ { li->oCount = ci->count;
+ li->oTotal = ci->total;
+ }
+ li->gCount = ci->count;
+ li->gTotal = ci->total;
+ }
+ }
+ if (isGroupLine)
+ { vf->group += 1;
+ vf->inGroup = true;
+ }
+}
+
+/***********************************************************************************
+ *
+ * BINARY INT LIST COMPACTION & UNCOMPACTION
+ *
+ **********************************************************************************/
+
+static char *compactIntList (OneFile *vf, OneInfo *li, I64 len, char *buf, int *usedBytes)
+{ char *y;
+ int d, k;
+ I64 z, i, mask, *ibuf;
+
+ ibuf = (I64 *) buf;
+
+ for (i = len-1; i > 0; i--) // convert to differences - often a big win, else harmless
+ ibuf[i] -= ibuf[i-1];
+
+ mask = 0; // find how many top bytes can be skipped
+ for (i = 1; i < len; i++)
+ if (ibuf[i] >= 0)
+ mask |= ibuf[i];
+ else
+ mask |= -(ibuf[i]+1);
+
+ k = sizeof(I64) ;
+ mask >>= 7;
+ for (d = 1; d < k; d++)
+ { if (mask == 0)
+ break;
+ mask >>= 8;
+ }
+ *usedBytes = d ;
+
+ z = k - d; // number of 0 bytes
+ if (z == 0) return (char*)&ibuf[1] ;
+
+ if (buf != li->buffer && !li->isUserBuf && (I64) (li->bufSize*sizeof(I64)) < d*len)
+ { if (li->buffer != NULL)
+ free (li->buffer);
+ li->bufSize = ((d*len) / sizeof(I64)) + 1;
+ li->buffer = new (li->bufSize * sizeof(I64), void);
+ }
+
+ y = li->buffer ;
+ buf += sizeof(I64) ; --len ; // don't record the first element of buf, which is not a diff
+ if (vf->isBig) // copy d bytes per I64, ignoring z before or after depending on isBig
+ while (len--)
+ { buf += z;
+ for (k = 0; k < d; k++)
+ *y++ = *buf++;
+ }
+ else
+ while (len--)
+ { for (k = 0; k < d; k++)
+ *y++ = *buf++;
+ buf += z;
+ }
+
+ return li->buffer ;
+}
+
+static void decompactIntList (OneFile *vf, I64 len, char *buf, int usedBytes)
+{ int d, z, k;
+ char *s, *t;
+
+ z = sizeof(I64) - usedBytes ;
+
+ if (z > 0) // decompacts in place
+ { buf += sizeof(I64) ; --len ; // don't decompact 0th element
+ d = usedBytes;
+ s = buf + d*len;
+ t = s + z*len;
+ if (vf->isBig)
+ while (s > buf)
+ { for (k = 0; k < d; k++)
+ *--t = *--s;
+ if (*s & 0x80)
+ for (k = 0; k < z; k++)
+ *--t = 0xff;
+ else
+ for (k = 0; k < z; k++)
+ *--t = 0x0;
+ }
+ else
+ while (s > buf)
+ { if (s[-1] & 0x80)
+ for (k = 0; k < z; k++)
+ *--t = 0xff;
+ else
+ for (k = 0; k < z; k++)
+ *--t = 0;
+ for (k = 0; k < d; k++)
+ *--t = *--s;
+ }
+ buf -= sizeof(I64) ; ++len ;
+ }
+
+ { I64 i, *x = (I64 *) buf; // revert differencing
+ for (i = 1; i < len; i++)
+ x[i] += x[i-1];
+ }
+}
+
+// read and write compressed fields
+
+static inline int writeCompressedFields (FILE *f, OneField *field, OneInfo *li)
+{
+ int i, n = 0 ;
+
+ for (i = 0 ; i < li->nField ; ++i)
+ switch (li->fieldType[i])
+ {
+ case oneREAL: fwrite (&field[i].r, 8, 1, f) ; n += 8 ; break ;
+ case oneCHAR: putc (field[i].c, f) ; ++n ; break ;
+ default: // includes INT and all the LISTs, which store their length in field as an INT
+ n += ltfWrite (field[i].i, f) ;
+ }
+
+ return n ;
+}
+
+static inline void readCompressedFields (FILE *f, OneField *field, OneInfo *li)
+{
+ int i ;
+
+ for (i = 0 ; i < li->nField ; ++i)
+ switch (li->fieldType[i])
+ {
+ case oneREAL: fread (&field[i].r, 8, 1, f) ; break ;
+ case oneCHAR: field[i].c = fgetc (f) ; break ;
+ default: // includes INT and all the LISTs, which store their length in field as an INT
+ field[i].i = ltfRead (f) ;
+ }
+}
+
+/***********************************************************************************
+ *
+ * ONE_READ_LINE:
+ * Reads the next line and returns false at end of file or on error. The line is
+ * parsed according to its linetype and contents accessed by macros that follow.
+ * The top bit of the first character determines whether the line is binary or ascii
+ *
+ **********************************************************************************/
+
+ // Read a string list, first into new allocs, then into sized line buffer.
+ // Annoyingly inefficient, but we don't use it very much.
+
+static void readStringList(OneFile *vf, char t, I64 len)
+{ int j;
+ I64 totLen, sLen;
+ char **string, *buf;
+
+ totLen = 0;
+ string = new (len, char *);
+ for (j = 0; j < len ; ++j)
+ { sLen = readInt (vf);
+ totLen += sLen;
+ string[j] = new (sLen+1, char);
+ readString (vf, string[j], sLen);
+ }
+
+ updateCountsAndBuffer (vf, t, totLen, len);
+
+ buf = (char *) vf->info[(int) t]->buffer;
+ for (j = 0; j < len ; ++j)
+ { strcpy (buf, string[j]);
+ buf += strlen(buf) + 1;
+ free (string[j]);
+ }
+ free (string);
+}
+
+static bool addProvenance(OneFile *vf, OneProvenance *from, int n) ; // need forward declaration
+
+char oneReadLine (OneFile *vf)
+{ bool isAscii;
+ U8 x;
+ char t;
+ OneInfo *li;
+
+ assert (!vf->isWrite) ;
+ assert (!vf->isFinal) ;
+
+ vf->linePos = 0; // must come before first vfGetc()
+ x = vfGetc (vf); // read first char
+ if (feof (vf->f) || x == '\n') // blank line (x=='\n') is end of records marker before footer
+ { vf->lineType = 0 ; // additional marker of end of file
+ return 0;
+ }
+
+ vf->line += 1; // otherwise assume this is a good line, and die if not
+ if (x & 0x80)
+ { isAscii = false;
+ t = vf->binaryTypeUnpack[x];
+ }
+ else
+ { isAscii = true;
+ t = x;
+ }
+ vf->lineType = t;
+
+ li = vf->info[(int) t];
+ if (li == NULL)
+ parseError (vf, "unknown line type %c(%d was %d) line %d", t, t, x, (int)vf->line);
+ li->accum.count += 1;
+ if (t == vf->objectType)
+ vf->object += 1;
+ if (t == vf->groupType)
+ updateGroupCount (vf, true);
+
+ // fprintf (stderr, "reading line %" PRId64 " type %c nField %d listElt %d\n", vf->line, t, li->nField, li->listEltSize) ;
+
+ if (vf->info['/']->bufSize) // clear the comment buffer
+ *(char*)(vf->info['/']->buffer) = 0 ;
+
+ vf->nBits = 0 ; // will use for any compressed data read in
+
+ if (isAscii) // read field by field according to ascii spec
+ { int i, j;
+ I64 *ilst, len;
+ double *rlst;
+
+ for (i = 0; i < li->nField; i++)
+ switch (li->fieldType[i])
+ {
+ case oneINT:
+ vf->field[i].i = readInt (vf);
+ // printf (" field %d int %d\n", i, (int)oneInt(vf,i)) ;
+ break;
+ case oneREAL:
+ vf->field[i].r = readReal (vf);
+ break;
+ case oneCHAR:
+ vf->field[i].c = readChar (vf);
+ // printf (" field %d char %c\n", i, (int)oneChar(vf,i)) ;
+ break;
+ case oneSTRING:
+ case oneDNA:
+ len = readInt (vf);
+ vf->field[i].len = len;
+ updateCountsAndBuffer (vf, t, len, 1);
+ readString (vf, (char*) li->buffer, len);
+ break;
+ case oneINT_LIST:
+ len = readInt (vf);
+ vf->field[i].len = len;
+ updateCountsAndBuffer (vf, t, len, 0);
+ ilst = (I64 *) li->buffer;
+ for (j = 0; j < len; ++j)
+ ilst[j] = readInt(vf);
+ break;
+ case oneREAL_LIST:
+ len = readInt (vf);
+ vf->field[i].len = len;
+ updateCountsAndBuffer (vf, t, len, 0);
+ rlst = (double *) li->buffer;
+ for (j = 0; j < len; ++j)
+ rlst[j] = readReal (vf);
+ break;
+ case oneSTRING_LIST: // STRING_LIST - inefficient for now - also used for binary
+ len = readInt (vf);
+ vf->field[i].len = len;
+ // printf (" field %d string list len %d\n", i, (int)oneLen(vf)) ;
+ readStringList (vf, t, len);
+ break;
+ }
+ readFlush (vf);
+ }
+
+ else // binary - block read fields and list, potentially compressed
+ {
+ // read the fields
+
+ if (li->nField > 0)
+ readCompressedFields (vf->f, vf->field, li) ;
+
+ if (t == vf->groupType) // must follow reading the fields
+ { I64 *groupIndex = (I64 *) vf->info['*']->buffer;
+ oneInt(vf,0) = groupIndex[vf->group] - groupIndex[vf->group-1];
+ }
+
+ // read the list if there is one
+
+ if (li->listEltSize > 0)
+ { I64 listLen = oneLen(vf);
+
+ if (listLen > 0)
+ { li->accum.total += listLen;
+ if (listLen > li->accum.max)
+ li->accum.max = listLen;
+
+ if (li->fieldType[li->listField] == oneINT_LIST)
+ { *(I64*)li->buffer = ltfRead (vf->f) ;
+ if (listLen == 1) goto doneLine ;
+ vf->intListBytes = getc(vf->f) ;
+ }
+
+ if (li->fieldType[li->listField] == oneSTRING_LIST) // handle as ASCII
+ readStringList (vf, t, listLen);
+ else if (x & 0x1) // list is compressed
+ { vf->nBits = ltfRead (vf->f) ;
+ if (fread (vf->codecBuf, ((vf->nBits+7) >> 3), 1, vf->f) != 1)
+ die ("ONE read error: fail to read compressed list");
+ }
+ else if (li->fieldType[li->listField] == oneINT_LIST)
+ { I64 listSize = (listLen-1) * vf->intListBytes ;
+ if ((I64) fread (&(((I64*)li->buffer)[1]), 1, listSize, vf->f) != listSize)
+ die ("ONE read error: failed to read list size %" PRId64 "", listSize);
+ decompactIntList (vf, listLen, li->buffer, vf->intListBytes);
+ }
+ else
+ { I64 listSize = listLen * li->listEltSize ;
+ if ((I64) fread (li->buffer, 1, listSize, vf->f) != listSize)
+ die ("ONE read error: failed to read list size %" PRId64 "", listSize);
+ }
+ }
+
+ if (li->fieldType[li->listField] == oneSTRING)
+ ((char *) li->buffer)[listLen] = '\0'; // 0 terminate
+ }
+
+ doneLine:
+
+ { U8 peek = getc(vf->f) ; // check if next line is a comment - if so then read it
+ ungetc(peek, vf->f) ;
+ if (peek & 0x80)
+ peek = vf->binaryTypeUnpack[peek];
+ if (peek == '/') // a comment
+ { OneField keepField0 = vf->field[0] ;
+ oneReadLine (vf) ; // read comment line into vf->info['/']->buffer
+ vf->lineType = t ;
+ vf->field[0] = keepField0 ;
+ }
+ }
+ }
+
+ return t;
+}
+
+char *oneReadComment (OneFile *vf)
+{ char *comment = (char*)(vf->info['/']->buffer) ;
+
+ if (comment && *comment != 0)
+ return comment ;
+ else
+ return 0 ;
+}
+
+void *_oneList (OneFile *vf)
+{
+ OneInfo *li = vf->info[(int) vf->lineType] ;
+
+ if (vf->nBits)
+ { if (li->fieldType[li->listField] == oneINT_LIST) // first elt is already in buffer
+ { vcDecode (li->listCodec, vf->nBits, vf->codecBuf, (char*)&(((I64*)li->buffer)[1])) ;
+ decompactIntList (vf, oneLen(vf), li->buffer, vf->intListBytes) ;
+ }
+ else
+ vcDecode (li->listCodec, vf->nBits, vf->codecBuf, li->buffer) ;
+ vf->nBits = 0 ; // so we don't do it again
+ }
+
+ return li->buffer ;
+}
+
+void *_oneCompressedList (OneFile *vf)
+{
+ OneInfo *li = vf->info[(int) vf->lineType] ;
+
+ if (!vf->nBits && oneLen(vf) > 0) // need to compress
+ vcEncode (li->listCodec, oneLen(vf), vf->info[(int) vf->lineType]->buffer, vf->codecBuf);
+
+ return (void*) vf->codecBuf ;
+}
+
+/***********************************************************************************
+ *
+ * ONE_FILE_OPEN_READ:
+ * Opens file for reading and reads all lines of header if it exists.
+ * If there is no header then fileType must be given (i.e. non-zero),
+ * otherwise if fileType is non-zero then it must match the type in the header.
+ * If the header contains a $-line then it is binary and the routine reads
+ * the footer that includes decompressors and indexes.
+ *
+ **********************************************************************************/
+
+OneFile *oneFileOpenRead (const char *path, OneSchema *vs, char *fileType, int nthreads)
+{
+ OneFile *vf ;
+ off_t startOff = 0, footOff;
+ OneSchema *vs0 = vs ;
+ bool isDynamic = false ; // if we are making the schema from the header
+
+ assert (fileType == NULL || strlen(fileType) > 0) ;
+
+ // first open the file, read first header line if it exists, and create the OneFile object
+
+ { FILE *f ;
+ char *name ;
+ int curLine = 0 ;
+ U8 c ;
+
+ if (strcmp (path, "-") == 0)
+ f = stdin;
+ else
+ { f = fopen (path, "r");
+ if (f == NULL)
+ return NULL;
+ }
+
+#define OPEN_ERROR1(x) { fprintf (stderr,"ONE file error %s: %s\n", path, x) ; \
+ fclose(f) ; return NULL; }
+#define OPEN_ERROR3(x,y,z) { fprintf (stderr,"ONE file error %s: ", path) ; \
+ fprintf (stderr,x,y,z) ; fprintf (stderr, "\n") ; fclose(f) ; return NULL ; }
+
+ c = getc(f);
+ if (feof(f))
+ OPEN_ERROR1("file is empty") ;
+
+ if (c == '1')
+ { int major, minor, slen;
+
+ if (fscanf (f, " %d", &slen) != 1)
+ OPEN_ERROR1("line 1: failed to read type name length") ;
+ if (slen == 0)
+ OPEN_ERROR1("line 1: type name is empty string") ;
+ name = new0 (slen+1, char);
+ if (fscanf (f, " %s %d %d", name, &major, &minor) != 3)
+ OPEN_ERROR1("line 1: failed to read remainder of line") ;
+ while (getc(f) != '\n')
+ if (feof(f))
+ OPEN_ERROR1("end of file before end of line 1") ;
+ ++curLine ;
+ if (major != MAJOR)
+ OPEN_ERROR3("major version file %d > code %d", major, MAJOR) ;
+ if (minor > MINOR)
+ OPEN_ERROR3("minor version file %d > code %d", minor, MINOR) ;
+ }
+ else
+ { ungetc (c, f) ;
+ if (!fileType)
+ OPEN_ERROR1("attempting to open a file without the type being defined") ;
+ name = new0 (strlen(fileType)+1, char);
+ strcpy (name, fileType) ;
+ }
+
+ if (!vs) // create a shell schema, which can be filled from the header
+ { vs0 = vs = oneSchemaCreateDynamic (name, 0) ;
+ isDynamic = true ;
+ }
+
+ vf = oneFileCreate (&vs, name) ;
+ if (!vf)
+ OPEN_ERROR1("failed to create OneFile object") ;
+ if (fileType && strcmp (fileType, vf->fileType) && strcmp (fileType, vf->subType))
+ { oneFileDestroy (vf) ;
+ OPEN_ERROR3("fileType mismatch file %s != requested %s", vf->fileType, fileType) ;
+ }
+
+ free(name);
+
+ vf->f = f;
+ vf->line = curLine;
+ }
+
+ // read header and (optionally) footer
+ // recognise end of header by peeking at the first char to check if alphabetic
+
+ vf->isCheckString = true; // always check strings while reading header
+ while (true)
+ { U8 peek = getc(vf->f);
+
+ if (feof(vf->f)) // loop exit at end of file
+ break;
+ ungetc(peek, vf->f);
+
+ if (peek & 0x80)
+ peek = vf->binaryTypeUnpack[peek];
+
+ if (isalpha(peek))
+ break; // loop exit at standard data line
+
+ oneReadLine(vf); // can't fail because we checked file eof already
+
+ switch (vf->lineType)
+ {
+ case '1':
+ parseError(vf, "1 should be first line in header");
+ break;
+
+ case '2':
+ if (oneLen(vf) != 3)
+ parseError (vf, "secondary subType must have length 3") ;
+ if (isDynamic)
+ { char *s = oneString(vf);
+ vf->subType = new (oneLen(vf)+1, char);
+ strcpy (vf->subType, s) ;
+ }
+ else
+ { char *sub = oneString(vf) ;
+ int i ;
+ for (i = 0 ; i < vs->nSecondary ; ++i)
+ if (!strcmp (sub, vs->secondary[i])) break ;
+ if (i < vs->nSecondary)
+ { vf->subType = new (strlen(sub)+1, char);
+ strcpy (vf->subType, sub) ;
+ }
+ else
+ parseError (vf, "subtype %s not compatible with primary type %s",
+ sub, vf->fileType);
+ }
+ break;
+
+ case '.': // blank line for spacing and header text
+ { char *text = oneReadComment (vf) ;
+ if (text)
+ { OneHeaderText **t = &vf->headerText ;
+ while (*t) t = &((*t)->nxt) ;
+ *t = new0 (1, OneHeaderText) ;
+ (*t)->text = strdup (text) ;
+ }
+ break ;
+ }
+
+ case '~': // schema definition line
+ { char t = oneChar(vf,1) ;
+ OneInfo *vi = vs->info[(int)t] ;
+ if (vi)
+ { if (!infoCheckFields (vi, vf))
+ { fprintf (stderr, "ONE file error %s: schema mismatch line %" PRId64 " linetype %c\n",
+ path, vf->line, t) ;
+ oneFileDestroy (vf) ;
+ return NULL ;
+ }
+ }
+ else if (isDynamic)
+ { int oldMax = vf->nFieldMax ;
+ schemaAddInfoFromLine (vs, vf, t, oneChar(vf,0)) ;
+ if (oneChar(vf,0) == 'G') vf->groupType = vs->groupType ;
+ if (oneChar(vf,0) == 'O') vf->objectType = vs->objectType ;
+ vi = vs->info[(int)t] ;
+ vf->info[(int)t] = infoDeepCopy (vi) ;
+ if (vi->binaryTypePack)
+ { U8 x = vi->binaryTypePack ;
+ vf->binaryTypeUnpack[x] = t ;
+ vf->binaryTypeUnpack[x+1] = t ;
+ }
+ if (vs->nFieldMax > oldMax)
+ { free (vf->field) ;
+ vf->nFieldMax = vs->nFieldMax ;
+ vf->field = new (vf->nFieldMax, OneField) ;
+ }
+ }
+ }
+ break ;
+
+ case '#': // count information
+ case '@':
+ case '+':
+ case '%':
+ { char c = oneChar(vf,0);
+ OneInfo *li = vf->info[(int) c];
+
+ if (li == NULL)
+ parseError (vf, "unknown line type %c", c);
+ switch (vf->lineType)
+ { case '#':
+ li->given.count = oneInt(vf,1);
+ if (c == vf->objectType && vf->isBinary) // allocate space for object index
+ { vf->info['&']->bufSize = li->given.count;
+ vf->info['&']->buffer = new (li->given.count, I64);
+ }
+ if (c == vf->groupType && vf->isBinary) // allocate space for group index
+ { vf->info['*']->bufSize = li->given.count+1; // +1 for end value
+ vf->info['*']->buffer = new (vf->info['*']->bufSize, I64);
+ }
+ break;
+ case '@':
+ li->given.max = oneInt(vf,1);
+ li->bufSize = li->given.max + 1; // allow for string terminators
+ li->buffer = new (li->bufSize*li->listEltSize, void);
+ break;
+ case '+':
+ li->given.total = oneInt(vf,1);
+ break;
+ case '%':
+ c = oneChar(vf,2);
+ li = vf->info[(int) c];
+ if (li == NULL)
+ parseError (vf, "unknown line type %c", c);
+ c = oneChar(vf,1);
+ if (c == '#')
+ li->given.groupCount = oneInt(vf,3);
+ else if (c == '+')
+ li->given.groupTotal = oneInt(vf,3);
+ else
+ parseError (vf, "unrecognised symbol %c", c);
+ break;
+ }
+ }
+ break;
+
+ case '!': // NB need to copy the strings
+ { OneProvenance p ;
+ p.program = oneString(vf) ;
+ p.version = p.program + strlen(p.program) + 1 ;
+ p.command = p.version + strlen(p.version) + 1 ;
+ p.date = p.command + strlen(p.command) + 1 ;
+ vf->info['!']->accum.count -= 1; // to avoid double counting
+ addProvenance (vf, &p, 1) ;
+ }
+ break;
+
+ case '<':
+ vf->info['<']->accum.count -= 1; // to avoid double counting
+ oneAddReference (vf, oneString(vf), oneInt(vf,1));
+ break;
+
+ case '>':
+ vf->info['>']->accum.count -= 1; // to avoid double counting
+ oneAddDeferred (vf, oneString(vf));
+ break;
+
+ // Below here are binary file header types - requires given.count/given.max first
+
+ case '$': // read footer - goto end, find offset to start of footer and go there
+ if (oneInt(vf,0) != vf->isBig)
+ die ("ONE file error: endian mismatch - convert file to ascii");
+ vf->isBinary = true;
+
+ startOff = ftello (vf->f);
+ if (fseek (vf->f, -sizeof(off_t), SEEK_END) != 0)
+ die ("ONE file error: can't seek to final line");
+
+ if (fread (&footOff, sizeof(off_t), 1, vf->f) != 1)
+ die ("ONE file error: can't read footer offset");
+
+ if (fseeko (vf->f, footOff, SEEK_SET) != 0)
+ die ("ONE file error: can't seek to start of footer");
+ break;
+
+ case '^': // end of footer - return to where we jumped from header
+ if (fseeko (vf->f, startOff, SEEK_SET) != 0)
+ die ("ONE file error: can't seek back");
+ break;
+
+ case '&':
+ vf->isIndexIn = true;
+ break;
+
+ case '*':
+ break;
+
+ case ';':
+ vf->info[(int) oneChar(vf,0)]->listCodec = vcDeserialize (oneString(vf));
+ break;
+
+ default:
+ parseError (vf, "unknown header line type %c", vf->lineType);
+ break;
+ }
+ }
+ vf->isCheckString = false; // user can set this back to true if they wish
+
+ if (!vf->objectType) // failed to get a schema from function call or from file
+ { fprintf (stderr, "ONEfile error %s: no schema available\n", path) ;
+ oneFileDestroy (vf) ;
+ return NULL ;
+ }
+
+ // allocate codec buffer - always allocate enough to handle fields of all line types
+
+ { I64 size = vf->nFieldMax * sizeof(OneField) ;
+ int i ;
+
+ for (i = 0; i < 128; ++i)
+ if (vf->info[i])
+ { OneInfo *li = vf->info[i];
+ if (li->listCodec && size < li->given.max * li->listEltSize)
+ size = li->given.max * li->listEltSize;
+ }
+ vf->codecBufSize = size+1;
+ vf->codecBuf = new (vf->codecBufSize, void); // add one for worst case codec usage
+ }
+
+ // if parallel, allocate a OneFile array for parallel thread objects, switch vf to head of array
+
+ if (nthreads > 1)
+ { int i, j ;
+
+ if (strcmp (path, "-") == 0)
+ die ("ONE error: parallel input incompatible with stdin as input");
+
+ { OneFile *vf0 = vf ;
+ vf = new (nthreads, OneFile);
+ vf[0] = *vf0 ;
+ vf->share = nthreads ;
+ free (vf0) ; // NB free() not oneFileDestroy because don't want deep destroy
+ }
+
+ startOff = ftello (vf->f) ;
+ for (i = 1; i < nthreads; i++)
+ { vs = vs0 ; // needed because vs will have changed to map to the relevant page
+ OneFile *v = oneFileCreate(&vs, vf->fileType); // need to do this after header is read
+ vf[i] = *v ;
+ free (v) ;
+ v = vf+i;
+
+ v->share = -i ; // so this slave knows its own identity
+
+ v->f = fopen (path, "r") ; // need an independent file handle
+ if (fseeko (v->f, startOff, SEEK_SET) != 0)
+ die ("ONE file error: can't seek to start of data");
+
+ for (j = 0; j < 128; j++)
+ { OneInfo *li = v->info[j];
+ if (li != NULL)
+ { OneInfo *l0 = vf->info[j];
+ if (li->listCodec) vcDestroy (li->listCodec) ;
+ li->listCodec = l0->listCodec;
+ if (li->listEltSize > 0)
+ { li->bufSize = l0->bufSize;
+ if (li->buffer) free (li->buffer) ;
+ li->buffer = new (l0->bufSize*l0->listEltSize, void);
+ }
+ li->given = l0->given;
+ }
+ }
+
+ v->codecBufSize = vf->codecBufSize;
+ if (v->codecBuf) free (v->codecBuf) ;
+ v->codecBuf = new (v->codecBufSize, void);
+
+ v->info['&']->listCodec = 0 ;
+ infoDestroy (v->info['&']);
+ v->info['*']->listCodec = 0 ;
+ infoDestroy (v->info['*']);
+ v->info['&'] = vf->info['&'];
+ v->info['*'] = vf->info['*'];
+
+ v->isIndexIn = vf->isIndexIn;
+ if (vf->subType != NULL)
+ { v->subType = new (strlen(vf->subType)+1, char);
+ strcpy (v->subType, vf->subType) ;
+ }
+ else
+ v->subType = NULL;
+ }
+ } // end of parallel threads block
+
+ if (isDynamic)
+ oneSchemaDestroy (vs0) ;
+
+ return vf;
+}
+
+/***********************************************************************************
+ *
+ * ONE_USER_BUFFER / CLOSE / GOTO
+ *
+ **********************************************************************************/
+
+ // This lets the user reassign the buffer that lists in a particular line type are read into.
+ // If this is not set, a default buffer is provided. If buffer == NULL then the package
+ // reverts to the default buffer. This routine can be called repeatedly.
+ // NB the package doesn't check the size of a user supplied buffer - the user must allocate
+ // enough memory for all forthcoming list data.
+
+void oneUserBuffer (OneFile *vf, char lineType, void *buffer)
+{ OneInfo *li;
+
+ li = vf->info[(int) lineType];
+ if (buffer != NULL)
+ { if ( ! li->isUserBuf && li->buffer != NULL)
+ { free (li->buffer);
+ li->bufSize = 0;
+ }
+ li->buffer = buffer;
+ li->isUserBuf = true;
+ }
+ else
+ { if (li->isUserBuf)
+ { li->bufSize = li->given.max + 1;
+ li->buffer = new (li->given.max*li->listEltSize, void);
+ }
+ li->isUserBuf = false;
+ }
+}
+
+bool oneGotoObject (OneFile *vf, I64 i)
+{ if (vf != NULL && vf->isIndexIn && vf->objectType)
+ if (0 <= i && i < vf->info[(int) vf->objectType]->given.count)
+ if (fseek (vf->f, ((I64 *) vf->info['&']->buffer)[i], SEEK_SET) == 0)
+ { vf->object = i;
+ return true ;
+ }
+ return false ;
+}
+
+I64 oneGotoGroup (OneFile *vf, I64 i)
+{ if (vf != NULL && vf->isIndexIn && vf->groupType)
+ if (0 <= i && i < vf->info[(int) vf->groupType]->given.count)
+ { I64 *groupIndex = (I64 *) vf->info['*']->buffer;
+ if (!oneGotoObject(vf,groupIndex[i]))
+ return 0 ;
+ return (groupIndex[i+1] - groupIndex[i]);
+ }
+ return 0 ;
+}
+
+/***********************************************************************************
+ *
+ * ONE_OPEN_WRITE_(NEW | FROM)
+ *
+ **********************************************************************************/
+
+OneFile *oneFileOpenWriteNew (const char *path, OneSchema *vs, char *fileType,
+ bool isBinary, int nthreads)
+{ OneFile *vf ;
+ FILE *f ;
+ OneSchema *vs0 = vs ;
+
+ if (strcmp (path, "-") == 0)
+ f = stdout;
+ else
+ { f = fopen (path, "w");
+ if (f == NULL)
+ return NULL ;
+ }
+
+ vf = oneFileCreate (&vs, fileType) ;
+ if (!vf)
+ return NULL ;
+
+ vf->f = f;
+ vf->isWrite = true;
+ vf->isBinary = isBinary;
+ vf->isLastLineBinary = true; // we don't want to add a newline before the first true line
+
+ vf->codecBufSize = vf->nFieldMax*sizeof(OneField) + 1;
+ vf->codecBuf = new (vf->codecBufSize, void);
+
+ if (nthreads > 1)
+ { OneFile *v, *vf0 = vf ;
+ int i ;
+ char name[100] ;
+ int pid = getpid() ;
+
+ vf->share = nthreads ;
+ vf->fieldLock = mutexInit;
+ vf->listLock = mutexInit;
+ vf = new (nthreads, OneFile);
+ vf[0] = *vf0 ;
+ free (vf0) ; // NB free() not oneFileDestroy because don't want deep destroy
+
+ for (i = 1; i < nthreads; i++)
+ { vs = vs0 ; // needed because vs will have changed in prevous oneFileCreate call
+ v = oneFileCreate (&vs, fileType);
+
+ v->isWrite = true;
+ v->isBinary = isBinary;
+ v->isLastLineBinary = isBinary;
+
+ v->codecBufSize = vf->codecBufSize;
+ v->codecBuf = new (v->codecBufSize, void);
+ v->codecTrainingSize /= 3*nthreads;
+ v->share = -i;
+
+ sprintf(name,".part.%d.%d",pid,i);
+ f = fopen (name, "w");
+ if (f == NULL)
+ die ("ONE file error: cannot create temporary file %d for parallel write", i);
+ v->f = f;
+
+ vf[i] = *v;
+ free (v);
+ }
+ }
+
+ return vf;
+}
+
+static inline void infoCopy (OneSchema *vs, OneFile *vfIn, char t, char type)
+{
+ OneInfo *vi = vfIn->info[(int)t] ;
+ schemaAddInfoFromArray (vs, vi->nField, vi->fieldType, t, type) ;
+ if (vi->comment) vs->info[(int)t]->comment = strdup (vi->comment) ;
+}
+
+OneFile *oneFileOpenWriteFrom (const char *path, OneFile *vfIn, bool isBinary, int nthreads)
+{
+ // first build a schema from vfIn
+ OneSchema *vs0 = oneSchemaCreateDynamic (vfIn->fileType, vfIn->subType) ;
+ OneSchema *vs = vs0->nxt ; // this is the actual schema - vs0 is for the header
+
+ if (vfIn->groupType) infoCopy (vs, vfIn, vfIn->groupType, 'G') ; // first the group
+ infoCopy (vs, vfIn, vfIn->objectType, 'O') ; // next the object
+ int i ; // then the rest of the record lines
+ for (i = 'A' ; i <= 'z' ; ++i)
+ if (isalpha(i) && vfIn->info[i] && i != vfIn->groupType && i != vfIn->objectType)
+ infoCopy (vs, vfIn, (char)i, 'D') ;
+ // use it to open the file
+ OneFile *vf = oneFileOpenWriteNew (path, vs0, vfIn->subType ? vfIn->subType : vfIn->fileType,
+ isBinary, nthreads);
+ oneSchemaDestroy (vs0) ;
+ if (!vf)
+ return NULL ;
+
+ oneInheritProvenance (vf, vfIn);
+ oneInheritReference (vf, vfIn);
+ oneInheritDeferred (vf, vfIn);
+
+ // set info[]->given, and resize codecBuf accordingly
+ I64 size = vf->codecBufSize;
+ for (i = 0; i < 128 ; ++i)
+ if (vf->info[i])
+ { OneInfo *vi = vf->info[i];
+ vi->given = vfIn->info[i]->given ;
+ if (vi->listCodec)
+ { I64 sz = vi->given.max * vi->listEltSize;
+ if (sz >= size)
+ size = sz+1;
+ }
+ }
+ if (size > vf->codecBufSize)
+ for (i = 0 ; i < nthreads ; ++i)
+ { OneFile *v = &(vf[i]) ;
+ if (v->codecBuf) free (v->codecBuf) ;
+ v->codecBufSize = size;
+ v->codecBuf = new (size, void);
+ }
+
+ return vf ;
+}
+
+bool oneFileCheckSchema (OneFile *vf, char *textSchema)
+{
+ char *fixedText = schemaFixNewlines (textSchema) ;
+ OneSchema *vs = oneSchemaCreateFromText (fixedText) ;
+ free (fixedText) ;
+ OneSchema *vs0 = vs ; // need to keep the root to destroy the full schema
+
+ if (vs->nxt) // the textSchema contained at least one 'P' line to define a file type
+ { while (vs && strcmp (vs->primary, vf->fileType)) vs = vs->nxt ;
+ if (!vs)
+ { fprintf (stderr, "OneSchema mismatch: file type %s not found in schema\n",
+ vf->fileType) ;
+ oneSchemaDestroy (vs0) ;
+ return false ;
+ }
+ }
+
+ bool isMatch = true ;
+ int i, j ;
+
+ for (i = 'A' ; i <= 'z' ; ++i)
+ if (vs->info[i])
+ { OneInfo *vis = vs->info[i] ;
+ OneInfo *vif = vf->info[i] ;
+ if (!vif)
+ { fprintf (stderr, "OneSchema mismatch: record type %c missing in file schema\n", i) ;
+ isMatch = false ;
+ }
+ else if (vif->nField != vis->nField)
+ { fprintf (stderr, "OneSchema mismatch: number of fields for type %c file %d != %d\n",
+ i, vif->nField, vis->nField) ;
+ isMatch = false ;
+ }
+ else
+ for (j = 0 ; j < vif->nField ; ++j)
+ if (vif->fieldType[j] != vis->fieldType[j])
+ { fprintf (stderr, "OneSchema mismatch: field %d for type %c file %s != %s\n",
+ j,i,oneTypeString[vif->fieldType[j]],oneTypeString[vis->fieldType[j]]);
+ isMatch = false ;
+ }
+ }
+
+ oneSchemaDestroy (vs0) ;
+ return isMatch ;
+}
+
+/***********************************************************************************
+ *
+ * SETTING UP PROVENANCE, REFERENCES, & DEFERRALS
+ *
+ **********************************************************************************/
+
+static bool addProvenance(OneFile *vf, OneProvenance *from, int n)
+{ I64 i ;
+ OneInfo *l = vf->info['!'];
+ I64 o = l->accum.count;
+ OneProvenance *p;
+
+ if (n == 0)
+ return (false);
+ assert (!vf->isHeaderOut) ;
+
+ l->accum.count += n;
+
+ p = new(o+n, OneProvenance);
+ if (o > 0)
+ memcpy (p, vf->provenance, o*sizeof(OneProvenance));
+ memcpy (p+o, from, n*sizeof(OneProvenance));
+ free (vf->provenance);
+ vf->provenance = p;
+
+ // finally create self-owned copy of all fields
+
+ p = p+o ;
+ for (i = 0 ; i < n ; ++i, ++p)
+ { p->program = strdup(p->program) ;
+ p->version = strdup(p->version) ;
+ p->command = strdup(p->command) ;
+ p->date = strdup(p->date) ;
+ }
+
+ return (true);
+}
+
+bool oneInheritProvenance(OneFile *vf, OneFile *source)
+{ return (addProvenance(vf, source->provenance, source->info['!']->accum.count)); }
+
+bool oneAddProvenance(OneFile *vf, char *prog, char *version, char *format, ...)
+{ va_list args ;
+ OneProvenance p;
+ time_t t = time(NULL);
+
+ p.program = prog;
+ p.version = version;
+ va_start (args, format) ; vasprintf (&p.command, format, args) ; va_end (args) ;
+ p.date = new (20, char);
+ strftime(p.date, 20, "%F_%T", localtime(&t));
+ addProvenance (vf, &p, 1);
+ free (p.command) ;
+ free (p.date) ;
+ return true ; // always added something
+}
+
+static bool addReference(OneFile *vf, OneReference *from, int n, bool isDeferred)
+{ I64 o;
+ OneInfo *l;
+ OneReference *r, **t;
+ I64 i ;
+
+ if (n == 0)
+ return false;
+ assert (!vf->isHeaderOut) ;
+
+ if (isDeferred)
+ { l = vf->info['>'];
+ t = &(vf->deferred);
+ }
+ else
+ { l = vf->info['<'];
+ t = &(vf->reference);
+ }
+ o = l->accum.count;
+ l->accum.count += n;
+
+ r = new (o+n, OneReference);
+ if (o > 0)
+ memcpy (r, *t, o*sizeof(OneReference));
+ memcpy (r+o, from, n*sizeof(OneReference));
+ free (*t);
+ *t = r;
+
+ r += o ; // make self-owned copy of filename strings
+ for (i = 0 ; i < n ; ++i, ++r)
+ r->filename = strdup (r->filename) ;
+
+ return true;
+}
+
+bool oneInheritReference(OneFile *vf, OneFile *source)
+{ return (addReference(vf, source->reference, source->info['<']->accum.count, false)); }
+
+bool oneAddReference(OneFile *vf, char *filename, I64 count)
+{ OneReference ref;
+ ref.filename = filename;
+ ref.count = count;
+ return (addReference(vf, &ref, 1, false));
+}
+
+bool oneInheritDeferred (OneFile *vf, OneFile *source)
+{ return (addReference (vf, source->deferred, source->info['>']->accum.count, true)); }
+
+bool oneAddDeferred (OneFile *vf, char *filename)
+{ OneReference ref;
+ ref.filename = filename;
+ return (addReference (vf, &ref, 1, true));
+}
+
+/***********************************************************************************
+ *
+ * ONE_WRITE_HEADER / FOOTER
+ *
+ **********************************************************************************/
+
+static void writeInfoSpec (OneFile *vf, char ci)
+{
+ int i ;
+ OneInfo *vi = vf->info[(int) ci] ;
+
+ if (ci == vf->groupType)
+ fprintf (vf->f, "\n~ G %c %d", ci, vi->nField) ;
+ else if (ci == vf->objectType)
+ fprintf (vf->f, "\n~ O %c %d", ci, vi->nField) ;
+ else
+ fprintf (vf->f, "\n~ D %c %d", ci, vi->nField) ;
+ for (i = 0 ; i < vi->nField ; ++i)
+ fprintf (vf->f, " %d %s",
+ (int)strlen(oneTypeString[vi->fieldType[i]]), oneTypeString[vi->fieldType[i]]) ;
+ if (vi->comment)
+ oneWriteComment (vf, "%s", vi->comment) ;
+}
+
+static void writeHeader (OneFile *vf)
+{ int i,n;
+ OneInfo *li;
+
+ assert (vf->isWrite) ;
+ assert (vf->line == 0) ;
+ assert (vf->share >= 0) ;
+
+ vf->isLastLineBinary = false; // header is in ASCII
+
+ fprintf (vf->f, "1 %lu %s %d %d", strlen(vf->fileType), vf->fileType, MAJOR, MINOR);
+ vf->line += 1;
+ if (vf->subType)
+ { fprintf (vf->f, "\n2 %lu %s", strlen(vf->subType), vf->subType);
+ vf->line += 1;
+ }
+
+ // provenance
+ if (vf->info['!']->accum.count)
+ { OneProvenance *p = vf->provenance;
+ n = vf->info['!']->accum.count;
+ for (i = 0; i < n; i++, p++)
+ { fprintf (vf->f, "\n! 4 %lu %s %lu %s %lu %s %lu %s",
+ strlen(p->program), p->program, strlen(p->version), p->version,
+ strlen(p->command), p->command, strlen(p->date), p->date);
+ vf->line += 1;
+ }
+ }
+
+ fprintf (vf->f, "\n.") ; // always have a spacer after this
+
+ // reference and deferred
+ if (vf->info['<']->accum.count || vf->info['>']->accum.count)
+ { OneReference *r = vf->reference;
+ n = vf->info['<']->accum.count;
+ for (i = 0; i < n; i++, r++)
+ { fprintf (vf->f, "\n< %lu %s %" PRId64 "", strlen(r->filename), r->filename, r->count);
+ vf->line += 1;
+ }
+
+ r = vf->deferred;
+ n = vf->info['>']->accum.count;
+ for (i = 0; i < n; i++, r++)
+ { fprintf (vf->f, "\n> %lu %s", strlen(r->filename), r->filename);
+ vf->line += 1;
+ }
+ fprintf (vf->f, "\n.") ;
+ }
+
+ // write the schema into the header - no need for file type, version etc. since already given
+ if (vf->groupType) writeInfoSpec (vf, vf->groupType) ;
+ if (vf->objectType) writeInfoSpec (vf, vf->objectType) ;
+ for (i = 'A' ; i <= 'z' ; ++i)
+ if (isalnum(i) && vf->info[i] && i != vf->objectType && i != vf->groupType)
+ writeInfoSpec (vf, i) ;
+
+ // any header text on '.' lines
+ if (vf->headerText)
+ { OneHeaderText *t = vf->headerText ;
+ while (t)
+ { fprintf (vf->f, "\n. %s", t->text) ;
+ t = t->nxt ;
+ }
+ fprintf (vf->f, "\n.") ;
+ }
+
+ if (vf->isBinary) // defer writing rest of header
+ { fprintf (vf->f, "\n$ %d", vf->isBig);
+ vf->line += 1;
+ }
+ else // write counts based on those supplied in input header
+ { fprintf (vf->f, "\n.") ;
+ bool isCountWritten = false ;
+ for (i = 'A'; i <= 'Z'+1 ; i++)
+ { if (i == 'Z'+1)
+ { if (vf->groupType) // NB group types are all lower case so > 'Z'+1
+ i = vf->groupType ;
+ else
+ break ;
+ }
+ li = vf->info[i];
+ if (li != NULL && li->given.count > 0)
+ { isCountWritten = true ;
+ fprintf (vf->f, "\n# %c %" PRId64 "", i, li->given.count);
+ vf->line += 1;
+ if (li->given.max > 0)
+ { fprintf (vf->f, "\n@ %c %" PRId64 "", i, li->given.max);
+ vf->line += 1;
+ }
+ if (li->given.total > 0)
+ { fprintf (vf->f, "\n+ %c %" PRId64 "", i, li->given.total);
+ vf->line += 1;
+ }
+ if (li->given.groupCount > 0)
+ { fprintf (vf->f, "\n%% %c # %c %" PRId64 "", vf->groupType, i, li->given.groupCount);
+ vf->line += 1;
+ }
+ if (li->given.groupTotal > 0)
+ { fprintf (vf->f, "\n%% %c + %c %" PRId64 "", vf->groupType, i, li->given.groupTotal);
+ vf->line += 1;
+ }
+ }
+ }
+ if (isCountWritten)
+ fprintf (vf->f, "\n.") ;
+ }
+ fflush (vf->f);
+
+ vf->isHeaderOut = true;
+}
+
+/***********************************************************************************
+ *
+ * ONE_WRITE_LINE
+ *
+ **********************************************************************************/
+
+static int writeStringList (OneFile *vf, char t, int len, char *buf)
+{ OneInfo *li;
+ int j, nByteWritten = 0;
+ I64 sLen, totLen;
+
+ totLen = 0;
+ for (j = 0; j < len; j++)
+ { sLen = strlen (buf);
+ totLen += sLen;
+ nByteWritten += fprintf (vf->f, " %" PRId64 " %s", sLen, buf);
+ buf += sLen + 1;
+ }
+
+ li = vf->info[(int) t];
+ li->accum.total += totLen;
+ if (li->accum.max < totLen)
+ li->accum.max = totLen;
+
+ return nByteWritten ;
+}
+
+// process is to fill fields by assigning to macros, then call - list contents are in buf
+// NB in ASCII mode adds '\n' before writing line not after, so oneWriteComment() can add to line
+// first call will write initial header
+
+void oneWriteLine (OneFile *vf, char t, I64 listLen, void *listBuf)
+{ I64 i, j;
+ OneInfo *li;
+
+ // fprintf (stderr, "write line %d type %c char %c\n", vf->line, t, oneChar(vf,0)) ;
+
+ assert (vf->isWrite) ;
+ assert (!vf->isFinal || !isalpha(t)) ;
+
+ li = vf->info[(int) t];
+ assert (li) ;
+
+ vf->line += 1;
+ li->accum.count += 1;
+ if (t == vf->groupType) updateGroupCount(vf, true);
+
+ if (li->listEltSize > 0) // need to write the list
+ { assert (listLen >= 0) ;
+ vf->field[li->listField].len = listLen ;
+ if (listBuf == NULL) listBuf = li->buffer;
+ }
+
+ // BINARY - block write and optionally compress
+
+ if (vf->isBinary)
+ { U8 x;
+
+ if (!vf->isHeaderOut && vf->share >= 0) writeHeader (vf) ; // no header on slaves
+
+ if (!vf->isLastLineBinary)
+ { fputc ('\n', vf->f) ;
+ vf->byte = ftello (vf->f) ;
+ }
+
+ if (t == vf->objectType) // update index and increment object count
+ { OneInfo *lx = vf->info['&'];
+
+ if (vf->object >= lx->bufSize) // first ensure enough space
+ { I64 ns = (lx->bufSize << 1) + 0x20000;
+ I64 *nb = new (ns, I64);
+
+ memcpy(nb, lx->buffer, lx->bufSize*sizeof(I64));
+ free (lx->buffer);
+ lx->buffer = nb;
+ lx->bufSize = ns;
+ }
+ ((I64 *) lx->buffer)[vf->object] = vf->byte;
+ // assert (ftello (vf->f) == vf->byte) ; // beware - very costly
+
+ ++vf->object ;
+ }
+ if (t == vf->groupType)
+ { OneInfo *lx = vf->info['*'];
+
+ if (vf->group >= lx->bufSize) // still room for final value because one ahead here
+ { I64 ns, *nb;
+
+ ns = (lx->bufSize << 1) + 0x20000;
+ nb = new (ns, I64);
+ memcpy(nb, lx->buffer, lx->bufSize*sizeof(I64));
+ free (lx->buffer);
+ lx->buffer = nb;
+ lx->bufSize = ns;
+ }
+
+ ((I64 *) lx->buffer)[vf->group-1] = vf->object; // group # already advanced
+ }
+
+ // write the line character
+
+ x = li->binaryTypePack; // Binary line code + compression flags
+ if (li->isUseListCodec)
+ x |= 0x01;
+ fputc (x, vf->f);
+ ++vf->byte ;
+
+ // write the fields
+
+ if (li->nField > 0)
+ vf->byte += writeCompressedFields (vf->f, vf->field, li) ;
+
+ // write the list if there is one
+
+ if (li->listEltSize && listLen > 0)
+ { I64 nBits, listSize;
+ int listBytes ;
+
+ li->accum.total += listLen;
+ if (listLen > li->accum.max)
+ li->accum.max = listLen;
+
+ if (li->fieldType[li->listField] == oneINT_LIST)
+ { vf->byte += ltfWrite (*(I64*)listBuf, vf->f) ;
+ if (listLen == 1) goto doneLine ; // finish writing this line here
+ listBuf = compactIntList (vf, li, listLen, listBuf, &listBytes) ;
+ --listLen ;
+ fputc ((char)listBytes, vf->f) ;
+ vf->byte++ ;
+ }
+ else
+ listBytes = li->listEltSize ;
+ listSize = listLen * listBytes;
+
+ if (li->fieldType[li->listField] == oneSTRING_LIST) // handle as ASCII
+ vf->byte += writeStringList (vf, t, listLen, listBuf);
+ else if (x & 0x1)
+ { if (listSize >= vf->codecBufSize)
+ { free (vf->codecBuf);
+ vf->codecBufSize = listSize+1;
+ vf->codecBuf = new (vf->codecBufSize, void);
+ }
+ nBits = vcEncode (li->listCodec, listSize, listBuf, vf->codecBuf);
+ vf->byte += ltfWrite (nBits, vf->f) ;
+ if (fwrite (vf->codecBuf, ((nBits+7) >> 3), 1, vf->f) != 1)
+ die ("ONE write error: failed to write compressed list");
+ vf->byte += ((nBits+7) >> 3) ;
+ }
+ else
+ { if (fwrite (listBuf, listSize, 1, vf->f) != 1)
+ die ("ONE write error line %" PRId64 ": failed to write list field %d listLen %" PRId64 " listSize %" PRId64 " listBuf %lx",
+ vf->line, li->listField, listLen, listSize, listBuf);
+ vf->byte += listSize;
+ if (li->listCodec != NULL)
+ { vcAddToTable (li->listCodec, listSize, listBuf);
+ li->listTack += listSize;
+
+ if (li->listTack > vf->codecTrainingSize)
+ { if (vf->share == 0)
+ { vcCreateCodec (li->listCodec, 1);
+ li->isUseListCodec = true;
+ }
+ else
+ { OneFile *ms;
+ OneInfo *lx;
+
+ if (vf->share < 0)
+ { ms = vf + vf->share;
+ lx = ms->info[(int) t];
+ }
+ else
+ { ms = vf;
+ lx = li;
+ }
+
+ pthread_mutex_lock(&ms->listLock);
+
+ if ( ! li->isUseListCodec)
+
+ { if (vf->share < 0)
+ { lx->listTack += li->listTack;
+ li->listTack = 0;
+ }
+ if (lx->listTack > ms->codecTrainingSize)
+ { for (i = 1; i < ms->share; i++)
+ vcAddHistogram (lx->listCodec,
+ ms[i].info[(int) t]->listCodec);
+ vcCreateCodec (lx->listCodec, 1);
+ for (i = 1; i < ms->share; i++)
+ { OneCodec *m = ms[i].info[(int) t]->listCodec;
+ ms[i].info[(int) t]->listCodec = lx->listCodec;
+ vcDestroy (m);
+ }
+ lx->isUseListCodec = true;
+ for (i = 1; i < ms->share; i++)
+ ms[i].info[(int) t]->isUseListCodec = true;
+ }
+ }
+
+ pthread_mutex_unlock(&ms->listLock);
+ }
+ }
+ }
+ }
+ }
+
+ doneLine:
+
+ vf->isLastLineBinary = true;
+ }
+
+ // ASCII - write field by field
+
+ else
+ { if (!vf->isHeaderOut && !vf->isNoAsciiHeader) writeHeader (vf) ;
+
+ if (!vf->isLastLineBinary) // terminate previous ascii line
+ fputc ('\n', vf->f);
+
+ fputc (t, vf->f);
+
+ for (i = 0; i < li->nField; i++)
+ switch (li->fieldType[i])
+ {
+ case oneINT:
+ fprintf (vf->f, " %" PRId64 "", vf->field[i].i);
+ break;
+ case oneREAL:
+ fprintf (vf->f, " %f", vf->field[i].r);
+ break;
+ case oneCHAR:
+ fprintf (vf->f, " %c", vf->field[i].c);
+ break;
+ case oneSTRING:
+ case oneDNA:
+ case oneINT_LIST:
+ case oneREAL_LIST:
+ case oneSTRING_LIST:
+ li->accum.total += listLen;
+ if (listLen > li->accum.max)
+ li->accum.max = listLen;
+
+ fprintf (vf->f, " %" PRId64 "", listLen);
+ if (li->fieldType[i] == oneSTRING || li->fieldType[i] == oneDNA)
+ { if (listLen > INT_MAX)
+ die ("ONE write error: string length %" PRId64 " > current max %d", listLen, INT_MAX);
+ fprintf (vf->f, " %.*s", (int) listLen, (char *) listBuf);
+ }
+ else if (li->fieldType[i] == oneINT_LIST)
+ { I64 *b = (I64 *) listBuf;
+ for (j = 0; j < listLen ; ++j)
+ fprintf (vf->f, " %" PRId64 "", b[j]);
+ }
+ else if (li->fieldType[i] == oneREAL_LIST)
+ { double *b = (double *) listBuf;
+ for (j = 0; j < listLen ; ++j)
+ fprintf (vf->f, " %f", b[j]);
+ }
+ else // vSTRING_LIST
+ writeStringList (vf, t, listLen, listBuf);
+ break;
+ }
+ vf->isLastLineBinary = false;
+ }
+}
+
+void oneWriteLineDNA2bit (OneFile *vf, char lineType, I64 listLen, U8 *dnaBuf)
+{ die ("not written yet") ;
+ oneWriteLine (vf, lineType, listLen, dnaBuf) ;
+}
+
+void oneWriteComment (OneFile *vf, char *format, ...)
+{
+ va_list args ;
+
+ if (vf->isCheckString) // then check no newlines in format
+ { char *s = format ;
+ while (*s) if (*s++ == '\n') die ("newline in comment format string: %s", format) ;
+ }
+
+ va_start (args, format) ;
+ if (vf->isLastLineBinary) // write a comment line
+ { char *comment ;
+ vasprintf (&comment, format, args) ;
+ oneWriteLine (vf, '/', strlen(comment), comment) ;
+ free (comment) ;
+ }
+ else // write on same line after space
+ { fputc (' ', vf->f) ;
+ vfprintf (vf->f, format, args) ;
+ }
+ va_end (args) ;
+}
+
+/***********************************************************************************
+ *
+ * MERGING, FOOTER HANDLING, AND CLOSE
+ *
+ **********************************************************************************/
+
+static void oneWriteFooter (OneFile *vf)
+{ int i,n;
+ off_t footOff;
+ OneInfo *li;
+ char *codecBuf ;
+
+ footOff = ftello (vf->f);
+ if (footOff < 0)
+ die ("ONE write error: failed footer ftell");
+
+ // first the per-linetype information
+ codecBuf = new (vcMaxSerialSize()+1, char) ; // +1 for added up unused 0-terminator
+ for (i = 'A'; i <= 'Z'+1 ; i++)
+ { if (i == 'Z'+1)
+ { if (vf->groupType) // NB group types are all lower case so > 'Z'+1
+ i = vf->groupType ;
+ else
+ break ;
+ }
+ li = vf->info[i];
+ if (li != NULL && li->accum.count > 0)
+ { fprintf (vf->f, "# %c %" PRId64 "\n", i, li->accum.count);
+ if (li->listEltSize)
+ { fprintf (vf->f, "@ %c %" PRId64 "\n", i, li->accum.max);
+ fprintf (vf->f, "+ %c %" PRId64 "\n", i, li->accum.total);
+ }
+ if (vf->groupType && i != vf->groupType && vf->group > 0)
+ { fprintf (vf->f, "%% %c # %c %" PRId64 "\n", vf->groupType, i, li->accum.groupCount);
+ if (li->listEltSize)
+ fprintf (vf->f, "%% %c + %c %" PRId64 "\n", vf->groupType, i, li->accum.groupTotal);
+ }
+ if (li->isUseListCodec && li->listCodec != DNAcodec)
+ { oneChar(vf,0) = i;
+ n = vcSerialize (li->listCodec, codecBuf);
+ oneWriteLine (vf, ';', n, codecBuf);
+ }
+ }
+ }
+
+ li = vf->info['/'] ; // may need to write list codec for comments
+ if (li->isUseListCodec)
+ { oneChar(vf,0) = '/' ;
+ n = vcSerialize (li->listCodec, codecBuf);
+ oneWriteLine (vf, ';', n, codecBuf);
+ }
+
+ free (codecBuf) ;
+
+ oneWriteLine (vf, '&', vf->object, NULL); // number of objects in file = length of index
+ // NB NULL here and below for '*' defaults writing info->buffer, which contains the index
+
+ if (vf->groupType > 0 && vf->group > 0)
+ { ((I64 *) vf->info['*']->buffer)[vf->group] = vf->object;
+ oneWriteLine (vf, '*', vf->group+1, NULL); // number of groups in file + 1 = length of index
+ }
+
+ fprintf (vf->f, "^\n"); // end of footer marker
+
+ if (fwrite (&footOff, sizeof(off_t), 1, vf->f) != 1)
+ die ("ONE write error: failed writing footer offset");
+}
+
+ // After all input has been read, or all data has been written, this routine will finish
+ // accumulating counts/statistics for the file and merge thread stats into those for
+ // the master file (if a parallel OneFile).
+
+void oneFinalizeCounts(OneFile *vf)
+{ int i, j, n, k, len;
+ OneInfo *li, *ln;
+
+ if (vf->share < 0)
+ die ("ONE write error: cannot call oneFileClose on a slave OneFile");
+
+ vf->isFinal = true;
+
+ if (vf->share == 0)
+ { updateGroupCount(vf,false);
+ return;
+ }
+
+ len = vf->share;
+
+ // Close current groups at the end of each part (if any)
+
+ if (vf->groupType > 0)
+ for (i = 'A'; i <= 'Z'; i++)
+ if (vf->info[i] != NULL)
+ for (j = 0; j < len; j++)
+ if (vf[j].inGroup)
+ { I64 oc, ot;
+
+ ot = oc = 0;
+ for (k = j+1; k < len; k++)
+ if (vf[k].inGroup)
+ { oc += vf[k].info[i]->oCount;
+ ot += vf[k].info[i]->oTotal;
+ break;
+ }
+ else
+ { oc += vf[k].info[i]->accum.count;
+ ot += vf[k].info[i]->accum.total;
+ }
+
+ li = vf[j].info[i];
+ if ((li->accum.count - li->gCount) + oc > li->accum.groupCount)
+ li->accum.groupCount = (li->accum.count - li->gCount) + oc;
+ if ((li->accum.total - li->gTotal) + ot > li->accum.groupTotal)
+ li->accum.groupTotal = (li->accum.total - li->gTotal) + ot;
+ }
+
+ // first the per-linetype information
+
+ n = vf->groupType;
+ if (n == 0)
+ n = 'Z';
+
+ for (i = 'A'; i <= n; i++)
+ { ln = vf->info[i];
+ for (j = 1; j < len; j++)
+ { li = (vf+j)->info[i];
+ if (li != NULL && li->accum.count > 0)
+ { ln->accum.count += li->accum.count;
+ if (li->accum.max > ln->accum.max)
+ ln->accum.max = li->accum.max;
+ ln->accum.total += li->accum.total;
+ if (li->accum.groupCount > ln->accum.groupCount)
+ ln->accum.groupCount = li->accum.groupCount;
+ if (li->accum.groupTotal > ln->accum.groupTotal)
+ ln->accum.groupTotal = li->accum.groupTotal;
+ }
+ }
+ }
+
+ if ( ! vf->isBinary)
+ return;
+
+ // Stitch the group index together
+
+ if (vf->groupType > 0)
+ { I64 *gb, *gi, off;
+ int ns;
+
+ ns = 0;
+ for (j = 0; j < len; j++)
+ ns += vf[j].group;
+ gb = new (ns+1, I64);
+
+ ns = 0;
+ off = 0;
+ for (j = 0; j < len; j++)
+ { li = vf[j].info['*'];
+ gi = (I64 *) (li->buffer);
+ for (i = 0; i < vf[j].group; i++)
+ gb[ns++] = gi[i] + off;
+ off += vf[j].object;
+ }
+ gb[ns] = off;
+ li = vf->info['*'];
+ free(li->buffer);
+ li->buffer = gb;
+ li->bufSize = ns+1;
+ vf->group = ns;
+ }
+
+ // Stitch the object index together
+
+ { int ns;
+ I64 *gb, *gi, off;
+
+ ns = 0;
+ for (j = 0; j < len; j++)
+ ns += vf[j].object;
+ gb = new (ns, I64);
+
+ ns = 0;
+ off = 0;
+ for (j = 0; j < len; j++)
+ { li = vf[j].info['&'];
+ gi = (I64 *) (li->buffer);
+ for (i = 0; i < vf[j].object; i++)
+ gb[ns++] = gi[i] + off;
+ off += ftello(vf[j].f);
+ }
+
+ li = vf->info['&'];
+ free(li->buffer);
+ li->buffer = gb;
+ li->bufSize = ns;
+ vf->object = ns;
+ }
+}
+
+// automatically rewrites header if allowed when writing
+
+void oneFileClose (OneFile *vf)
+{
+ assert (vf->share >= 0) ;
+
+ if (vf->isWrite)
+ {
+ if (!vf->isFinal) // RD moved this here from above - surely only needed if isWrite
+ oneFinalizeCounts (vf);
+
+ if (!vf->isHeaderOut && (vf->isBinary || !vf->isNoAsciiHeader)) writeHeader (vf) ;
+
+ if (vf->share > 0)
+ { int i, pid, fid, nread;
+ char name[100], *buf;
+
+ buf = new (10000000, char);
+ pid = getpid();
+ for (i = 1; i < vf->share; i++)
+ { fclose (vf[i].f);
+ vf[i].f = NULL;
+ sprintf(name,".part.%d.%d",pid,i);
+ fid = open(name,O_RDONLY);
+ while ((nread = read(fid,buf,10000000)) > 0)
+ if ((int) fwrite(buf,1,nread,vf->f) != nread)
+ die ("ONE write error: while cat'ing thread bits (oneFileClose)");
+ if (unlink(name) < 0)
+ die ("ONE write error: could not delete thread file %s", name);
+ }
+ free(buf);
+ }
+
+ fputc ('\n', vf->f); // end of file if ascii, end of data marker if binary
+ if (vf->isBinary) // write the footer
+ oneWriteFooter (vf);
+ }
+
+ oneFileDestroy (vf);
+}
+
+/***********************************************************************************
+ *
+ * Length limited Huffman Compressor/decompressor with special 2-bit compressor for DNA
+ * Author: Gene Myers
+ * Creation date: June 27, 2019
+ *
+ * inline both compression.h and compression.c here
+ *
+ **********************************************************************************/
+
+#undef DEBUG
+#undef TEST
+
+ // To create a compressor, get an initially empty object with vcCreate, then
+ // add a significant corpus of the byte data to be compressed with vcAddToTable,
+ // and finally create a Huffman codec based on this corpus by calling
+ // vcCreateCodec. The parameter "partial" should be set if not all the data
+ // to be compressed has been scanned. At this point you have a compressor ready
+ // to operate. You can destroy/free it with vcDestroy.
+
+OneCodec *vcCreate();
+void vcAddToTable(OneCodec *vc, int len, char *bytes);
+void vcCreateCodec(OneCodec *vc, int partial);
+void vcDestroy(OneCodec *vc);
+
+ // In the instance of accumulating data over multiple threads, vcAddHistogram, will
+ // add the counts in the table for vh, to the table for vc.
+
+void vcAddHistogram(OneCodec *vc, OneCodec *vh);
+
+ // A diagnostic routine: shows you the compression scheme and if the distribution
+ // of the scanned corpus is available, it shows you that too. Output to file 'to'.
+
+void vcPrint(OneCodec *vc, FILE *to);
+
+ // You can encode and decode where ibytes/ilen are the input and the output
+ // is placed at obytes and the length of the compressed/decompressed result
+ // is returned as the value of the function. For vcEncode, ilen is the size
+ // of the uncompressed input in bytes, and the return value is the size of
+ // the compressed output in **bits**. The converse is true for vcDecode, i.e
+ // ilen is the number of bits in the compressed input, and the return value
+ // is the number of bytes in the uncompressed output. The routines are endian safe.
+
+int vcEncode(OneCodec *vc, int ilen, char *ibytes, char *obytes);
+int vcDecode(OneCodec *vc, int ilen, char *ibytes, char *obytes);
+
+ // Rather than directly reading or writing an encoding of a compressor, the routines
+ // below serialize or deserialize the compressor into/outof a user-supplied buffer.
+ // vcMaxSerialSize gives the maximum size of a serialized compressor so the user
+ // can arrange a buffer of the appropriate size. vcSerialize serializes vc into
+ // buffer 'out' and returns the # of bytes in the encoding. vcDeserialize will reverse
+ // the process given a serialization. The routines are endian-safe.
+
+int vcMaxSerialSize();
+int vcSerialize(OneCodec *vc, void *out);
+OneCodec *vcDeserialize(void *in);
+
+typedef uint64_t uint64;
+typedef uint32_t uint32;
+typedef uint16_t uint16;
+typedef uint8_t uint8;
+
+#define HUFF_CUTOFF 12 // This cannot be larger than 16 !
+
+ // Endian flipping macros
+
+#define FLIP64(p) \
+{ uint8 x = p[0]; \
+ p[0] = p[7]; \
+ p[7] = x; \
+ x = p[1]; \
+ p[1] = p[6]; \
+ p[6] = x; \
+ x = p[2]; \
+ p[2] = p[5]; \
+ p[5] = x; \
+ x = p[3]; \
+ p[3] = p[4]; \
+ p[4] = x; \
+}
+
+#define FLIP32(p) \
+{ uint8 x = p[0]; \
+ p[0] = p[3]; \
+ p[3] = x; \
+ x = p[1]; \
+ p[1] = p[2]; \
+ p[2] = x; \
+}
+
+#define FLIP16(p) \
+{ uint8 x = p[0]; \
+ p[0] = p[1]; \
+ p[1] = x; \
+}
+
+/*******************************************************************************************
+ *
+ * Routines for computing a length-limited Huffman Encoding Scheme
+ *
+ ********************************************************************************************/
+
+#define EMPTY 0 // Compressor just created, histogram zero'd
+#define FILLED 1 // Compressor histogram being filled, no codec
+#define CODED_WITH 2 // Compressor has a codec (can no longer accumulate histogram)
+#define CODED_READ 3 // Compressor has codec but no histogram as was created by read
+
+typedef struct
+ { int state; // 1 of the 4 states immediately above
+ int isbig; // endian of the current machine
+ uint16 codebits[256]; // Code esc_code is the special code for
+ uint8 codelens[256]; // non-Huffman exceptions
+ char lookup[0x10000]; // Lookup table (just for decoding)
+ int esc_code; // The special escape code (-1 if not partial)
+ int esc_len; // The length in bits of the special code (if present)
+ uint64 hist[256]; // Byte distribution for codec
+ } _OneCodec;
+
+ // The special "predefined" DNA compressor
+
+static _OneCodec _DNAcodec = { .state = CODED_READ };
+OneCodec *DNAcodec = (OneCodec *) &_DNAcodec;
+
+ // Create an EMPTY compressor object with zero'd histogram and determine machine endian
+
+OneCodec *vcCreate()
+{ _OneCodec *v;
+ int i;
+
+ v = (_OneCodec *) malloc(sizeof(_OneCodec));
+ if (v == NULL)
+ { fprintf(stderr,"vcCreate: Could not allocate compressor\n");
+ exit (1);
+ }
+
+ v->state = EMPTY;
+ for (i = 0; i < 256; i++)
+ v->hist[i] = 0;
+
+ { uint32 t;
+ uint8 *b;
+
+ t = 1;
+ b = (uint8 *) (&t);
+ v->isbig = (b[0] == 0);
+ }
+
+ return ((OneCodec *) v);
+}
+
+ // Free a compressor object
+
+void vcDestroy(OneCodec *vc)
+{ _OneCodec *v = (_OneCodec *) vc;
+ if (vc != DNAcodec)
+ free(v);
+}
+
+ // Add the frequencies of bytes in bytes[0..len) to vc's histogram
+ // State becomes FILLED
+
+void vcAddToTable(OneCodec *vc, int len, char *bytes)
+{ _OneCodec *v = (_OneCodec *) vc;
+ uint8 *data = (uint8 *) bytes;
+ int i;
+
+ for (i = 0; i < len; i++)
+ v->hist[(int) data[i]] += 1;
+ if (v->state < FILLED)
+ v->state = FILLED;
+}
+
+ // Add the frequencies of bytes in bytes[0..len) to vc's histogram
+ // State becomes FILLED
+
+void vcAddHistogram(OneCodec *vc, OneCodec *vh)
+{ _OneCodec *v = (_OneCodec *) vc;
+ _OneCodec *h = (_OneCodec *) vh;
+ int i;
+
+ if (v->state >= CODED_WITH)
+ { fprintf(stderr,"vcAddHistogram: Compressor already has a codec\n");
+ exit (1);
+ }
+ if (h->state == CODED_READ)
+ { fprintf(stderr,"vcAddHistogram: Source compressor doesn't have a histogram\n");
+ exit (1);
+ }
+
+ for (i = 0; i < 256; i++)
+ v->hist[i] += h->hist[i];
+ v->state = FILLED;
+}
+
+ // Check vc has a non-empty distribution histogram and if so then build
+ // length-limited Huffman tables for the bytes that occur in the histogram,
+ // plus a special escape code if partial is set and there is at least one byte
+ // with a zero count in the histogram. The algorithm is by Larmore & Hirschberg,
+ // JACM 73, 3 (1990).
+
+uint64 *HIST;
+
+int HSORT(const void *l, const void *r)
+{ int x = *((int *) l);
+ int y = *((int *) r);
+ return (HIST[x] - HIST[y]);
+}
+
+void vcCreateCodec(OneCodec *vc, int partial)
+{ _OneCodec *v = (_OneCodec *) vc;
+
+ uint64 *hist;
+ char *look;
+ uint8 *lens;
+ uint16 *bitv;
+
+ int code[256];
+ int leng[256];
+ uint16 bits[256];
+ int ncode, dcode, ecode;
+
+ int i;
+
+ if (v->state >= CODED_WITH)
+ { fprintf(stderr,"vcCreateCoder: Compressor already has a codec\n");
+ exit (1);
+ }
+ if (v->state == EMPTY)
+ { fprintf(stderr,"vcCreateCoder: Compressor has no byte distribution data\n");
+ exit (1);
+ }
+
+ hist = v->hist;
+ look = v->lookup;
+ lens = v->codelens;
+ bitv = v->codebits;
+
+ ecode = -partial;
+ ncode = 0;
+ for (i = 0; i < 256; i++)
+ if (hist[i] > 0)
+ code[ncode++] = i;
+ else if (ecode < 0)
+ { ecode = i;
+ code[ncode++] = i;
+ }
+ dcode = 2*ncode;
+
+ if (ecode < 0)
+ partial = 0;
+
+ HIST = hist;
+ qsort(code,ncode,sizeof(int),HSORT);
+
+#ifdef DEBUG
+ fprintf(stderr,"\nSorted Codes %d:\n",ncode);
+ for (i = 0; i < ncode; i++)
+ fprintf(stderr," %3d: %3d %10llu\n",i,code[i],hist[code[i]]);
+#endif
+
+ { uint8 matrix[HUFF_CUTOFF][dcode];
+ uint64 count1[dcode], count2[dcode], countb[ncode];
+ uint64 *lcnt, *ccnt, *swp;
+ int llen, span;
+ int j, k, n, L;
+
+ for (n = 0; n < ncode; n++)
+ { count1[n] = countb[n] = hist[code[n]];
+ leng[n] = 0;
+ }
+
+#ifdef DEBUG
+ fprintf(stderr,"\nCoin Filter:\n");
+ fprintf(stderr," Row %2d:",HUFF_CUTOFF);
+ for (n = 0; n < ncode; n++)
+ fprintf(stderr," %" PRId64 "*",countb[n]);
+ fprintf(stderr,"\n");
+#endif
+
+ lcnt = count1;
+ ccnt = count2;
+ llen = ncode-1;
+ for (L = HUFF_CUTOFF-1; L > 0; L--)
+ { j = 0;
+ k = 0;
+ for (n = 0; j < ncode || k < llen; n++)
+ { if (k >= llen || (j < ncode && countb[j] <= lcnt[k] + lcnt[k+1]))
+ { ccnt[n] = countb[j];
+ matrix[L][n] = 1;
+ j += 1;
+ }
+ else
+ { ccnt[n] = lcnt[k] + lcnt[k+1];
+ matrix[L][n] = 0;
+ k += 2;
+ }
+ }
+ llen = n-1;
+ swp = lcnt;
+ lcnt = ccnt;
+ ccnt = swp;
+
+#ifdef DEBUG
+ fprintf(stderr," Row %2d:",L);
+ for (n = 0; n <= llen; n++)
+ fprintf(stderr," %" PRId64 "%c",lcnt[n],matrix[L][n]?'*':'+');
+ fprintf(stderr,"\n");
+#endif
+ }
+
+ span = 2*(ncode-1);
+ for (L = 1; L < HUFF_CUTOFF; L++)
+ { j = 0;
+ for (n = 0; n < span; n++)
+ { if (matrix[L][n])
+ leng[j++] += 1;
+ }
+ span = 2*(span-j);
+ }
+ for (n = 0; n < span; n++)
+ leng[n] += 1;
+
+#ifdef DEBUG
+ fprintf(stderr,"\nBack Trace:\n");
+ span = 2*(ncode-1);
+ for (L = 1; L < HUFF_CUTOFF; L++)
+ { j = 0;
+ fprintf(stderr," Row %2d:",L);
+ for (n = 0; n < span; n++)
+ { if (matrix[L][n])
+ j += 1;
+ fprintf(stderr," %c",matrix[L][n]?'*':'+');
+ }
+ fprintf(stderr,"\n");
+ span = 2*(span-j);
+ }
+ fprintf(stderr," Length:");
+ for (n = 0; n < ncode; n++)
+ fprintf(stderr," %d",leng[n]);
+ fprintf(stderr,"\n");
+#endif
+ }
+
+ { int n, llen;
+ uint16 lbits;
+
+ llen = leng[0];
+ lbits = bits[0] = (1 << llen) - 1;
+ for (n = 1; n < ncode; n++)
+ { while ((lbits & 0x1) == 0)
+ { lbits >>= 1;
+ llen -= 1;
+ }
+ lbits -= 1;
+ while (llen < leng[n])
+ { lbits = (lbits << 1) | 0x1;
+ llen += 1;
+ }
+ bits[n] = lbits;
+ }
+
+#ifdef DEBUG
+ { int j;
+
+ fprintf(stderr,"\nCodes:\n");
+ for (n = 0; n < ncode; n++)
+ { fprintf(stderr," %3d: %2d ",code[n],leng[n]);
+ for (j = leng[n]-1; j >= 0; j--)
+ fprintf(stderr,"%x",(bits[n]>>j)&0x1);
+ fprintf(stderr,"\n");
+ }
+ }
+#endif
+ }
+
+ for (i = 0; i < 256; i++)
+ { lens[i] = 0;
+ bitv[i] = 0;
+ }
+
+ for (i = 0; i < ncode; i++)
+ { lens[code[i]] = leng[i];
+ bitv[code[i]] = bits[i];
+ }
+
+ { int j, powr; // Fill in a decoder table giving the next Huffman code
+ uint16 base; // that is a prefix of the next 16 bits
+
+ for (i = 0; i < 256; i++)
+ { if (lens[i] > 0)
+ { base = (bitv[i] << (16-lens[i]));
+ powr = (1 << (16-lens[i]));
+ for (j = 0; j < powr; j++)
+ look[base+j] = i;
+ }
+ }
+ }
+
+ if (partial)
+ { v->esc_code = ecode;
+ v->esc_len = lens[ecode];
+ lens[ecode] = 0;
+ }
+ else
+ v->esc_code = -1;
+ v->state = CODED_WITH;
+}
+
+ // For debug, give a nice print out of the distribution histogram (if present)
+ // and the Huffman codec
+
+void vcPrint(OneCodec *vc, FILE *to)
+{ _OneCodec *v = (_OneCodec *) vc;
+
+ uint64 total_bits, ucomp_bits, count;
+ uint16 mask, code, *bits;
+ uint64 *hist;
+ uint8 *lens;
+ int clen;
+ int hashist;
+ int i, k;
+
+ if (vc == DNAcodec)
+ { fprintf(to," DNAcompressor\n");
+ return;
+ }
+
+ if (v->state < CODED_WITH)
+ { fprintf(stderr,"vcPrint: Compressor has no codec\n");
+ exit (1);
+ }
+ hashist = (v->state == CODED_WITH);
+
+ bits = v->codebits;
+ lens = v->codelens;
+ hist = v->hist; // only needed if hashist, but compiler warning if assignment is conditional
+
+ if (hashist)
+ { total_bits = 0;
+ ucomp_bits = 0;
+
+ count = 0;
+ for (i = 0; i < 256; i++)
+ count += hist[i];
+
+ fprintf(to,"\nHistogram:\n");
+ for (i = 0; i < 256; i++)
+ if (hist[i] > 0)
+ { if (isprint(i))
+ fprintf(to," %c: %12" PRIu64 " %5.1f%%\n",i,hist[i],(hist[i]*100.)/count);
+ else
+ fprintf(to," %3d: %12" PRIu64 " %5.1f%%\n",i,hist[i],(hist[i]*100.)/count);
+ }
+ }
+
+ fprintf(to,"\nCode Table:\n");
+ for (i = 0; i < 256; i++)
+ { clen = lens[i];
+ if (i == v->esc_code)
+ clen = v->esc_len;
+ if (clen > 0)
+ { mask = (1 << clen);
+ code = bits[i];
+ if (isprint(i))
+ fprintf(to," %c: %2d ",i,clen);
+ else
+ fprintf(to," %3d: %2d ",i,clen);
+ for (k = 0; k < clen; k++)
+ { mask >>= 1;
+ if (code & mask)
+ fprintf(to,"1");
+ else
+ fprintf(to,"0");
+ }
+ if (i == v->esc_code)
+ fprintf(to," ***\n");
+ else
+ { fprintf(to,"\n");
+ if (hashist)
+ { total_bits += clen*hist[i];
+ ucomp_bits += (hist[i]<<3);
+ }
+ }
+ }
+ }
+ if (hashist)
+ fprintf(to,"\nTotal Bytes = %" PRIu64 " (%.2f%%)\n",(total_bits-1)/8+1,(100.*total_bits)/ucomp_bits);
+}
+
+
+/*******************************************************************************************
+ *
+ * Read and Write Huffman Schemes (actually just (de)serialize)
+ *
+ ********************************************************************************************/
+
+ // Maximum # of bytes in a serialized compressor code
+
+int vcMaxSerialSize()
+{ return (257 + 2*sizeof(int) + 256*sizeof(uint16)); }
+
+ // Code the compressor into blob 'out' and return number of bytes in the code
+
+int vcSerialize(OneCodec *vc, void *out)
+{ _OneCodec *v = (_OneCodec *) vc;
+
+ int i;
+ uint16 *bits;
+ uint8 *lens, *o;
+
+ if (vc == DNAcodec)
+ return (0);
+
+ if (v->state < CODED_WITH)
+ { fprintf(stderr,"vcWrite: Compressor does not have a codec\n");
+ exit (1);
+ }
+
+ lens = v->codelens;
+ bits = v->codebits;
+ o = (uint8 *) out;
+
+ // Only need to record endian, escape code, code lengths, and codes for those
+ // with non-zero length
+
+ *o++ = v->isbig;
+ memcpy(o,&(v->esc_code),sizeof(int));
+ o += sizeof(int);
+ memcpy(o,&(v->esc_len),sizeof(int));
+ o += sizeof(int);
+ for (i = 0; i < 256; i++)
+ { *o++ = lens[i];
+ if (lens[i] > 0 || i == v->esc_code)
+ { memcpy(o,bits+i,sizeof(uint16));
+ o += sizeof(uint16);
+ }
+ }
+ return (o - (uint8 *) out);
+}
+
+ // Create a compressor object from the serialized code in blob 'in'.
+ // The compressor does not have the original histogram from which
+ // its codec was created. If the endian of the current machine and
+ // the one that serialized the compressor don't match, then all relevant
+ // items are byte-flipped.
+
+OneCodec *vcDeserialize(void *in)
+{ _OneCodec *v;
+
+ char *look;
+ uint8 *lens, *ip;
+ uint16 *bits, base;
+ int i, j, powr;
+
+ v = (_OneCodec *) malloc(sizeof(_OneCodec));
+ if (v == NULL)
+ { fprintf(stderr,"vcRead: Could not allocate compressor\n");
+ exit (1);
+ }
+
+ v->state = CODED_READ;
+ lens = v->codelens;
+ bits = v->codebits;
+ look = v->lookup;
+ ip = (uint8 *) in;
+
+ { uint32 t;
+ uint8 *b;
+
+ t = 1;
+ b = (uint8 *) (&t);
+ v->isbig = (b[0] == 0);
+ }
+
+ if (v->isbig != *ip++) // If endians out and in don't match then flip item bytes as needed
+ { FLIP32(ip)
+ memcpy(&(v->esc_code),ip,sizeof(int));
+ ip += sizeof(int);
+ FLIP32(ip)
+ memcpy(&(v->esc_len),ip,sizeof(int));
+ ip += sizeof(int);
+ for (i = 0; i < 256; i++)
+ { lens[i] = *ip++;
+ if (lens[i] > 0 || i == v->esc_code)
+ { FLIP16(ip)
+ memcpy(bits+i,ip,sizeof(uint16));
+ ip += sizeof(uint16);
+ }
+ else
+ bits[i] = 0;
+ }
+ }
+ else
+ { memcpy(&(v->esc_code),ip,sizeof(int));
+ ip += sizeof(int);
+ memcpy(&(v->esc_len),ip,sizeof(int));
+ ip += sizeof(int);
+ for (i = 0; i < 256; i++)
+ { lens[i] = *ip++;
+ if (lens[i] > 0 || i == v->esc_code)
+ { memcpy(bits+i,ip,sizeof(uint16));
+ ip += sizeof(uint16);
+ }
+ else
+ bits[i] = 0;
+ }
+ }
+
+ if (v->esc_code >= 0)
+ lens[v->esc_code] = v->esc_len;
+ for (i = 0; i < 256; i++)
+ { if (lens[i] > 0)
+ { base = (bits[i] << (16-lens[i]));
+ powr = (1 << (16-lens[i]));
+ for (j = 0; j < powr; j++)
+ look[base+j] = i;
+ }
+ }
+ if (v->esc_code >= 0)
+ lens[v->esc_code] = 0;
+
+ return ((OneCodec *) v);
+}
+
+
+/*******************************************************************************************
+ *
+ * Encoders and Decoders
+ *
+ ********************************************************************************************/
+
+static uint8 Number[128] =
+ { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 0, 0, 0, 2,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 3, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 0, 0, 0, 2,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 3, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ };
+
+ // Compress DNA into 2-bits per base
+
+int Compress_DNA(int len, char *s, char *t)
+{ int i, j;
+ uint8 *s0, *s1, *s2, *s3;
+
+ s0 = (uint8 *) s;
+ s1 = s0+1;
+ s2 = s1+1;
+ s3 = s2+1;
+
+ len -= 3;
+ for (i = j = 0; i < len; i += 4)
+ t[j++] = (Number[s0[i]] << 6) | (Number[s1[i]] << 4) | (Number[s2[i]] << 2) | Number[s3[i]];
+ switch (i-len)
+ { case 0:
+ t[j++] = (Number[s0[i]] << 6) | (Number[s1[i]] << 4) | (Number[s2[i]] << 2);
+ break;
+ case 1:
+ t[j++] = (Number[s0[i]] << 6) | (Number[s1[i]] << 4);
+ break;
+ case 2:
+ t[j++] = (Number[s0[i]] << 6);
+ break;
+ default:
+ break;
+ }
+
+ return ((len+3)<<1);
+}
+
+ // Encode ibytes[0..ilen) according to compressor vc and place in obytes
+ // Return the # of bits used.
+
+int vcEncode(OneCodec *vc, int ilen, char *ibytes, char *obytes)
+{ _OneCodec *v = (_OneCodec *) vc;
+
+ uint64 c, ocode, *ob;
+ int n, k, rem, tbits, ibits, esc, elen;
+ uint8 *clens, x, *bcode, *bb;
+ uint16 *cbits;
+
+ if (vc == DNAcodec)
+ return (Compress_DNA(ilen,ibytes,obytes));
+
+ if (v->state < CODED_WITH)
+ { fprintf(stderr,"vcEncode: Compressor does not have a codec\n");
+ exit (1);
+ }
+
+ esc = v->esc_code;
+ elen = v->esc_len;
+ clens = v->codelens;
+ cbits = v->codebits;
+ ibits = (ilen << 3);
+ bcode = (uint8 *) &ocode;
+
+#define OCODE(L,C) \
+{ rem -= L; \
+ if (rem <= 0) \
+ { ocode |= (C >> (-rem)); \
+ *ob++ = ocode; \
+ if (rem < 0) \
+ { rem += 64; \
+ ocode = (C << rem); \
+ } \
+ else \
+ { rem = 64; \
+ ocode = 0; \
+ } \
+ } \
+ else \
+ ocode |= (C << rem); \
+}
+
+ ob = (uint64 *) obytes;
+ tbits = 2;
+ rem = 62;
+ if (v->isbig)
+ ocode = 0x4000000000000000llu;
+ else
+ ocode = 0;
+ for (k = 0; k < ilen; k++)
+ { x = ibytes[k];
+ n = clens[x];
+ if (n == 0)
+ { if (esc < 0)
+ { fprintf(stderr,"Compression lib: No code for %c(%x) and no escape code\n",x,x);
+ exit (1);
+ }
+ c = cbits[esc];
+ tbits += 8+elen;
+ if (tbits > ibits)
+ break;
+ OCODE(elen,c);
+ c = x;
+ OCODE(8,c);
+ }
+ else
+ { tbits += n;
+ if (tbits > ibits)
+ break;
+ c = cbits[x];
+ OCODE(n,c);
+ }
+ }
+
+ if (k < ilen)
+ { *obytes = 0xff;
+ memcpy(obytes+1,ibytes,ilen);
+ return (ibits+8);
+ }
+
+ bb = (uint8 *) ob;
+ if (v->isbig)
+ { rem = ((71-rem)>>3);
+ for (k = 0; k < rem; k++)
+ *bb++ = bcode[k];
+ }
+ else
+ { rem = 7 - ((63-rem)>>3);
+ for (k = 7; k >= rem; k--)
+ *bb++ = bcode[k];
+ }
+
+ if (tbits >= 64 && !v->isbig)
+ { x = obytes[7];
+ obytes[7] = obytes[0];
+ obytes[0] = x;
+ }
+
+ return (tbits);
+}
+
+ // Uncompress read from 2-bits per base into [0-3] per byte representation
+
+static char Base[4] = { 'a', 'c', 'g', 't' };
+
+int Uncompress_DNA(char *s, int len, char *t)
+{ int i, tlen, byte;
+ char *t0, *t1, *t2, *t3;
+
+ t0 = t;
+ t1 = t0+1;
+ t2 = t1+1;
+ t3 = t2+1;
+
+ tlen = len-3;
+ for (i = 0; i < tlen; i += 4)
+ { byte = *s++;
+ t0[i] = Base[(byte >> 6) & 0x3];
+ t1[i] = Base[(byte >> 4) & 0x3];
+ t2[i] = Base[(byte >> 2) & 0x3];
+ t3[i] = Base[byte & 0x3];
+ }
+
+ switch (i-tlen)
+ { case 0:
+ byte = *s++;
+ t0[i] = Base[(byte >> 6) & 0x3];
+ t1[i] = Base[(byte >> 4) & 0x3];
+ t2[i] = Base[(byte >> 2) & 0x3];
+ break;
+ case 1:
+ byte = *s++;
+ t0[i] = Base[(byte >> 6) & 0x3];
+ t1[i] = Base[(byte >> 4) & 0x3];
+ break;
+ case 2:
+ byte = *s++;
+ t0[i] = Base[(byte >> 6) & 0x3];
+ break;
+ default:
+ break;
+ }
+
+ return (len);
+}
+
+ // Decode ilen bits in ibytes, into obytes according to vc's codec
+ // Return the number of bytes decoded.
+
+int vcDecode(OneCodec *vc, int ilen, char *ibytes, char *obytes)
+{ _OneCodec *v = (_OneCodec *) vc;
+
+ char *look;
+ uint8 *lens, *q;
+ uint64 icode, ncode, *p;
+ int rem, nem;
+ uint8 c, *o;
+ int n, k, elen, inbig, esc;
+
+ if (vc == DNAcodec)
+ return (Uncompress_DNA(ibytes,ilen>>1,obytes));
+
+ if (v->state < CODED_WITH)
+ { fprintf(stderr,"vcDecode: Compressor does not have a codec\n");
+ exit (1);
+ }
+
+ if (*((uint8 *) ibytes) == 0xff)
+ { int olen = (ilen>>3)-1;
+ memcpy(obytes,ibytes+1,olen);
+ return (olen);
+ }
+
+ p = (uint64 *) ibytes;
+
+ inbig = (*ibytes & 0x40);
+ if (!inbig && ilen >= 64)
+ { uint8 x = ibytes[7];
+ ibytes[7] = ibytes[0];
+ ibytes[0] = x;
+ }
+
+ if (inbig != v->isbig)
+ { q = (uint8 *) ibytes;
+ for (k = 64; k <= ilen; k += 64)
+ { FLIP64(q)
+ q += 8;
+ }
+ }
+
+ lens = v->codelens;
+ look = v->lookup;
+ esc = v->esc_code;
+ elen = v->esc_len;
+
+#define GET(n) \
+ ilen -= n; \
+ icode <<= n; \
+ rem -= n; \
+ while (rem < 16) \
+ { int z = 64-rem; \
+ icode |= (ncode >> rem); \
+ if (nem > z) \
+ { nem -= z; \
+ ncode <<= z; \
+ rem = 64; \
+ break; \
+ } \
+ else \
+ { rem += nem; \
+ if (rem >= ilen) \
+ break; \
+ else if (ilen-rem < 64) \
+ { nem = ilen-rem; \
+ q = (uint8 *) p; \
+ ncode = 0; \
+ for (k = 0; k < nem; k += 8) \
+ ncode |= (((uint64) (*q++)) << (56-k)); \
+ } \
+ else \
+ { ncode = *p++; \
+ nem = 64; \
+ } \
+ } \
+ }
+
+ if (ilen < 64)
+ { q = (uint8 *) ibytes;
+ icode = 0;
+ for (k = 0; k < ilen; k += 8)
+ icode |= (((uint64) (*q++)) << (56-k));
+ }
+ else
+ icode = *p++;
+ o = (uint8 *) obytes;
+ icode <<= 2;
+ ilen -= 2;
+ rem = 62;
+ if (rem > ilen)
+ rem = ilen;
+ ncode = 0;
+ nem = 0;
+ while (ilen > 0)
+ { c = look[icode >> 48];
+ if (c == esc)
+ { GET(elen)
+ c = (icode >> 56);
+ GET(8);
+ }
+ else
+ { n = lens[(int) c];
+ GET(n)
+ }
+ *o++ = c;
+ }
+
+ return (o - (uint8 *) obytes);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////
+//
+// integer compression for write/read of fields
+//
+// top bit of first byte: number is negative
+// second bit: one-byte: next six bits give number (make negative if top bit set)
+// third bit: two-byte: next 13 bits give number (make negative if top bit set)
+// if second and third bits are not set, remaining 5 bits give number of bytes to read
+
+static inline int intGet (unsigned char *u, I64 *pval)
+{
+ switch (u[0] >> 5)
+ {
+ case 2: case 3: // single byte positive
+ *pval = (I64) (u[0] & 0x3f) ; return 1 ;
+ case 6: case 7: // single byte negative
+ *pval = (I64) u[0] | 0xffffffffffffff00 ; return 1 ;
+ case 1: // two bytes positive
+ *pval = (I64) (u[0] & 0x1f) << 8 | (I64)u[1] ; return 2 ;
+ *pval = - ((I64) (u[0] & 0x1f) << 8 | (I64)u[1]) ; return 2 ;
+ case 0:
+ switch (u[0] & 0x07)
+ {
+ case 0: die ("int packing error") ;
+ case 1: *pval = *(I64*)(u+1) & 0x0000000000ffff ; return 3 ;
+ case 2: *pval = *(I64*)(u+1) & 0x00000000ffffff ; return 4 ;
+ case 3: *pval = *(I64*)(u+1) & 0x000000ffffffff ; return 5 ;
+ case 4: *pval = *(I64*)(u+1) & 0x0000ffffffffff ; return 6 ;
+ case 5: *pval = *(I64*)(u+1) & 0x00ffffffffffff ; return 7 ;
+ case 6: *pval = *(I64*)(u+1) & 0xffffffffffffff ; return 8 ;
+ case 7: *pval = *(I64*)(u+1) ; return 9 ;
+ }
+ case 4:
+ switch (u[0] & 0x07)
+ {
+ case 0: die ("int packing error") ;
+ case 1: *pval = *(I64*)(u+1) | 0xffffffffffff0000 ; return 3 ;
+ case 2: *pval = *(I64*)(u+1) | 0xffffffffff000000 ; return 4 ;
+ case 3: *pval = *(I64*)(u+1) | 0xffffffff00000000 ; return 5 ;
+ case 4: *pval = *(I64*)(u+1) | 0xffffff0000000000 ; return 6 ;
+ case 5: *pval = *(I64*)(u+1) | 0xffff000000000000 ; return 7 ;
+ case 6: *pval = *(I64*)(u+1) | 0xff00000000000000 ; return 8 ;
+ case 7: *pval = *(I64*)(u+1) ; return 9 ;
+ }
+ }
+ return 0 ; // shouldn't get here, but needed for compiler happiness
+}
+
+static inline int intPut (unsigned char *u, I64 val)
+{
+ if (val >= 0)
+ { if ( !(val & 0xffffffffffffffc0)) { *u = val | 0x40 ; return 1 ; }
+ else if (!(val & 0xffffffffffffe000)) { *u++ = (val >> 8) | 0x20 ; *u = val & 0xff ; return 2 ; }
+ else if (!(val & 0xffffffffffff0000)) { *u++ = 1 ; *(I64*)u = val ; return 3 ; }
+ else if (!(val & 0xffffffffff000000)) { *u++ = 2 ; *(I64*)u = val ; return 4 ; }
+ else if (!(val & 0xffffffff00000000)) { *u++ = 3 ; *(I64*)u = val ; return 5 ; }
+ else if (!(val & 0xffffff0000000000)) { *u++ = 4 ; *(I64*)u = val ; return 6 ; }
+ else if (!(val & 0xffff000000000000)) { *u++ = 5 ; *(I64*)u = val ; return 7 ; }
+ else if (!(val & 0xff00000000000000)) { *u++ = 6 ; *(I64*)u = val ; return 8 ; }
+ else { *u++ = 7 ; *(I64*)u = val ; return 9 ; }
+ }
+ else
+ { if ( !(~val & 0xffffffffffffffc0)) { *u = val | 0x40 ; return 1 ; }
+ // else if (!(~val & 0xffffffffffffe000)) { *u++ = (val >> 8) | 0x20 ; *u = val & 0xff ; return 2 ; }
+ else if (!(~val & 0xffffffffffff0000)) { *u++ = 0x81 ; *(I64*)u = val ; return 3 ; }
+ else if (!(~val & 0xffffffffff000000)) { *u++ = 0x82 ; *(I64*)u = val ; return 4 ; }
+ else if (!(~val & 0xffffffff00000000)) { *u++ = 0x83 ; *(I64*)u = val ; return 5 ; }
+ else if (!(~val & 0xffffff0000000000)) { *u++ = 0x84 ; *(I64*)u = val ; return 6 ; }
+ else if (!(~val & 0xffff000000000000)) { *u++ = 0x85 ; *(I64*)u = val ; return 7 ; }
+ else if (!(~val & 0xff00000000000000)) { *u++ = 0x86 ; *(I64*)u = val ; return 8 ; }
+ else { *u++ = 0x87 ; *(I64*)u = val ; return 9 ; }
+ }
+}
+
+static inline I64 ltfRead (FILE *f)
+{
+ unsigned char u[16] ;
+ I64 val ;
+
+ u[0] = getc (f) ;
+ if (u[0] & 0x40)
+ { intGet (u, &val) ;
+ // printf ("read %d n 1 u %02x\n", (int)val, u[0]) ;
+ }
+ else if (u[0] & 0x20)
+ { u[1] = getc (f) ; intGet (u, &val) ;
+ // printf ("read %d n 2 u %02x %02x\n", (int)val, u[0], u[1]) ;
+ }
+ else
+ { int n = 1 + (u[0] & 0x0f) ;
+ unsigned char *v = &u[1] ;
+ while (n--) *v++ = getc(f) ;
+ n = intGet (u, &val) ;
+ // printf ("read %d n %d u", (int)val, n) ;
+ // { int i ; for (i = 0 ; i< n ; ++i) printf (" %02x", u[i]) ; putchar ('\n') ; }
+ }
+
+ return val ;
+}
+
+static inline int ltfWrite (I64 x, FILE *f)
+{
+ unsigned char u[16] ;
+ int n = intPut (u, x) ;
+
+ // printf ("write %d n %d u", (int)x, n) ;
+ // { int i ; for (i = 0 ; i< n ; ++i) printf (" %02x", u[i]) ; putchar ('\n') ; }
+
+ fwrite (u, 1, n, f) ;
+ return n ;
+}
+
+#if defined(TEST_LTF) || defined(TEST_INT)
+
+// these are the original routines from James Bonfield on which this is based
+// incorporated here for credit and for comparisons in the TEST functionality
+
+/***********************************************************************************
+ *
+ * LTF encoding for integers
+ * adapted from htslib/cram/cram_io.h with copyright statement:
+
+Copyright (c) 2012-2019 Genome Research Ltd.
+Author: James Bonfield <jkb@sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+ 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ *
+ **********************************************************************************/
+
+/* 64-bit itf8 variant */
+
+static inline int ltf8_put(char *cp, int64_t val) {
+ unsigned char *up = (unsigned char *)cp;
+ if (!(val & ~((1LL<<7)-1))) {
+ *up = val;
+ return 1;
+ } else if (!(val & ~((1LL<<(6+8))-1))) {
+ *up++ = (val >> 8 ) | 0x80;
+ *up = val & 0xff;
+ return 2;
+ } else if (!(val & ~((1LL<<(5+2*8))-1))) {
+ *up++ = (val >> 16) | 0xc0;
+ *up++ = (val >> 8 ) & 0xff;
+ *up = val & 0xff;
+ return 3;
+ } else if (!(val & ~((1LL<<(4+3*8))-1))) {
+ *up++ = (val >> 24) | 0xe0;
+ *up++ = (val >> 16) & 0xff;
+ *up++ = (val >> 8 ) & 0xff;
+ *up = val & 0xff;
+ return 4;
+ } else if (!(val & ~((1LL<<(3+4*8))-1))) {
+ *up++ = (val >> 32) | 0xf0;
+ *up++ = (val >> 24) & 0xff;
+ *up++ = (val >> 16) & 0xff;
+ *up++ = (val >> 8 ) & 0xff;
+ *up = val & 0xff;
+ return 5;
+ } else if (!(val & ~((1LL<<(2+5*8))-1))) {
+ *up++ = (val >> 40) | 0xf8;
+ *up++ = (val >> 32) & 0xff;
+ *up++ = (val >> 24) & 0xff;
+ *up++ = (val >> 16) & 0xff;
+ *up++ = (val >> 8 ) & 0xff;
+ *up = val & 0xff;
+ return 6;
+ } else if (!(val & ~((1LL<<(1+6*8))-1))) {
+ *up++ = (val >> 48) | 0xfc;
+ *up++ = (val >> 40) & 0xff;
+ *up++ = (val >> 32) & 0xff;
+ *up++ = (val >> 24) & 0xff;
+ *up++ = (val >> 16) & 0xff;
+ *up++ = (val >> 8 ) & 0xff;
+ *up = val & 0xff;
+ return 7;
+ } else if (!(val & ~((1LL<<(7*8))-1))) {
+ *up++ = (val >> 56) | 0xfe;
+ *up++ = (val >> 48) & 0xff;
+ *up++ = (val >> 40) & 0xff;
+ *up++ = (val >> 32) & 0xff;
+ *up++ = (val >> 24) & 0xff;
+ *up++ = (val >> 16) & 0xff;
+ *up++ = (val >> 8 ) & 0xff;
+ *up = val & 0xff;
+ return 8;
+ } else {
+ *up++ = 0xff;
+ *up++ = (val >> 56) & 0xff;
+ *up++ = (val >> 48) & 0xff;
+ *up++ = (val >> 40) & 0xff;
+ *up++ = (val >> 32) & 0xff;
+ *up++ = (val >> 24) & 0xff;
+ *up++ = (val >> 16) & 0xff;
+ *up++ = (val >> 8 ) & 0xff;
+ *up = val & 0xff;
+ return 9;
+ }
+}
+
+static inline int ltf8_get(char *cp, int64_t *val_p) {
+ unsigned char *up = (unsigned char *)cp;
+
+ if (up[0] < 0x80) {
+ *val_p = up[0];
+ return 1;
+ } else if (up[0] < 0xc0) {
+ *val_p = (((uint64_t)up[0]<< 8) |
+ (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
+ return 2;
+ } else if (up[0] < 0xe0) {
+ *val_p = (((uint64_t)up[0]<<16) |
+ ((uint64_t)up[1]<< 8) |
+ (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
+ return 3;
+ } else if (up[0] < 0xf0) {
+ *val_p = (((uint64_t)up[0]<<24) |
+ ((uint64_t)up[1]<<16) |
+ ((uint64_t)up[2]<< 8) |
+ (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
+ return 4;
+ } else if (up[0] < 0xf8) {
+ *val_p = (((uint64_t)up[0]<<32) |
+ ((uint64_t)up[1]<<24) |
+ ((uint64_t)up[2]<<16) |
+ ((uint64_t)up[3]<< 8) |
+ (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
+ return 5;
+ } else if (up[0] < 0xfc) {
+ *val_p = (((uint64_t)up[0]<<40) |
+ ((uint64_t)up[1]<<32) |
+ ((uint64_t)up[2]<<24) |
+ ((uint64_t)up[3]<<16) |
+ ((uint64_t)up[4]<< 8) |
+ (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
+ return 6;
+ } else if (up[0] < 0xfe) {
+ *val_p = (((uint64_t)up[0]<<48) |
+ ((uint64_t)up[1]<<40) |
+ ((uint64_t)up[2]<<32) |
+ ((uint64_t)up[3]<<24) |
+ ((uint64_t)up[4]<<16) |
+ ((uint64_t)up[5]<< 8) |
+ (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
+ return 7;
+ } else if (up[0] < 0xff) {
+ *val_p = (((uint64_t)up[1]<<48) |
+ ((uint64_t)up[2]<<40) |
+ ((uint64_t)up[3]<<32) |
+ ((uint64_t)up[4]<<24) |
+ ((uint64_t)up[5]<<16) |
+ ((uint64_t)up[6]<< 8) |
+ (uint64_t)up[7]) & ((1LL<<(7*8))-1);
+ return 8;
+ } else {
+ *val_p = (((uint64_t)up[1]<<56) |
+ ((uint64_t)up[2]<<48) |
+ ((uint64_t)up[3]<<40) |
+ ((uint64_t)up[4]<<32) |
+ ((uint64_t)up[5]<<24) |
+ ((uint64_t)up[6]<<16) |
+ ((uint64_t)up[7]<< 8) |
+ (uint64_t)up[8]);
+ return 9;
+ }
+}
+
+#include <sys/resource.h>
+#ifndef RUSAGE_SELF /* to prevent "RUSAGE_SELF redefined" gcc warning, fixme if this is more intricate */
+#define RUSAGE_SELF 0
+#endif
+
+void timeUpdate (FILE *f)
+{
+ static bool isFirst = 1 ;
+ static struct rusage rOld, rFirst ;
+ struct rusage rNew ;
+ int secs, usecs ;
+
+ getrusage (RUSAGE_SELF, &rNew) ;
+ if (!isFirst)
+ { secs = rNew.ru_utime.tv_sec - rOld.ru_utime.tv_sec ;
+ usecs = rNew.ru_utime.tv_usec - rOld.ru_utime.tv_usec ;
+ if (usecs < 0) { usecs += 1000000 ; secs -= 1 ; }
+ fprintf (f, "user\t%d.%06d", secs, usecs) ;
+ secs = rNew.ru_stime.tv_sec - rOld.ru_stime.tv_sec ;
+ usecs = rNew.ru_stime.tv_usec - rOld.ru_stime.tv_usec ;
+ if (usecs < 0) { usecs += 1000000 ; secs -= 1 ; }
+ fprintf (f, "\tsystem\t%d.%06d", secs, usecs) ;
+ fprintf (f, "\tmax_RSS\t%ld", rNew.ru_maxrss - rOld.ru_maxrss) ;
+ fputc ('\n', f) ;
+ }
+ else
+ { rFirst = rNew ;
+ isFirst = false ;
+ }
+
+ rOld = rNew ;
+}
+
+int main (int argc, char *argv[])
+{
+ I64 i, j, x, n, tot, mod ;
+ FILE *f ;
+ static unsigned char buffer[9*(1<<20)] ;
+
+ if (argc < 3) die ("usage: ./test <start> <n> [mod]") ;
+
+// { int t = 1;
+// char *b = (char *) (&t);
+// if (*b == 0) printf ("bigEndian\n") ; else printf ("smallEndian\n") ;
+// }
+
+ timeUpdate (0) ;
+
+ x = atoi(argv[1]) ;
+ n = atoi(argv[2]) ;
+ if (argc == 4) mod = atoi(argv[3]) ; else mod = 0 ;
+ tot = 0 ;
+#ifdef TEST_INT
+ f = fopen ("int.test", "w") ;
+#endif
+#ifdef TEST_LTF
+ f = fopen ("ltf.test", "w") ;
+#endif
+ if (argc > 4)
+ for (i = 0 ; i < n ; ++i) { tot += ltfWrite (x++, f) ; if (x == mod) x = 0 ; }
+ else
+ { while (n)
+ { int m = (n > 1<<20) ? 1<<20 : n ;
+ unsigned char *u = buffer ;
+ for (i = 0 ; i < m ; ++i)
+ {
+#ifdef TEST_INT
+ u += intPut (u, x++) ;
+#endif
+#ifdef TEST_LTF
+ u += ltf8_put ((char*)u, x++) ;
+#endif
+ if (x == mod) x = 0 ;
+ }
+ tot += (u-buffer) ;
+ fwrite (buffer, 1, (u-buffer), f) ;
+ n -= m ;
+ }
+ }
+ fclose (f) ;
+ printf ("wrote %" PRId64 " bytes: ", tot) ;
+ timeUpdate (stdout) ;
+
+ x = atoi(argv[1]) ;
+ n = atoi(argv[2]) ;
+ tot = 0 ;
+#ifdef TEST_INT
+ f = fopen ("int.test", "r") ;
+#endif
+#ifdef TEST_LTF
+ f = fopen ("ltf.test", "r") ;
+#endif
+ if (argc > 4)
+ { for (j = 0 ; j < n ; ++j)
+ if ((i = ltfRead (f)) != j)
+ die ("ltf wrote %d read %d", (int)j, (int)i) ;
+ }
+ else
+ { unsigned char *u = buffer, *v = buffer ; // u is current pos, v is end of section read in
+ while (n)
+ { int m = (n > 1<<20) ? 1<<20 : n ;
+// printf ("attempting to read %d chars: n %d (v-u) %d\n", (int)(9*m - (v-u)), m, (int)(v-u)) ;
+ unsigned char *v0 = v, *u0 = u ;
+ v += fread (v, 1, 9*m - (v-u), f) ;
+// printf (" read %d chars\n", (int)(v-v0)) ;
+ for (i = 0 ; i < m ; ++i)
+#ifdef TEST_INT
+ u += intGet (u, &j) ;
+#endif
+#ifdef TEST_LTF
+ u += ltf8_get ((char*)u, &j) ;
+#endif
+// printf (" intGet m %d ints used %d chars\n", m, (int)(u-u0)) ;
+ n -= m ;
+ m = v - u ;
+// printf (" memmove %d\n", m) ;
+ memmove (buffer, u, m) ;
+ tot += (u-buffer) ;
+ v -= (u-buffer) ; u = buffer ;
+ }
+ }
+ fclose (f) ;
+
+ printf ("read %" PRId64 " bytes: ", tot) ;
+ timeUpdate (stdout) ;
+}
+#endif // TEST_LTF
+
+/***********************************************************************************
+ *
+ * UTILITIES: memory allocation, file opening, timer
+ * adapted from Richard's utilities
+ *
+ **********************************************************************************/
+
+static void die(char *format, ...)
+{ va_list args;
+
+ va_start (args, format);
+ fprintf (stderr, "FATAL ERROR: ");
+ vfprintf (stderr, format, args);
+ fprintf (stderr, "\n");
+ va_end (args);
+ exit (-1);
+}
+
+static I64 nAlloc = 0;
+static I64 totalAlloc = 0;
+
+static void *myalloc(size_t size)
+{ void *p;
+
+ p = malloc(size);
+ if (p == NULL) die("myalloc failure requesting %d bytes", size);
+ nAlloc += 1;
+ totalAlloc += size;
+ return (p);
+}
+
+static void *mycalloc(size_t number, size_t size)
+{ void *p;
+
+ p = calloc(number,size);
+ if (p == NULL) die("mycalloc failure requesting %d objects of size %d", number, size);
+ nAlloc += 1;
+ totalAlloc += size*number;
+ return p;
+}
+
+/********************* end of file ***********************/
diff --git a/ONElib.h b/ONElib.h
new file mode 100644
index 0000000..319ef6b
--- /dev/null
+++ b/ONElib.h
@@ -0,0 +1,410 @@
+/******************************************************************************************
+ *
+ * File: ONElib.h
+ * Header for ONE file reading and writing
+ *
+ * Authors: Richard Durbin (rd109@cam.ac.uk), Gene Myers (myers@mpi-cbg.de)
+ * Copyright (C) Richard Durbin, Gene Myers, 2019-
+ *
+ * HISTORY:
+ * Last edited: Dec 3 06:08 2022 (rd109)
+ * * Dec 3 06:01 2022 (rd109): remove oneWriteHeader(), switch to stdarg for oneWriteComment etc.
+ * * Dec 27 09:46 2019 (gene): style edits
+ * * Created: Sat Feb 23 10:12:43 2019 (rd109)
+ *
+ *****************************************************************************************/
+
+#ifndef ONE_DEFINED
+#define ONE_DEFINED
+
+#include <stdio.h> // for FILE etc.
+#include <stdarg.h> // for formatted writing in oneWriteComment(), oneAddProvenance()
+#include <inttypes.h> // for standard size int types and their PRI print macros
+#include <stdbool.h> // for standard bool types
+#include <limits.h> // for INT_MAX etc.
+#include <pthread.h>
+
+/***********************************************************************************
+ *
+ * DATA TYPES
+ *
+ **********************************************************************************/
+
+// Basic Types
+#ifndef U8_DEFINED
+#define U8_DEFINED
+
+typedef int8_t I8;
+typedef int16_t I16;
+typedef int32_t I32;
+typedef int64_t I64;
+typedef unsigned char U8;
+
+#endif // U8_DEFINED
+
+typedef enum { oneINT = 1, oneREAL, oneCHAR, oneSTRING,
+ oneINT_LIST, oneREAL_LIST, oneSTRING_LIST, oneDNA } OneType;
+extern char* oneTypeString[] ;
+// = { 0, "INT", "REAL", "CHAR", "STRING", "INT_LIST", "REAL_LIST", "STRING_LIST", "DNA" } ;
+
+typedef union
+ { I64 i;
+ double r;
+ char c;
+ I64 len; // For lists : top 8 bits encode excess bytes, low 56 length
+ } OneField;
+
+typedef struct
+ { char *program;
+ char *version;
+ char *command;
+ char *date;
+ } OneProvenance;
+
+typedef struct
+ { char *filename;
+ I64 count;
+ } OneReference;
+
+typedef struct
+ { I64 count;
+ I64 max;
+ I64 total;
+ I64 groupCount;
+ I64 groupTotal;
+ } OneCounts;
+
+ // OneCodecs are a private package for binary one file compression
+
+typedef void OneCodec; // forward declaration of opaque type for compression codecs
+
+ // DNAcodec is a special pre-existing compressor one should use for DNA.
+ // It compresses every base to 2-bits, where any non-ACGT letter is
+ // effectively converted to an A. Compression is case insensitive,
+ // but decompression always delivers lower-case.
+
+extern OneCodec *DNAcodec;
+
+ // Record for a particular line type. There is at most one list element.
+
+typedef struct
+ { OneCounts accum; // counts read or written to this moment
+ OneCounts given; // counts read from header
+ I64 gCount; // used internally to calculate groupCount and groupTotal
+ I64 gTotal;
+ I64 oCount; // # of objects in prefix before first group (if any)
+ I64 oTotal; // + of objects in prefix (these 2 are for thread parallel apps)
+
+ int nField; // number of fields
+ OneType *fieldType; // type of each field
+ int listEltSize; // size of list field elements (if present, else 0)
+ int listField; // field index of list
+ char *comment; // the comment on the definition line in the schema
+
+ bool isUserBuf; // flag for whether buffer is owned by user
+ I64 bufSize; // system buffer and size if not user supplied
+ void *buffer;
+
+ OneCodec *listCodec; // compression codec and flags
+ bool isUseListCodec; // on once enough data collected to train associated codec
+ char binaryTypePack; // binary code for line type, bit 8 set.
+ // bit 0: list compressed
+ I64 listTack; // accumulated training data for this threads codeCodec (master)
+ } OneInfo;
+
+ // the schema type - the first record is the header spec, then a linked list of primary classes
+
+typedef struct OneSchema
+ {
+ char *primary ;
+ int nSecondary ;
+ char **secondary ;
+ OneInfo *info[128] ;
+ int nFieldMax ;
+ char objectType ;
+ char groupType ;
+ struct OneSchema *nxt ;
+ } OneSchema ;
+
+typedef struct OneHeaderText
+ { char *text ;
+ struct OneHeaderText *nxt ;
+ } OneHeaderText ;
+
+ // The main OneFile type - this is the primary handle used by the end user
+
+typedef struct
+ {
+ // this field may be set by the user
+
+ bool isCheckString; // set if want to validate string char by char
+
+ // these fields may be read by user - but don't change them!
+
+ char *fileType;
+ char *subType;
+ char lineType; // current lineType
+ char objectType; // line designation character for primary objects
+ char groupType; // line designation character for groups (optional)
+ I64 line; // current line number
+ I64 byte; // current byte position when writing binary
+ I64 object; // current object - incremented when object line read
+ I64 group; // current group - incremented when group line read
+ OneProvenance *provenance; // if non-zero then count['!'] entries
+ OneReference *reference; // if non-zero then count['<'] entries
+ OneReference *deferred; // if non-zero then count['>'] entries
+ OneField *field; // used to hold the current line - accessed by macros
+ OneInfo *info[128]; // all the per-linetype information
+ I64 codecTrainingSize; // amount of data to see before building codec
+
+ // fields below here are private to the package
+
+ FILE *f;
+
+ bool isWrite; // true if open for writing
+ bool isHeaderOut; // true if header already written
+ bool isBinary; // true if writing a binary file
+ bool inGroup; // set once inside a group
+ bool isLastLineBinary; // needed to deal with newlines on ascii files
+ bool isIndexIn; // index read in
+ bool isBig; // are we on a big-endian machine?
+ bool isNoAsciiHeader; // backdoor for ONEview to avoid writing header in ascii
+
+ char lineBuf[128]; // working buffers
+ char numberBuf[32];
+ int nFieldMax;
+ I64 codecBufSize;
+ char *codecBuf;
+ I64 nBits; // number of bits of list currently in codecBuf
+ I64 intListBytes; // number of bytes per integer in the compacted INT_LIST
+ I64 linePos; // current line position
+ OneHeaderText *headerText; // arbitrary descriptive text that goes with the header
+
+ char binaryTypeUnpack[256]; // invert binary line code to ASCII line character.
+ int share; // index if slave of threaded write, +nthreads > 0 if master
+ int isFinal; // oneFinalizeCounts has been called on file
+ pthread_mutex_t fieldLock; // Mutexs to protect training accumumulation stats when threadded
+ pthread_mutex_t listLock;
+ } OneFile; // the footer will be in the concatenated result.
+
+
+/***********************************************************************************
+ *
+ * ROUTINES FOR READING & WRITING ONE FILES IN BOTH ASCII & BINARY (TRANSPARENTLY)
+ *
+ **********************************************************************************/
+
+// CREATING AND DESTROYING SCHEMAS
+
+OneSchema *oneSchemaCreateFromFile (char *path) ;
+OneSchema *oneSchemaCreateFromText (char *text) ;
+
+ // These functions create a schema handle that can be used to open One-code data files
+ // for reading and writing. A schema file is itself a One-code file, consisting of
+ // a set of objects, one per primary file type. Valid lines in this file are:
+ // P <primary file type> // a short string
+ // S <secondary file type> // a short string - any number of these
+ // O <char> <field_list> // definition of object type
+ // G <char> <field_list> // definition of group type - first field must be an int
+ // D <char> <field_list> // definition of line
+ // <char> must be a lower or upper case letter.
+ // <field_list> is a list of field types from:
+ // CHAR, INT, REAL, STRING, INT_LIST, REAL_LIST, STRING_LIST, DNA
+ // Only one list type (STRING, *_LIST or DNA) is allowed per line type.
+ // All the D lines following an O line apply to that object type.
+ // By convention comments on each line explain the definition.
+ // Example, with lists and strings preceded by their length in OneCode style
+ // P 3 seq this is a sequence file
+ // O S 1 3 DNA the DNA sequence - each S line starts an object
+ // D Q 1 6 STRING the phred encoded quality score + ASCII 33
+ // D N 4 4 REAL 4 REAL 4 REAL 4 REAL signal to noise ratio in A, C, G, T channels
+ // G g 2 3 INT 6 STRING group designator: number of objects, name
+ // The ...FromText() alternative writes the text to a temp file and reads it with
+ // oneSchemaCreateFromFile(). This allows code to set the schema.
+ // Internally a schema is a linked list of OneSchema objects, with the first holding
+ // the (hard-coded) schema for the header and footer, and the remainder each
+ // corresponding to one primary file type.
+
+void oneSchemaDestroy (OneSchema *schema) ;
+
+// READING ONE FILES:
+
+OneFile *oneFileOpenRead (const char *path, OneSchema *schema, char *type, int nthreads) ;
+
+ // Open ONE file 'path', either binary or ascii encoded, for reading.
+ // If the file doesn't have a header, then 'type' must be specified,
+ // otherwise, if 'type' is non-zero it must match the header type.
+ // All header information (if present) is read.
+ // 'schema' is also optional. If it is NULL then the file must contain its own schema.
+ // If 'schema' is present then it must support 'type', and if the file contains its
+ // own schema, then that must be a subset of the one for this type in 'schema'.
+ // If nthreads > 1 then nthreadds OneFiles are generated as an array and the pointer
+ // to the first, called the master, is returned. The other nthreads-1 files are
+ // called slaves. The package routines are aware of when a OneFile argument is a
+ // slave or master in a parallel group. The master recieves provenance, counts, etc.
+ // The slaves only read data and have the virtue of sharing indices and codecs with
+ // the master if relevant.
+
+bool oneFileCheckSchema (OneFile *vf, char *textSchema) ;
+
+ // Checks if file schema is consistent with text schema. Mismatches are reported to stderr.
+ // Filetype and all linetypes in text must match. File schema can contain additional linetypes.
+ // e.g. if (! oneFileCheckSchema (vf, "P 3 seq\nD S 1 3 DNA\nD Q 1 6 STRING\nD P 0\n")) die () ;
+ // This is provided to enable a program to ensure that its assumptions about data layout
+ // are satisfied.
+
+char oneReadLine (OneFile *vf) ;
+
+ // Read the next ONE formatted line returning the line type of the line, or 0
+ // if at the end of the data section. The content macros immediately below are
+ // used to access the information of the line most recently read.
+
+void *_oneList (OneFile *vf) ; // lazy codec decompression if required
+void *_oneCompressedList (OneFile *vf) ; // lazy codec compression if required
+
+#define oneInt(vf,x) ((vf)->field[x].i)
+#define oneReal(vf,x) ((vf)->field[x].r)
+#define oneChar(vf,x) ((vf)->field[x].c)
+#define _LF(vf) ((vf)->info[(int)(vf)->lineType]->listField)
+#define oneLen(vf) ((vf)->field[_LF(vf)].len & 0xffffffffffffffll)
+#define oneString(vf) (char *) _oneList(vf)
+#define oneDNAchar(vf) (char *) _oneList(vf)
+#define oneDNA2bit(vf) (U8 *) _oneCompressedList(vf)
+#define oneIntList(vf) (I64 *) _oneList(vf)
+#define oneRealList(vf) (double *) _oneList(vf)
+#define oneNextString(vf,s) (s + strlen(s) + 1)
+
+ // Access field information. The index x of a list object is not required as there is
+ // only one list per line, stored in ->buffer.
+ // A "string list" is implicitly supported, get the first string with oneString, and
+ // subsequent strings sequentially with oneNextString, e.g.:
+ //
+ // char *s = oneString(vf);
+ // for (i = 0; i < oneLen(vf); i++)
+ // { // do something with i'th string
+ // s = oneNextString(vf,s);
+ // }
+
+char *oneReadComment (OneFile *vf);
+
+ // Can be called after oneReadLine() to read any optional comment text after the fixed fields.
+ // Returns NULL if there is no comment.
+
+// WRITING ONE FILES:
+
+OneFile *oneFileOpenWriteNew (const char *path, OneSchema *schema, char *type,
+ bool isBinary, int nthreads);
+OneFile *oneFileOpenWriteFrom (const char *path, OneFile *vfIn,
+ bool isBinary, int nthreads);
+
+ // Create a new oneFile that will be written to 'path'. For the 'New' variant supply
+ // the file type, subtype (if non-zero), and whether it should be binary or ASCII.
+ // For the 'From' variant, specify binary or ASCII, schema and all other header
+ // information is inherited from 'vfIn', where the count stats are from vfIn's
+ // accumulation (assumes vfIn has been fully read or written) if 'useAccum is true,
+ // and from vfIn's header otherwise.
+ // If nthreads > 1 then nthreads OneFiles are generated as an array and the pointer
+ // to the first, called the master, is returned. The other nthreads-1 files are
+ // called slaves. The package routines are aware of when a OneFile argument is a
+ // slave or master in a parallel group. The slaves are expected to only write data
+ // lines, with the master adding provenance, producing the header, and then some
+ // segment of the initial data lines. Upon close the final result is effectively
+ // the concatenation of the master, followed by the output of each slave in sequence.
+
+bool oneInheritProvenance (OneFile *vf, OneFile *source);
+bool oneInheritReference (OneFile *vf, OneFile *source);
+bool oneInheritDeferred (OneFile *vf, OneFile *source);
+
+ // Add all provenance/reference/deferred entries in source to header of vf. Must be
+ // called before first call to oneWriteLine.
+
+bool oneAddProvenance (OneFile *vf, char *prog, char *version, char *format, ...);
+bool oneAddReference (OneFile *vf, char *filename, I64 count);
+bool oneAddDeferred (OneFile *vf, char *filename);
+
+ // Append provenance/reference/deferred to header information. Must be called before
+ // first call to oneWriteLine.
+
+ // For ASCII output, if you want the header to contain count information then you must
+ // create and fill the relevant OneCounts objects before the first call to oneWriteLine.
+ // For BINARY output, the OneCounts information is accumulated and written automatically.
+
+void oneWriteLine (OneFile *vf, char lineType, I64 listLen, void *listBuf);
+
+ // Set up a line for output just as it would be returned by oneReadLine and then call
+ // this routine to output the line (ASCII or binary).
+ // Use the macros above on the l.h.s. of assignments to fill fields (e.g. oneInt(vf,2) = 3).
+ // For lists, give the length in the listLen argument, and either place the list data in your
+ // own buffer and give it as listBuf, or put in the line's buffer and set listBuf == NULL.
+
+void oneWriteLineFrom (OneFile *vf, OneFile *source) ; // copies a line from source into vf
+void oneWriteLineDNA2bit (OneFile *vf, char lineType, I64 listLen, U8 *dnaBuf);
+
+// Minor variants of oneWriteLine().
+// Use oneWriteLineDNA2bit for DNA lists if your DNA is already 2-bit compressed.
+
+void oneWriteComment (OneFile *vf, char *format, ...); // can not include newline \n chars
+
+ // Adds a comment to the current line. Extends line in ascii, adds special line type in binary.
+
+// CLOSING FILES (FOR BOTH READ & WRITE)
+
+void oneFileClose (OneFile *vf);
+
+ // Close vf (opened either for reading or writing). Finalizes counts, merges theaded files,
+ // and writes footer if binary. Frees all non-user memory associated with vf.
+
+// GOTO & BUFFER MANAGEMENT
+
+void oneUserBuffer (OneFile *vf, char lineType, void *buffer);
+
+ // A buffer is used to capture the list element of each line type that has one.
+ // This routine allows you to reassign the buffer to one you've allocated, or
+ // to revert to a default system buffer if 'buffer' = NULL. The previous buffer
+ // (if any) is freed. The user must ensure that a buffer they supply is large
+ // enough. BTW, this buffer is overwritten with each new line read of the given type.
+
+bool oneGotoObject (OneFile *vf, I64 i);
+
+ // Goto i'th object in the file. This only works on binary files, which have an index.
+
+I64 oneGotoGroup (OneFile *vf, I64 i);
+
+ // Goto the first object in group i. Return the size (in objects) of the group, or 0
+ // if an error (i out of range or vf has not group type). Only works for binary files.
+
+/***********************************************************************************
+ *
+ * A BIT ABOUT THE FORMAT OF BINARY FILES
+ *
+ **********************************************************************************/
+
+ // <bin file> <- <ASCII Prolog> <$-line> <binary data> <footer> <^-line> <footer-size:int64>
+ //
+ // '$'-line flags file is binary and gives endian
+ // The data block ends with a blank line consisting of '\n'
+ //
+ // EWM: Removed '-' line, simply write off_t to footer start
+ //
+ // <ASCII Prolog> <- <'1'-line> [<'2'-line>] ( <'!'-line> | <'<'-line> | <'>'-line> )*
+ //
+ // The ASCII prolog contains the type, subtype, provenance, reference, and deferred lines
+ // in the ASCII format. The ONE count statistic lines for each data line type are found
+ // in the footer along with binary ';' and ':' lines that encode their compressors as
+ // needed. The footer also contains binary '&' and '*' lines that encode the object index
+ // and group indices, respectively.
+ //
+ // <Binary line> <- <Binary line code + tags> <fields> [<list data>]
+ //
+ // Line codes are >= 128 for binary encoded lines. The low two order bits of these are flags,
+ // so each binary-encoded line type has 4 codes and a table maps these to the ASCII code.
+ // Bit 0 indicates if the fields of the line type are compressed, and Bit 1 indicates if
+ // the list data (if present) is compressed.
+ //
+ // If a field is a list, then the field array element for that field is the list's length
+ // where the low 56 bits encode length, and the high 8 bits encode the # of high-order
+ // 0-bytes in every list element if an INT_LIST (0 otherwise).
+
+#endif // ONE_DEFINED
+
+/******************* end of file **************/
diff --git a/README.md b/README.md
index 93b703f..edcb213 100644
--- a/README.md
+++ b/README.md
@@ -250,73 +250,68 @@ scoring chain and + indicates an alternate near optimal chain (controlled by the
-n parameter to damapper). Each additional LA of a chain is marked with a - character.
```
-5a. LAdump [-cdtlo] <src1:db|dam> [ <src2:db|dam> ]
- <align:las> [ <reads:FILE> | <reads:range> ... ]
+5a. LA2ONE [-cto] <src1:db|dam> [ <src2:db|dam> ]
+ <align:las> [ <reads:FILE> | <reads:range> ... ] > (.dal file)
-5b. dumpLA <align.las>
+5b. ONE2LA <align.dal> > (.las file)
```
-Like LAshow, LAdump allows one to display the local alignments (LAs) of a subset of the
-piles in an .las file and select which information to show about them. The difference
-is that the information is written in a very simple "1-code" ASCII format that makes it
-easy for one to read and parse the information for further use. For each LA the pair of
-reads is output on a line. -c requests that one further output the coordinates of the
-LA segments be output. The -d option requests that the number of difference in the LA
-be output, -t requests that the tracepoint information be output, and -l requests the
-length of the two reads be output. Finally, -o requests that only LAs that are proper
-overlaps be output.
-
-The format is very simple. Each requested piece of information occurs on a line. The
-first character of every line is a "1-code" character that tells you what information
-to expect on the line. The rest of the line contains information where each item is
-separated by a single blank space. The trace point line gives the number of trace
-point intervals in the LA and is immediately followed by that many lines containing
-a pair of integers giving the number of differences and b-displacement in each successive
-trace point interval.
-
+LA2ONE produces a .dal 1-code data file of all or a portion of the contents of a .las file.
+[1-code](https://www.github.com/thegenemyers/ONEcode)
+is a powerful self-describing, simple to use, data system with built in compression.
+Typically a .dal file will be 45% the size of its corresponding .dal file.
+By default only the pairwise overlapped pairs, their orientation, and chaining (if any) are
+output. The -c option
+requests that one further output the coordinates of the
+LA segments, the lengths of the reads and the # of differences in each LA.
+The -t option requests that the trace point alignment details are output for each LA.
+If -t is set then -c must be set and if both are set then all of the information about each
+LA is effectively output. The -o option requests that only LAs that are proper
+overlaps be output. Only the overlaps for particular A-reads may be specified as per the same command line arguments as documented for LAshow above.
+
+ONE2LA converts a 1-code .dal file back into a .las file. It requires that the .dal file contains all the information about each LA therein, that is, it must contain all the information that is output by LA2ONE with the options -ct set.
+
+The .dal format is quite simple where the primary object is considered to be a **pile**, i.e., the set of all LAs for a given a-read.
+The encoding of all the LAs for a pile is given by several lines in the 1-code format, where each line type is designated by the first character in the line.
+The P-line designated the start of a pile object and consists of giving the a-read and then a list of all the b-reads in the pile. We term the length of this list the size of the pile. All other line type lists for this given pile should have the same size or a multiple thereof.
+By default the P-line is followed by O- and C-lines that give the orientation and chaining directives (if any) for each b-read in the pile list as strings, one character per b-read.
+
+```
+ P <a-read: int> <pile b-reads: int_list>
+ O <orientations: string over [nc]> // n is normal, c is complement
+ C <chain directives: string over [>+-.]> // '>' start of best chain,
+ // '+' start of alternate chain,
+ // '-' continuation of chain, or
+ // '.' no chains (in the file).
```
- P #a #b #o #c - (#a,#b^#o) have an LA between them where #o is 'n' or 'c' and
- #c is '>' (start of best chain), '+' (start of alternate chain),
- '-' (continuation of chain), or '.' (no chains in file).
- L #la #lb - #la is the length of the a-read and #lb that of the b-read
- C #ab #ae #bb #be - #a[#ab,#ae] aligns with #b^#o[#bb,#be]
- D # - there are # differences in the LA
- T #n - there are #n trace point intervals for the LA
- (#d #y )^#n - there are #d difference aligning the #y bp's of B with the
- next fixed-size interval of A
- + X # - Total amount of X (X = P or T)
- % X # - Maximum amount of X in any pile (X = P or T)
- @ T # - Maximum number of trace points in any trace
+If the -c flag is set, then LA2ONE also outputs 1-code lines describing the intervals (ab,ae) of
+the a-read that align with intervals (bb,be) of the b-reads and the diffs in the aligned segments. The lengths of the a-read and all the b-reads is also output for convenience.
+The A-line outputs a list of the pairs (ab,ae) in the same order as the b-reads are listed in
+the object P-line. The B-line outputs a list of the pairs (bb,be) and the D-line outputs a list
+of the corresponding difference in each aligned pair of segments. Note carefully that the length
+of the A-line and B-line lists are *twice* that of the pile size. Lastly, the L-line gives the
+length of the a-read and then the lengths of the b-reads in the same order as the P-line.
+
+```
+ A <a-read intervals: int_list>
+ B <b-read intervals: int_list>
+ D <alignment diffs: int_list>
+ L <a-read length: int> <b-read lengths: int_list>
```
-
-1-code lines that begin with +, %, or @ are always the first lines in the output.
-They give size information about what is contained in the output. Specifically,
-'+ X #' gives the total number of LAs (X=P), or the total number of trace point
-intervals (X=T) in the file . '% X #' gives the maximum number of LAs (X=P) or
-the maximum number of trace point intervals (X=T) in a given *pile* (collection of
-LAs all with the same a-read (applies only to sorted .las files). A final line: '@ T #',
-gives the maximum # of trace point intervals in any trace within the file.
-After these lines and before the start of the lines describing alignment records is a
-single line of the form 'X #' where the number is the trace point spacing for all
-alignments.
-
-The command dumpLA reads a 1-code file from the standard input and if possible produces a .las
-file for it. The 1-code file is any legitimate coding of alignments as might be produced by LAdump.
-The 1-code file must contain the P-, C-, and T-lines as well as the X-line and the header lines
-beginning with +, %, or @. So for example, a 1-code file produced by LAdump with the -c and -t
-options is invertible.
+If the -t flag is set, then LA2ONE outputs a pair of T- and Q-lines, one for each alignment in
+the pile in the order listed in the controlling P-line. The relative order of the T- and Q-lines
+does not matter as long as the i'th occurrence of a T- or Q-line is for the i'th alignment in the
+pile. The T-line gives the b-segment lengths for a trace-point encoding of the alignment and the Q-line gives the difference in each segment. In addition the a-read spacing (which is the same for all alignments) is given in an X-line at the start of the data file.
```
-6a. LAa2b
-6b. LAb2a
+ T <b-displacements: int_list>
+ Q <segment diffs: int_list>
+
+ X <trace spacing:int> // 1st data line before any objects
```
-Pipes (stdin to stdout) that convert an ASCII output produced by LAdump into a compressed
-binary representation (LAa2b) and vice verse (LAb2a). The idea is to save disk space by
-keeping the dumps in a more compressed format.
-
```
-7. LAcat [-v] <source:las> ... > <target>.las
+6. LAcat [-v] <source:las> ... > <target>.las
```
The sequence of \<source\> files (that can contain @-sign block ranges) are
@@ -326,7 +321,7 @@ option reports the files concatenated and the number of la's within them to
standard error (as the standard output receives the concatenated file).
```
-8. LAsplit [-v] <target:las> (<parts:int> | <path:db|dam>) < <source>.las
+7. LAsplit [-v] <target:las> (<parts:int> | <path:db|dam>) < <source>.las
```
If the second argument is an integer n, then divide the alignment file \<source\>, piped
@@ -342,7 +337,7 @@ in \<path\>.i.db are in the i'th file generated from the template \<target\>. T
option reports the files produced and the number of la's within them to standard error.
```
-9. LAcheck [-vaS] <src1:db|dam> [ <src2:db|dam> ] <align:las> ...
+8. LAcheck [-vaS] <src1:db|dam> [ <src2:db|dam> ] <align:las> ...
```
LAcheck checks each .las file for structural integrity, where the a- and b-sequences
@@ -361,7 +356,7 @@ information, and if it does, then it checks the validity of chains and checks th
sorting order of chains as a unit according to the -a option.
```
-10. HPC.daligner [-vad] [-t<int>] [-w<int(6)>] [-l<int(1500)] [-s<int(100)] [-M<int>]
+9. HPC.daligner [-vad] [-t<int>] [-w<int(6)>] [-l<int(1500)] [-s<int(100)] [-M<int>]
[-P<dir(/tmp)>] [-B<int(4)>] [-T<int(4)>] [-f<name>]
( [-k<int(16)>] [-h<int(50)>] [-e<double(.75)] [-H<int>]
[-k<int(20)>] [-h<int(50)>] [-e<double(.85)] <ref:db|dam> )
diff --git a/debian/changelog b/debian/changelog
index 03c1782..84a5049 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,10 @@
+daligner (1.0+git20221215.bd26967-1) unstable; urgency=medium
+
+ * New upstream version
+ * Standards-Version: 4.6.2 (routine-update)
+
+ -- Andreas Tille <tille@debian.org> Mon, 06 Feb 2023 11:01:05 +0100
+
daligner (1.0+git20210330.f61b8cf-1) unstable; urgency=medium
* New upstream version
diff --git a/debian/control b/debian/control
index 280d4b6..b2867f7 100644
--- a/debian/control
+++ b/debian/control
@@ -5,7 +5,7 @@ Uploaders: Andreas Tille <tille@debian.org>,
Section: science
Priority: optional
Build-Depends: debhelper-compat (= 13)
-Standards-Version: 4.6.1
+Standards-Version: 4.6.2
Vcs-Browser: https://salsa.debian.org/med-team/daligner
Vcs-Git: https://salsa.debian.org/med-team/daligner.git
Homepage: https://dazzlerblog.wordpress.com
diff --git a/debian/patches/cflags.patch b/debian/patches/cflags.patch
index c962c7f..1d640ea 100644
--- a/debian/patches/cflags.patch
+++ b/debian/patches/cflags.patch
@@ -11,5 +11,5 @@ Last-Update: 2016-01-08
-CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing
+CFLAGS += -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing
- ALL = daligner HPC.daligner LAsort LAmerge LAsplit LAcat LAshow LAdump LAcheck LAa2b LAb2a dumpLA
+ ALL = daligner HPC.daligner LAsort LAmerge LAsplit LAcat LAshow LA2ONE LAcheck ONE2LA
diff --git a/debian/patches/cppflags.patch b/debian/patches/cppflags.patch
index 07b863b..8a14c18 100644
--- a/debian/patches/cppflags.patch
+++ b/debian/patches/cppflags.patch
@@ -5,9 +5,9 @@ Forwarded: not-needed
Last-Update: 2022-08-03
---
This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- daligner.orig/Makefile
-+++ daligner/Makefile
-@@ -10,40 +10,40 @@
+--- a/Makefile
++++ b/Makefile
+@@ -10,34 +10,34 @@ ALL = daligner HPC.daligner LAsort LAmer
all: $(ALL)
daligner: daligner.c filter.c filter.h lsd.sort.c lsd.sort.h align.c align.h DB.c DB.h QV.c QV.h
@@ -30,9 +30,9 @@ This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
- $(CC) $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS)
+ $(CC) $(CPPFLAGS) $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS)
- LAdump: LAdump.c align.c align.h DB.c DB.h QV.c QV.h
-- $(CC) $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm $(LDFLAGS)
-+ $(CC) $(CPPFLAGS) $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm $(LDFLAGS)
+ LA2ONE: LA2ONE.c align.c align.h DB.c DB.h QV.c QV.h ONElib.c ONElib.h
+- $(CC) $(CFLAGS) -o LA2ONE LA2ONE.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS)
++ $(CC) $(CPPFLAGS) $(CFLAGS) -o LA2ONE LA2ONE.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS)
LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h
- $(CC) $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS)
@@ -46,17 +46,9 @@ This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
- $(CC) $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS)
+ $(CC) $(CPPFLAGS) $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS)
- LAa2b: LAa2b.c align.c align.h DB.c DB.h QV.c QV.h
-- $(CC) $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm $(LDFLAGS)
-+ $(CC) $(CPPFLAGS) $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm $(LDFLAGS)
-
- LAb2a: LAb2a.c align.c align.h DB.c DB.h QV.c QV.h
-- $(CC) $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm $(LDFLAGS)
-+ $(CC) $(CPPFLAGS) $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm $(LDFLAGS)
-
- dumpLA: dumpLA.c align.c align.h DB.c DB.h QV.c QV.h
-- $(CC) $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm $(LDFLAGS)
-+ $(CC) $(CPPFLAGS) $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm $(LDFLAGS)
+ ONE2LA: ONE2LA.c align.c align.h DB.c DB.h QV.c QV.h ONElib.c ONElib.h
+- $(CC) $(CFLAGS) -o ONE2LA ONE2LA.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS)
++ $(CC) $(CPPFLAGS) $(CFLAGS) -o ONE2LA ONE2LA.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS)
clean:
rm -f $(ALL)
diff --git a/debian/patches/cross.patch b/debian/patches/cross.patch
index 2d1f3b3..0b4babc 100644
--- a/debian/patches/cross.patch
+++ b/debian/patches/cross.patch
@@ -5,7 +5,7 @@ Bug-Debian: https://bugs.debian.org/968434
--- a/Makefile
+++ b/Makefile
-@@ -10,40 +10,40 @@ ALL = daligner HPC.daligner LAsort LAmer
+@@ -10,34 +10,34 @@ ALL = daligner HPC.daligner LAsort LAmer
all: $(ALL)
daligner: daligner.c filter.c filter.h lsd.sort.c lsd.sort.h align.c align.h DB.c DB.h QV.c QV.h
@@ -28,9 +28,9 @@ Bug-Debian: https://bugs.debian.org/968434
- gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS)
+ $(CC) $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS)
- LAdump: LAdump.c align.c align.h DB.c DB.h QV.c QV.h
-- gcc $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm $(LDFLAGS)
-+ $(CC) $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm $(LDFLAGS)
+ LA2ONE: LA2ONE.c align.c align.h DB.c DB.h QV.c QV.h ONElib.c ONElib.h
+- gcc $(CFLAGS) -o LA2ONE LA2ONE.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS)
++ $(CC) $(CFLAGS) -o LA2ONE LA2ONE.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS)
LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h
- gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS)
@@ -44,17 +44,9 @@ Bug-Debian: https://bugs.debian.org/968434
- gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS)
+ $(CC) $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS)
- LAa2b: LAa2b.c align.c align.h DB.c DB.h QV.c QV.h
-- gcc $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm $(LDFLAGS)
-+ $(CC) $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm $(LDFLAGS)
-
- LAb2a: LAb2a.c align.c align.h DB.c DB.h QV.c QV.h
-- gcc $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm $(LDFLAGS)
-+ $(CC) $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm $(LDFLAGS)
-
- dumpLA: dumpLA.c align.c align.h DB.c DB.h QV.c QV.h
-- gcc $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm $(LDFLAGS)
-+ $(CC) $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm $(LDFLAGS)
+ ONE2LA: ONE2LA.c align.c align.h DB.c DB.h QV.c QV.h ONElib.c ONElib.h
+- gcc $(CFLAGS) -o ONE2LA ONE2LA.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS)
++ $(CC) $(CFLAGS) -o ONE2LA ONE2LA.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS)
clean:
rm -f $(ALL)
diff --git a/debian/patches/destdir-install.patch b/debian/patches/destdir-install.patch
index 8f98f17..48e60f8 100644
--- a/debian/patches/destdir-install.patch
+++ b/debian/patches/destdir-install.patch
@@ -4,7 +4,7 @@ Forwarded: not-needed
Last-Update: 2016-01-08
--- a/Makefile
+++ b/Makefile
-@@ -51,7 +51,8 @@ clean:
+@@ -45,7 +45,8 @@ clean:
rm -f daligner.tar.gz
install:
diff --git a/debian/patches/lddflags.patch b/debian/patches/lddflags.patch
index fb8230e..d4f5a52 100644
--- a/debian/patches/lddflags.patch
+++ b/debian/patches/lddflags.patch
@@ -7,7 +7,7 @@ Last-Update: 2016-01-08
--- a/Makefile
+++ b/Makefile
-@@ -10,40 +10,40 @@ ALL = daligner HPC.daligner LAsort LAmer
+@@ -10,34 +10,34 @@ ALL = daligner HPC.daligner LAsort LAmer
all: $(ALL)
daligner: daligner.c filter.c filter.h lsd.sort.c lsd.sort.h align.c align.h DB.c DB.h QV.c QV.h
@@ -30,9 +30,9 @@ Last-Update: 2016-01-08
- gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm
+ gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS)
- LAdump: LAdump.c align.c align.h DB.c DB.h QV.c QV.h
-- gcc $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm
-+ gcc $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm $(LDFLAGS)
+ LA2ONE: LA2ONE.c align.c align.h DB.c DB.h QV.c QV.h ONElib.c ONElib.h
+- gcc $(CFLAGS) -o LA2ONE LA2ONE.c align.c DB.c QV.c ONElib.c -lm
++ gcc $(CFLAGS) -o LA2ONE LA2ONE.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS)
LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h
- gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm
@@ -46,17 +46,9 @@ Last-Update: 2016-01-08
- gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm
+ gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS)
- LAa2b: LAa2b.c align.c align.h DB.c DB.h QV.c QV.h
-- gcc $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm
-+ gcc $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm $(LDFLAGS)
-
- LAb2a: LAb2a.c align.c align.h DB.c DB.h QV.c QV.h
-- gcc $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm
-+ gcc $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm $(LDFLAGS)
-
- dumpLA: dumpLA.c align.c align.h DB.c DB.h QV.c QV.h
-- gcc $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm
-+ gcc $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm $(LDFLAGS)
+ ONE2LA: ONE2LA.c align.c align.h DB.c DB.h QV.c QV.h ONElib.c ONElib.h
+- gcc $(CFLAGS) -o ONE2LA ONE2LA.c align.c DB.c QV.c ONElib.c -lm
++ gcc $(CFLAGS) -o ONE2LA ONE2LA.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS)
clean:
rm -f $(ALL)
diff --git a/dumpLA.c b/dumpLA.c
deleted file mode 100644
index 89e52eb..0000000
--- a/dumpLA.c
+++ /dev/null
@@ -1,177 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "DB.h"
-#include "align.h"
-
-int main(int argc, char *argv[])
-{ char code, which;
- int64 total;
- int aread, bread;
- char orient, chain;
- int alen, blen;
- int ab, ae, bb, be;
- int diffs;
- int len;
- int tspace, small, tbytes;
- uint8 *tbuffer = NULL;
- uint16 *sbuffer = NULL;
- int hasNext, haveC, haveT, haveD;
- int nline;
- int i;
- int64 novls;
- Overlap _ovl, *ovl = &_ovl;
- Path *path = & (ovl->path);
- FILE *output;
- char *pwd, *root;
-
- // Process arguments
-
- if (argc != 2)
- { fprintf(stderr,"Usage: dumpLA <align:las> < (ascii dump)\n");
- exit (1);
- }
-
- pwd = PathTo(argv[1]);
- root = Root(argv[1],".las");
- if ((output = fopen(Catenate(pwd,"/",root,".las"),"w")) == NULL)
- { fprintf(stderr,"DumpLA: Cannot open %s for writing\n",argv[1]);
- exit (1);
- }
- free(root);
- free(pwd);
-
- nline = 1;
- small = 0;
- tbytes = 2;
- hasNext = 0;
- while (scanf(" %c",&code) == 1) // Header lines
- if (code == '@' || code == '+' || code == '%')
- { scanf(" %c %lld",&which,&total);
- if (code == '@' && which == 'T')
- { tbuffer = (uint8 *) malloc(2*total*sizeof(uint16));
- sbuffer = (uint16 *) tbuffer;
- }
- nline += 1;
- }
- else
- { if (tbuffer == NULL)
- { fprintf(stderr,"DumpLA: Line %d: .las dump must contain trace header lines\n",nline);
- exit (1);
- }
- if (code != 'X')
- { fprintf(stderr,"DumpLA: Line %d: .las dump must have an X-line after header\n",nline);
- exit (1);
- }
- scanf(" %d",&tspace);
- if (tspace <= TRACE_XOVR && tspace != 0)
- { small = 1;
- tbytes = 1;
- }
- else
- { small = 0;
- tbytes = 2;
- }
- nline += 1;
- if (scanf(" %c",&code) == 1)
- { if (code != 'P')
- { fprintf(stderr,"DumpLA: Line %d: .las dump data must being with a P-line\n",nline);
- exit (1);
- }
- hasNext = 1;
- }
- break;
- }
-
- novls = 0;
- fwrite(&novls,sizeof(int64),1,output);
- fwrite(&tspace,sizeof(int),1,output);
-
- while (hasNext) // For each data line do
- { scanf(" %d %d %c %c",&aread,&bread,&orient,&chain);
- nline += 1;
- haveC = haveT = haveD = 0;
- hasNext = 0;
- while (scanf(" %c",&code) == 1) // For each data line do
- if (code == 'P')
- { hasNext = 1;
- break;
- }
- else
- switch (code)
- { case 'L': // Read lengths
- scanf(" %d %d",&alen,&blen);
- nline += 1;
- break;
- case 'C': // Coordinate intervals
- scanf(" %d %d %d %d",&ab,&ae,&bb,&be);
- nline += 1;
- haveC = 1;
- break;
- case 'D': // Differences
- scanf(" %d",&diffs);
- nline += 1;
- haveD = 1;
- break;
- case 'T': // Mask
- haveT = 1;
- scanf(" %d",&len);
- nline += (len+1);
- len *= 2;
- if (small)
- { for (int i = 0; i < len; i += 2)
- scanf(" %hhd %hhd",tbuffer+i,tbuffer+(i+1));
- }
- else
- { for (int i = 0; i < len; i += 2)
- scanf(" %hd %hd",sbuffer+i,sbuffer+(i+1));
- }
- break;
- default:
- fprintf(stderr,"DumpLA: Line %d: Unrecognized line type '%c'\n",nline,code);
- exit (1);
- }
- if (!haveC)
- { fprintf(stderr,"DumpLA: Line %d: Alignment record does not have a C-line\n",nline);
- exit (1);
- }
- if (!haveT)
- { fprintf(stderr,"DumpLA: Line %d: Alignment record does not have a T-line\n",nline);
- exit (1);
- }
- if (!haveD)
- { diffs = 0;
- for (i = 0; i < len; i += 2)
- diffs += tbuffer[i];
- }
-
- novls += 1;
- ovl->aread = aread-1;
- ovl->bread = bread-1;
- ovl->flags = 0;
- if (orient == 'c')
- ovl->flags |= COMP_FLAG;
- if (chain == '-')
- ovl->flags |= NEXT_FLAG;
- else if (chain == '>')
- ovl->flags |= BEST_FLAG;
- else if (chain == '+')
- ovl->flags |= START_FLAG;
- path->abpos = ab;
- path->aepos = ae;
- path->bbpos = bb;
- path->bepos = be;
- path->diffs = diffs;
- path->tlen = len;
- path->trace = (void *) tbuffer;
-
- Write_Overlap(output,ovl,tbytes);
- }
-
- rewind(output);
- fwrite(&novls,sizeof(int64),1,output);
-
- fclose(output);
-
- exit (0);
-}